Improvements and fixes (hacks?) for HTML sanitization
author Dan
Sat, 06 Oct 2007 13:01:46 -0400
changeset 163 ad00dc1f8706
parent 162 e1a22031b5bd
child 164 54c79adfb694
Improvements and fixes (hacks?) for HTML sanitization
includes/functions.php
includes/render.php
includes/template.php
includes/wikiengine/Tables.php
--- a/includes/functions.php	Fri Oct 05 01:57:00 2007 -0400
+++ b/includes/functions.php	Sat Oct 06 13:01:46 2007 -0400
@@ -1789,6 +1789,26 @@
 
 function sanitize_html($html, $filter_php = true)
 {
+  // Random seed for substitution
+  $rand_seed = md5( sha1(microtime()) . mt_rand() );
+  
+  // Strip out comments that are already escaped
+  preg_match_all('/<!--(.*?)-->/', $html, $comment_match);
+  $i = 0;
+  foreach ( $comment_match[0] as $comment )
+  {
+    $html = str_replace_once($comment, "{HTMLCOMMENT:$i:$rand_seed}", $html);
+    $i++;
+  }
+  
+  // Strip out code sections that will be postprocessed by Text_Wiki
+  preg_match_all(';^<code(\s[^>]*)?>((?:(?R)|.)*?)\n</code>(\s|$);msi', $html, $code_match);
+  $i = 0;
+  foreach ( $code_match[0] as $code )
+  {
+    $html = str_replace_once($code, "{TW_CODE:$i:$rand_seed}", $html);
+    $i++;
+  }
 
   $html = preg_replace('#<([a-z]+)([\s]+)([^>]+?)'.htmlalternatives('javascript:').'(.+?)>(.*?)</\\1>#is', '&lt;\\1\\2\\3javascript:\\59&gt;\\60&lt;/\\1&gt;', $html);
   $html = preg_replace('#<([a-z]+)([\s]+)([^>]+?)'.htmlalternatives('javascript:').'(.+?)>#is', '&lt;\\1\\2\\3javascript:\\59&gt;', $html);
@@ -1900,6 +1920,22 @@
 
   // Unstrip comments
   $html = preg_replace('/&lt;!--([^>]*?)--&gt;/i', '', $html);
+  
+  // Restore stripped comments
+  $i = 0;
+  foreach ( $comment_match[0] as $comment )
+  {
+    $html = str_replace_once("{HTMLCOMMENT:$i:$rand_seed}", $comment, $html);
+    $i++;
+  }
+  
+  // Restore stripped code
+  $i = 0;
+  foreach ( $code_match[0] as $code )
+  {
+    $html = str_replace_once("{TW_CODE:$i:$rand_seed}", $code, $html);
+    $i++;
+  }
 
   return $html;
 
--- a/includes/render.php	Fri Oct 05 01:57:00 2007 -0400
+++ b/includes/render.php	Sat Oct 06 13:01:46 2007 -0400
@@ -248,6 +248,12 @@
       $text = preg_replace('/<nodisplay>(.*?)<\/nodisplay>/is', '', $text);
     }
     
+    $code = $plugins->setHook('render_wikiformat_pre');
+    foreach ( $code as $cmd )
+    {
+      eval($cmd);
+    }
+    
     if ( !$plaintext )
     {
       // Process images
@@ -290,10 +296,26 @@
       $result = $wiki->transform($text, 'Xhtml');
     }
     
-    // if ( !$plaintext )
-    // {
-    //   $result = RenderMan::process_imgtags_stage2($result, $taglist);
-    // }
+    // HTML fixes
+    $result = preg_replace('#<tr>([\s]*?)<\/tr>#is', '', $result);
+    $result = preg_replace('#<p>([\s]*?)<\/p>#is', '', $result);
+    $result = preg_replace('#<br />([\s]*?)<table#is', '<table', $result);
+    $result = str_replace("<pre><code>\n", "<pre><code>", $result);
+    $result = preg_replace("/<p><table([^>]*?)><\/p>/", "<table\\1>", $result);
+    $result = str_replace("<br />\n</td>", "\n</td>", $result);
+    $result = str_replace("<p><tr>", "<tr>", $result);
+    $result = str_replace("<tr><br />", "<tr>", $result);
+    $result = str_replace("</tr><br />", "</tr>", $result);
+    $result = str_replace("</table><br />", "</table>", $result);
+    $result = preg_replace('/<\/table>$/', "</table><br /><br />", $result);
+    $result = str_replace("<p></div></p>", "</div>", $result);
+    $result = str_replace("<p></table></p>", "</table>", $result);
+    
+    $code = $plugins->setHook('render_wikiformat_post');
+    foreach ( $code as $cmd )
+    {
+      eval($cmd);
+    }
     
     // Reinsert <nowiki> sections
     for($i=0;$i<$nw;$i++)
@@ -311,7 +333,8 @@
     
   }
   
-  function wikiFormat($message, $filter_links = true, $do_params = false, $plaintext = false) {
+  function wikiFormat($message, $filter_links = true, $do_params = false, $plaintext = false)
+  {
     global $db, $session, $paths, $template, $plugins; // Common objects
     
     return RenderMan::next_gen_wiki_format($message, $plaintext, $filter_links, $do_params);
@@ -384,6 +407,8 @@
     $result = str_replace("</table></p>", "</table>", $result);
     $result = str_replace("</table><br />", "</table>", $result);
     $result = preg_replace('/<\/table>$/', "</table><br /><br />", $result);
+    $result = str_replace("<p></div></p>", "</div>", $result);
+    $result = str_replace("<p></table></p>", "</table>", $result);
     
     $result = str_replace('<nowiki>',  '&lt;nowiki&gt;',  $result);
     $result = str_replace('</nowiki>', '&lt;/nowiki&gt;', $result);
--- a/includes/template.php	Fri Oct 05 01:57:00 2007 -0400
+++ b/includes/template.php	Sat Oct 06 13:01:46 2007 -0400
@@ -625,8 +625,7 @@
     
     $this->tpl_bool['stupid_mode'] = false;
     
-    if($paths->page == $paths->nslist['Special'].'Administration') $this->tpl_bool['in_admin'] = true;
-    else $this->tpl_bool['in_admin'] = false;
+    $this->tpl_bool['in_admin'] = ( ( $paths->cpage['urlname_nons'] == 'Administration' && $paths->namespace == 'Special' ) || $paths->namespace == 'Admin' );
     
     $p = ( isset($_GET['printable']) ) ? '/printable' : '';
     
--- a/includes/wikiengine/Tables.php	Fri Oct 05 01:57:00 2007 -0400
+++ b/includes/wikiengine/Tables.php	Sat Oct 06 13:01:46 2007 -0400
@@ -422,6 +422,7 @@
 	 * @return array
 	 */
 	function setupAttributeWhitelist() {
+    global $db, $session, $paths, $template, $plugins;
 		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 		$block = array_merge( $common, array( 'align' ) );
 		$tablealign = array( 'align', 'char', 'charoff', 'valign' );
@@ -570,6 +571,14 @@
       # XHTML stuff
       'acronym'    => $common
 			);
+    
+    // custom tags can be added by plugins
+    $code = $plugins->setHook('html_attribute_whitelist');
+    foreach ( $code as $cmd )
+    {
+      eval($cmd);
+    }
+    
 		return $whitelist;
 	}