Modified paragraph rule to not use recursive parsing; made parsing of code and pre tags much more reliable. Fixes issue 1 (QA: RE-TEST).
author Dan
Mon, 01 Feb 2010 02:22:54 -0500
changeset 1217 feeb49aa6270
parent 1216 4125e19d3b27
child 1218 25af9c98a7dc
child 1229 17f206fe1d2e
Modified paragraph rule to not use recursive parsing; made parsing of code and pre tags much more reliable. Fixes issue 1 (QA: RE-TEST).
includes/wikiengine/parse_mediawiki.php
includes/wikiengine/render_xhtml.php
includes/wikiformat.php
--- a/includes/wikiengine/parse_mediawiki.php	Mon Feb 01 02:15:04 2010 -0500
+++ b/includes/wikiengine/parse_mediawiki.php	Mon Feb 01 02:22:54 2010 -0500
@@ -22,7 +22,7 @@
     'mailtonotext' => '#\[mailto:([^ \]]+?)\]#',
     'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#',
     'hr' => '/^[-]{4,} *$/m',
-    'code' => '/^<code>(?:\r?\n)?(.+?)(?:\r?\n)?<\/code>$/mis'
+    'code' => '/^(?:<code>(?:\r?\n)?|<pre>)(.+?)(?:<\/pre>|(?:\r?\n)?<\/code>)$/mis'
   );
   
   private $blockquote_rand_id;
@@ -186,35 +186,9 @@
     // Wrap all block level tags
     RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
     
-    // I'm not sure why I had to go through all these alternatives. Trying to bring it
-    // all down to one by ?'ing subpatterns was causing things to return empty and throwing
-    // errors in the parser. Eventually, around ~3:57AM I just settled on this motherf---er
-    // of a regular expression.
-    
-    // FIXME: This regexp triggers a known PHP stack size issue under win32 and possibly
-    // other platforms (<http://bugs.php.net/bug.php?id=47689>). The workaround is going to
-    // involve writing our own parser that takes care of recursion without using the stack,
-    // which is going to be a bitch, and may not make it in until Caoineag RCs.
+    // Find all opening and closing tags
     
-    $regex = ";
-              <($blocklevel)
-              (?:
-                # self closing, no attributes
-                [ ]*/>
-              |
-                # self closing, attributes
-                [ ][^>]+? />
-              |
-                # with inner text, no attributes
-                >
-                (?: (?R) | .*? )*</\\1>
-              |
-                # with inner text and attributes
-                [ ][^>]+?     # attributes
-                >
-                (?: (?R) | .*? )*</\\1>
-              )
-                ;sx";
+    $regex = ";(<(?:/(?:$blocklevel)|(?:$blocklevel)(?: [^>]*?)?)>);s";
                 
     // oh. and we're using this tokens thing because for identical matches, the first match will
     // get wrapped X number of times instead of all matches getting wrapped once; replacing each
@@ -222,27 +196,46 @@
     
     $tokens = array();
     $rand_id = sha1(microtime() . mt_rand());
+    $tag_stack = array();
     
-    // Temporary hack to fix crashes under win32. Sometime I'll write a loop based
-    // parser for this whole section. Maybe. Perhaps the Apache folks will fix their
-    // Windows binaries first.
-    if ( PHP_OS == 'WIN32' || PHP_OS == 'WINNT' )
+    if ( $text_split = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE) )
     {
-      $regex = str_replace("(?: (?R) | .*? )*", "(?: .*? )", $regex);
-    }
-    if ( preg_match_all($regex, $text, $matches) )
-    {
-      foreach ( $matches[0] as $i => $match )
+      $text = '';
+      // go through the text, extract tag names, and push them to a stack.
+      foreach ( $text_split as $splitpart )
       {
-        $text = str_replace_once($match, "{_pb_:$rand_id:$i}", $text);
-        $tokens[$i] = '<_paragraph_bypass>' . $match . '</_paragraph_bypass>';
+        if ( preg_match(";^<(/)?($blocklevel)( |>);i", $splitpart, $match) )
+        {
+          $tagname = $match[2];
+          if ( $match[1] == '/' )
+          {
+            // closing tag
+            if ( $tagname != ($top = array_pop($tag_stack)) )
+            {
+              // invalid - push back
+              array_push($tag_stack, $top);
+            }
+            else
+            {
+              // valid - if stack's at zero, add a </_paragraph_bypass>
+              if ( count($tag_stack) == 0 )
+                $splitpart .= '</_paragraph_bypass>';
+            }
+          }
+          else
+          {
+            // push
+            array_push($tag_stack, $tagname);
+            if ( count($tag_stack) == 1 )
+              $splitpart = '<_paragraph_bypass>' . $splitpart;
+          }
+        }
+        $text .= $splitpart;
       }
+      //echo '<pre>' . htmlspecialchars(print_r($text, true)) . '</pre>';
     }
     
-    foreach ( $tokens as $i => $match )
-    {
-      $text = str_replace_once("{_pb_:$rand_id:$i}", $match, $text);
-    }
+    // All things that should be para-bypassed now are surrounded by _paragraph_bypass tags.
     
     // die('<pre>' . htmlspecialchars($text) . '</pre>');
 	
--- a/includes/wikiengine/render_xhtml.php	Mon Feb 01 02:15:04 2010 -0500
+++ b/includes/wikiengine/render_xhtml.php	Mon Feb 01 02:22:54 2010 -0500
@@ -159,7 +159,7 @@
   
   public function code($match)
   {
-    return '<pre>' . htmlspecialchars($match[1]) . '</pre>';
+    return '<pre class="wikitext-code"><final>' . htmlspecialchars($match[1]) . '</final></pre>';
   }
 }
 
--- a/includes/wikiformat.php	Mon Feb 01 02:15:04 2010 -0500
+++ b/includes/wikiformat.php	Mon Feb 01 02:22:54 2010 -0500
@@ -103,7 +103,7 @@
     $parser_class = "Carpenter_Parse_" . ucwords($this->parser);
     $renderer_class = "Carpenter_Render_" . ucwords($this->renderer);
     
-    // empty?
+    // empty? (don't remove this. the parser will shit bricks later about rules returning empty strings)
     if ( trim($text) === '' )
       return $text;
     
@@ -159,7 +159,7 @@
       $text = $this->perform_render_step($text, $rule, $parser, $renderer);
       if ( empty($text) )
       {
-        trigger_error("Wikitext was empty after rule \"$rule\"; restoring backup", E_USER_WARNING);
+        trigger_error("Wikitext was completely empty after rule \"$rule\"; restoring backup", E_USER_WARNING);
         $text = $text_before;
       }
       unset($text_before);
@@ -178,8 +178,12 @@
           }
         }
       }
+      
+      RenderMan::tag_strip_push('final', $text, $final_stripdata);
     }
     
+    RenderMan::tag_unstrip('final', $text, $final_stripdata);
+    
     // run posthooks
     foreach ( $this->hooks as $hook )
     {