# HG changeset patch # User Dan # Date 1265008974 18000 # Node ID feeb49aa62703fc21c78ec8bb1918ed26089f646 # Parent 4125e19d3b277a810e41cb9ace8db735739adcab Modified paragraph rule to not use recursive parsing; made parsing of code and pre tags much more reliable. Fixes issue 1 (QA: RE-TEST). diff -r 4125e19d3b27 -r feeb49aa6270 includes/wikiengine/parse_mediawiki.php --- a/includes/wikiengine/parse_mediawiki.php Mon Feb 01 02:15:04 2010 -0500 +++ b/includes/wikiengine/parse_mediawiki.php Mon Feb 01 02:22:54 2010 -0500 @@ -22,7 +22,7 @@ 'mailtonotext' => '#\[mailto:([^ \]]+?)\]#', 'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#', 'hr' => '/^[-]{4,} *$/m', - 'code' => '/^(?:\r?\n)?(.+?)(?:\r?\n)?<\/code>$/mis' + 'code' => '/^(?:(?:\r?\n)?|
)(.+?)(?:<\/pre>|(?:\r?\n)?<\/code>)$/mis'
   );
   
   private $blockquote_rand_id;
@@ -186,35 +186,9 @@
     // Wrap all block level tags
     RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
     
-    // I'm not sure why I had to go through all these alternatives. Trying to bring it
-    // all down to one by ?'ing subpatterns was causing things to return empty and throwing
-    // errors in the parser. Eventually, around ~3:57AM I just settled on this motherf---er
-    // of a regular expression.
-    
-    // FIXME: This regexp triggers a known PHP stack size issue under win32 and possibly
-    // other platforms (). The workaround is going to
-    // involve writing our own parser that takes care of recursion without using the stack,
-    // which is going to be a bitch, and may not make it in until Caoineag RCs.
+    // Find all opening and closing tags
     
-    $regex = ";
-              <($blocklevel)
-              (?:
-                # self closing, no attributes
-                [ ]*/>
-              |
-                # self closing, attributes
-                [ ][^>]+? />
-              |
-                # with inner text, no attributes
-                >
-                (?: (?R) | .*? )*
-              |
-                # with inner text and attributes
-                [ ][^>]+?     # attributes
-                >
-                (?: (?R) | .*? )*
-              )
-                ;sx";
+    $regex = ";(<(?:/(?:$blocklevel)|(?:$blocklevel)(?: [^>]*?)?)>);s";
                 
     // oh. and we're using this tokens thing because for identical matches, the first match will
     // get wrapped X number of times instead of all matches getting wrapped once; replacing each
@@ -222,27 +196,46 @@
     
     $tokens = array();
     $rand_id = sha1(microtime() . mt_rand());
+    $tag_stack = array();
     
-    // Temporary hack to fix crashes under win32. Sometime I'll write a loop based
-    // parser for this whole section. Maybe. Perhaps the Apache folks will fix their
-    // Windows binaries first.
-    if ( PHP_OS == 'WIN32' || PHP_OS == 'WINNT' )
+    if ( $text_split = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE) )
     {
-      $regex = str_replace("(?: (?R) | .*? )*", "(?: .*? )", $regex);
-    }
-    if ( preg_match_all($regex, $text, $matches) )
-    {
-      foreach ( $matches[0] as $i => $match )
+      $text = '';
+      // go through the text, extract tag names, and push them to a stack.
+      foreach ( $text_split as $splitpart )
       {
-        $text = str_replace_once($match, "{_pb_:$rand_id:$i}", $text);
-        $tokens[$i] = '<_paragraph_bypass>' . $match . '';
+        if ( preg_match(";^<(/)?($blocklevel)( |>);i", $splitpart, $match) )
+        {
+          $tagname = $match[2];
+          if ( $match[1] == '/' )
+          {
+            // closing tag
+            if ( $tagname != ($top = array_pop($tag_stack)) )
+            {
+              // invalid - push back
+              array_push($tag_stack, $top);
+            }
+            else
+            {
+              // valid - if stack's at zero, add a </_paragraph_bypass>
+              if ( count($tag_stack) == 0 )
+                $splitpart .= '';
+            }
+          }
+          else
+          {
+            // push
+            array_push($tag_stack, $tagname);
+            if ( count($tag_stack) == 1 )
+              $splitpart = '<_paragraph_bypass>' . $splitpart;
+          }
+        }
+        $text .= $splitpart;
       }
+      //echo '
' . htmlspecialchars(print_r($text, true)) . '
'; } - foreach ( $tokens as $i => $match ) - { - $text = str_replace_once("{_pb_:$rand_id:$i}", $match, $text); - } + // All things that should be para-bypassed now are surrounded by _paragraph_bypass tags. // die('
' . htmlspecialchars($text) . '
'); diff -r 4125e19d3b27 -r feeb49aa6270 includes/wikiengine/render_xhtml.php --- a/includes/wikiengine/render_xhtml.php Mon Feb 01 02:15:04 2010 -0500 +++ b/includes/wikiengine/render_xhtml.php Mon Feb 01 02:22:54 2010 -0500 @@ -159,7 +159,7 @@ public function code($match) { - return '
' . htmlspecialchars($match[1]) . '
'; + return '
' . htmlspecialchars($match[1]) . '
'; } } diff -r 4125e19d3b27 -r feeb49aa6270 includes/wikiformat.php --- a/includes/wikiformat.php Mon Feb 01 02:15:04 2010 -0500 +++ b/includes/wikiformat.php Mon Feb 01 02:22:54 2010 -0500 @@ -103,7 +103,7 @@ $parser_class = "Carpenter_Parse_" . ucwords($this->parser); $renderer_class = "Carpenter_Render_" . ucwords($this->renderer); - // empty? + // empty? (don't remove this. the parser will shit bricks later about rules returning empty strings) if ( trim($text) === '' ) return $text; @@ -159,7 +159,7 @@ $text = $this->perform_render_step($text, $rule, $parser, $renderer); if ( empty($text) ) { - trigger_error("Wikitext was empty after rule \"$rule\"; restoring backup", E_USER_WARNING); + trigger_error("Wikitext was completely empty after rule \"$rule\"; restoring backup", E_USER_WARNING); $text = $text_before; } unset($text_before); @@ -178,8 +178,12 @@ } } } + + RenderMan::tag_strip_push('final', $text, $final_stripdata); } + RenderMan::tag_unstrip('final', $text, $final_stripdata); + // run posthooks foreach ( $this->hooks as $hook ) {