includes/wikiengine/parse_mediawiki.php
changeset 1217 feeb49aa6270
parent 1174 def792dd9b1b
child 1227 bdac73ed481e
equal deleted inserted replaced
1216:4125e19d3b27 1217:feeb49aa6270
    20     'externalwithtext' => '#\[((?:https?|irc|ftp)://.+?) (.+?)\]#',
    20     'externalwithtext' => '#\[((?:https?|irc|ftp)://.+?) (.+?)\]#',
    21     'externalnotext' => '#\[((?:https?|irc|ftp)://.+?)\]#',
    21     'externalnotext' => '#\[((?:https?|irc|ftp)://.+?)\]#',
    22     'mailtonotext' => '#\[mailto:([^ \]]+?)\]#',
    22     'mailtonotext' => '#\[mailto:([^ \]]+?)\]#',
    23     'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#',
    23     'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#',
    24     'hr' => '/^[-]{4,} *$/m',
    24     'hr' => '/^[-]{4,} *$/m',
    25     'code' => '/^<code>(?:\r?\n)?(.+?)(?:\r?\n)?<\/code>$/mis'
    25     'code' => '/^(?:<code>(?:\r?\n)?|<pre>)(.+?)(?:<\/pre>|(?:\r?\n)?<\/code>)$/mis'
    26   );
    26   );
    27   
    27   
    28   private $blockquote_rand_id;
    28   private $blockquote_rand_id;
    29   
    29   
    30   public function lang(&$text)
    30   public function lang(&$text)
   184     $blocklevel = 'address|blockquote|center|code|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|li|ol|p|pre|table|ul|tr|td|th|tbody|thead|tfoot';
   184     $blocklevel = 'address|blockquote|center|code|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|li|ol|p|pre|table|ul|tr|td|th|tbody|thead|tfoot';
   185     
   185     
   186     // Wrap all block level tags
   186     // Wrap all block level tags
   187     RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
   187     RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
   188     
   188     
   189     // I'm not sure why I had to go through all these alternatives. Trying to bring it
   189     // Find all opening and closing tags
   190     // all down to one by ?'ing subpatterns was causing things to return empty and throwing
   190     
   191     // errors in the parser. Eventually, around ~3:57AM I just settled on this motherf---er
   191     $regex = ";(<(?:/(?:$blocklevel)|(?:$blocklevel)(?: [^>]*?)?)>);s";
   192     // of a regular expression.
       
   193     
       
   194     // FIXME: This regexp triggers a known PHP stack size issue under win32 and possibly
       
   195     // other platforms (<http://bugs.php.net/bug.php?id=47689>). The workaround is going to
       
   196     // involve writing our own parser that takes care of recursion without using the stack,
       
   197     // which is going to be a bitch, and may not make it in until Caoineag RCs.
       
   198     
       
   199     $regex = ";
       
   200               <($blocklevel)
       
   201               (?:
       
   202                 # self closing, no attributes
       
   203                 [ ]*/>
       
   204               |
       
   205                 # self closing, attributes
       
   206                 [ ][^>]+? />
       
   207               |
       
   208                 # with inner text, no attributes
       
   209                 >
       
   210                 (?: (?R) | .*? )*</\\1>
       
   211               |
       
   212                 # with inner text and attributes
       
   213                 [ ][^>]+?     # attributes
       
   214                 >
       
   215                 (?: (?R) | .*? )*</\\1>
       
   216               )
       
   217                 ;sx";
       
   218                 
   192                 
    219     // Unique tokens are used because, for identical matches, the first match would
    193     // Unique tokens are used because, for identical matches, the first match would
    220     // get wrapped X number of times instead of every match getting wrapped once; replacing
    194     // get wrapped X number of times instead of every match getting wrapped once; replacing
    221     // each match with a unique token ID remedies this.
    195     // each match with a unique token ID remedies this.
   222     
   196     
   223     $tokens = array();
   197     $tokens = array();
   224     $rand_id = sha1(microtime() . mt_rand());
   198     $rand_id = sha1(microtime() . mt_rand());
   225     
   199     $tag_stack = array();
   226     // Temporary hack to fix crashes under win32. Sometime I'll write a loop based
   200     
   227     // parser for this whole section. Maybe. Perhaps the Apache folks will fix their
   201     if ( $text_split = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE) )
   228     // Windows binaries first.
   202     {
   229     if ( PHP_OS == 'WIN32' || PHP_OS == 'WINNT' )
   203       $text = '';
   230     {
   204       // go through the text, extract tag names, and push them to a stack.
   231       $regex = str_replace("(?: (?R) | .*? )*", "(?: .*? )", $regex);
   205       foreach ( $text_split as $splitpart )
   232     }
   206       {
   233     if ( preg_match_all($regex, $text, $matches) )
   207         if ( preg_match(";^<(/)?($blocklevel)( |>);i", $splitpart, $match) )
   234     {
   208         {
   235       foreach ( $matches[0] as $i => $match )
   209           $tagname = $match[2];
   236       {
   210           if ( $match[1] == '/' )
   237         $text = str_replace_once($match, "{_pb_:$rand_id:$i}", $text);
   211           {
   238         $tokens[$i] = '<_paragraph_bypass>' . $match . '</_paragraph_bypass>';
   212             // closing tag
   239       }
   213             if ( $tagname != ($top = array_pop($tag_stack)) )
   240     }
   214             {
   241     
   215               // invalid - push back
   242     foreach ( $tokens as $i => $match )
   216               array_push($tag_stack, $top);
   243     {
   217             }
   244       $text = str_replace_once("{_pb_:$rand_id:$i}", $match, $text);
   218             else
   245     }
   219             {
       
   220               // valid - if stack's at zero, add a </_paragraph_bypass>
       
   221               if ( count($tag_stack) == 0 )
       
   222                 $splitpart .= '</_paragraph_bypass>';
       
   223             }
       
   224           }
       
   225           else
       
   226           {
       
   227             // push
       
   228             array_push($tag_stack, $tagname);
       
   229             if ( count($tag_stack) == 1 )
       
   230               $splitpart = '<_paragraph_bypass>' . $splitpart;
       
   231           }
       
   232         }
       
   233         $text .= $splitpart;
       
   234       }
       
   235       //echo '<pre>' . htmlspecialchars(print_r($text, true)) . '</pre>';
       
   236     }
       
   237     
       
    238     // Everything that should bypass paragraph wrapping is now surrounded by _paragraph_bypass tags.
   246     
   239     
   247     // die('<pre>' . htmlspecialchars($text) . '</pre>');
   240     // die('<pre>' . htmlspecialchars($text) . '</pre>');
   248 	
   241 	
   249     RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw, true);
   242     RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw, true);
   250     
   243