includes/wikiengine/parse_mediawiki.php
changeset 1227 bdac73ed481e
parent 1217 feeb49aa6270
child 1231 4797a4a88533
--- a/includes/wikiengine/parse_mediawiki.php	Sun Mar 28 21:49:26 2010 -0400
+++ b/includes/wikiengine/parse_mediawiki.php	Sun Mar 28 23:10:46 2010 -0400
@@ -13,278 +13,278 @@
 
 class Carpenter_Parse_MediaWiki
 {
-  public $rules = array(
-    'bold'   => "/'''(.+?)'''/",
-    'italic' => "/''(.+?)''/",
-    'underline' => '/__(.+?)__/',
-    'externalwithtext' => '#\[((?:https?|irc|ftp)://.+?) (.+?)\]#',
-    'externalnotext' => '#\[((?:https?|irc|ftp)://.+?)\]#',
-    'mailtonotext' => '#\[mailto:([^ \]]+?)\]#',
-    'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#',
-    'hr' => '/^[-]{4,} *$/m',
-    'code' => '/^(?:<code>(?:\r?\n)?|<pre>)(.+?)(?:<\/pre>|(?:\r?\n)?<\/code>)$/mis'
-  );
-  
-  private $blockquote_rand_id;
-  
-  public function lang(&$text)
-  {
-    global $lang;
-    
-    preg_match_all('/<lang (?:code|id)="([a-z0-9_-]+)">([\w\W]+?)<\/lang>/', $text, $langmatch);
-    foreach ( $langmatch[0] as $i => $match )
-    {
-      if ( $langmatch[1][$i] == $lang->lang_code )
-      {
-        $text = str_replace_once($match, $langmatch[2][$i], $text);
-      }
-      else
-      {
-        $text = str_replace_once($match, '', $text);
-      }
-    }
-    
-    return array();
-  }
-  
-  public function templates(&$text)
-  {
-    $template_regex = "/\{\{(.+)((\n|\|[ ]*([A-z0-9]+)[ ]*=[ ]*(.+))*)\}\}/isU";
-    $i = 0;
-    while ( preg_match($template_regex, $text, $match) )
-    {
-      $i++;
-      if ( $i == 5 )
-        break;
-      $text = RenderMan::include_templates($text);
-    }
-    
-    return array();
-  }
-  
-  public function heading(&$text)
-  {
-    if ( !preg_match_all('/^(={1,6}) *(.+?) *\\1 *$/m', $text, $results) )
-      return array();
-    
-    $headings = array();
-    foreach ( $results[0] as $i => $match )
-    {
-      $headings[] = array(
-          'level' => strlen($results[1][$i]),
-          'text' => $results[2][$i]
-        );
-    }
-    
-    $text = Carpenter::tokenize($text, $results[0]);
-    
-    return $headings;
-  }
-  
-  public function multilist(&$text)
-  {
-    // Match entire lists
-    $regex = '/^
-                ([:#\*])+     # Initial list delimiter
-                [ ]*
-                .+?
-                (?:
-                  \r?\n
-                  (?:\\1|[ ]{2,})
-                  [ ]*
-                  .+?)*
-                $/mx';
-    
-    if ( !preg_match_all($regex, $text, $lists) )
-      return array();
-    
-    $types = array(
-        '*' => 'unordered',
-        '#' => 'ordered',
-        ':' => 'indent'
-      );
-    
-    $pieces = array();
-    foreach ( $lists[0] as $i => $list )
-    {
-      $token = $lists[1][$i];
-      $piece = array(
-          'type' => $types[$token],
-          'items' => array()
-        );
-      
-      // convert windows newlines to unix
-      $list = str_replace("\r\n", "\n", $list);
-      $items_pre = explode("\n", $list);
-      $items = array();
-      // first pass, go through and combine items that are newlined
-      foreach ( $items_pre as $item )
-      {
-        if ( substr($item, 0, 1) == $token )
-        {
-          $items[] = $item;
-        }
-        else
-        {
-          // it's a continuation of the previous LI. Don't need to worry about
-          // undefined indices here since the regex should filter out all invalid
-          // markup. Just append this line to the previous.
-          $items[ count($items) - 1 ] .= "\n" . trim($item);
-        }
-      }
-      
-      // second pass, separate items and tokens
-      unset($items_pre);
-      foreach ( $items as $item )
-      {
-        // get the depth
-        $itemtoken = preg_replace('/^([#:\*]+).*$/s', '$1', $item);
-        // get the text
-        $itemtext = trim(substr($item, strlen($itemtoken)));
-        $piece['items'][] = array(
-            // depth starts at 1
-            'depth' => strlen($itemtoken),
-            'text' => $itemtext
-          );
-      }
-      $pieces[] = $piece;
-    }
-    
-    $text = Carpenter::tokenize($text, $lists[0]);
-    
-    return $pieces;
-  }
-  
-  public function blockquote(&$text)
-  {
-    $rand_id = hexencode(AESCrypt::randkey(16), '', '');
-    
-    while ( preg_match_all('/^(?:(>+) *.+(?:\r?\n|$))+/m', $text, $quotes) )
-    {
-      foreach ( $quotes[0] as $quote )
-      {
-        $piece = trim(preg_replace('/^> */m', '', $quote));
-        $text = str_replace_once($quote, "{blockquote:$rand_id}\n$piece\n{/blockquote:$rand_id}\n", $text);
-      }
-    }
-    //die('<pre>' . htmlspecialchars($text) . '</pre>');
-    
-    $this->blockquote_rand_id = $rand_id;
-  }
-  
-  public function blockquotepost(&$text)
-  {
-    return $this->blockquote_rand_id;
-  }
-  
-  public function paragraph(&$text)
-  {
-    // The trick with paragraphs is to not turn things into them when a block level element already wraps the block of text.
-    // First we need a list of block level elements (http://htmlhelp.com/reference/html40/block.html + some Enano extensions)
-    $blocklevel = 'address|blockquote|center|code|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|li|ol|p|pre|table|ul|tr|td|th|tbody|thead|tfoot';
-    
-    // Wrap all block level tags
-    RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
-    
-    // Find all opening and closing tags
-    
-    $regex = ";(<(?:/(?:$blocklevel)|(?:$blocklevel)(?: [^>]*?)?)>);s";
-                
-    // oh. and we're using this tokens thing because for identical matches, the first match will
-    // get wrapped X number of times instead of all matches getting wrapped once; replacing each
-    // with a unique token id remedies this
-    
-    $tokens = array();
-    $rand_id = sha1(microtime() . mt_rand());
-    $tag_stack = array();
-    
-    if ( $text_split = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE) )
-    {
-      $text = '';
-      // go through the text, extract tag names, and push them to a stack.
-      foreach ( $text_split as $splitpart )
-      {
-        if ( preg_match(";^<(/)?($blocklevel)( |>);i", $splitpart, $match) )
-        {
-          $tagname = $match[2];
-          if ( $match[1] == '/' )
-          {
-            // closing tag
-            if ( $tagname != ($top = array_pop($tag_stack)) )
-            {
-              // invalid - push back
-              array_push($tag_stack, $top);
-            }
-            else
-            {
-              // valid - if stack's at zero, add a </_paragraph_bypass>
-              if ( count($tag_stack) == 0 )
-                $splitpart .= '</_paragraph_bypass>';
-            }
-          }
-          else
-          {
-            // push
-            array_push($tag_stack, $tagname);
-            if ( count($tag_stack) == 1 )
-              $splitpart = '<_paragraph_bypass>' . $splitpart;
-          }
-        }
-        $text .= $splitpart;
-      }
-      //echo '<pre>' . htmlspecialchars(print_r($text, true)) . '</pre>';
-    }
-    
-    // All things that should be para-bypassed now are surrounded by _paragraph_bypass tags.
-    
-    // die('<pre>' . htmlspecialchars($text) . '</pre>');
+	public $rules = array(
+		'bold'   => "/'''(.+?)'''/",
+		'italic' => "/''(.+?)''/",
+		'underline' => '/__(.+?)__/',
+		'externalwithtext' => '#\[((?:https?|irc|ftp)://.+?) (.+?)\]#',
+		'externalnotext' => '#\[((?:https?|irc|ftp)://.+?)\]#',
+		'mailtonotext' => '#\[mailto:([^ \]]+?)\]#',
+		'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#',
+		'hr' => '/^[-]{4,} *$/m',
+		'code' => '/^(?:<code>(?:\r?\n)?|<pre>)(.+?)(?:<\/pre>|(?:\r?\n)?<\/code>)$/mis'
+	);
+	
+	private $blockquote_rand_id;
+	
+	public function lang(&$text)
+	{
+		global $lang;
+		
+		preg_match_all('/<lang (?:code|id)="([a-z0-9_-]+)">([\w\W]+?)<\/lang>/', $text, $langmatch);
+		foreach ( $langmatch[0] as $i => $match )
+		{
+			if ( $langmatch[1][$i] == $lang->lang_code )
+			{
+				$text = str_replace_once($match, $langmatch[2][$i], $text);
+			}
+			else
+			{
+				$text = str_replace_once($match, '', $text);
+			}
+		}
+		
+		return array();
+	}
+	
+	public function templates(&$text)
+	{
+		$template_regex = "/\{\{(.+)((\n|\|[ ]*([A-z0-9]+)[ ]*=[ ]*(.+))*)\}\}/isU";
+		$i = 0;
+		while ( preg_match($template_regex, $text, $match) )
+		{
+			$i++;
+			if ( $i == 5 )
+				break;
+			$text = RenderMan::include_templates($text);
+		}
+		
+		return array();
+	}
+	
+	public function heading(&$text)
+	{
+		if ( !preg_match_all('/^(={1,6}) *(.+?) *\\1 *$/m', $text, $results) )
+			return array();
+		
+		$headings = array();
+		foreach ( $results[0] as $i => $match )
+		{
+			$headings[] = array(
+					'level' => strlen($results[1][$i]),
+					'text' => $results[2][$i]
+				);
+		}
+		
+		$text = Carpenter::tokenize($text, $results[0]);
+		
+		return $headings;
+	}
 	
-    RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw, true);
-    
-    // This is potentially a hack. It allows the parser to stick in <_paragraph_bypass> tags
-    // to prevent the paragraph parser from interfering with pretty HTML generated elsewhere.
-    RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
-    
-    $startcond = "(?!(?:[\\r\\n]|\{_paragraph_bypass:[a-f0-9]{32}:[0-9]+\}|[ ]*<\/?(?:$blocklevel)(?: .+>|>)))";
-    $regex = "/^
-                $startcond        # line start condition - do not match if the line starts with the condition above
-                .+?               # body text
-                (?:
-                  \\n             # additional lines
-                  $startcond      # make sure of only one newline in a row, and end the paragraph if a new line fails the start condition
-                  .*?
-                )*                # keep going until it fails
-              $
-              /mx";
-    
-    if ( !preg_match_all($regex, $text, $matches) )
-    {
-      RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
-      return array();
-    }
-    
-    // Debugging :)
-    // die('<pre>' . htmlspecialchars($text) . "\n-----------------------------------------------------------\n" . htmlspecialchars(print_r($matches, true)) . '</pre>');
-    
-    // restore stripped
-    RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
-    
-    // tokenize
-    $text = Carpenter::tokenize($text, $matches[0]);
-    
-    return $matches[0];
-  }
+	public function multilist(&$text)
+	{
+		// Match entire lists
+		$regex = '/^
+								([:#\*])+     # Initial list delimiter
+								[ ]*
+								.+?
+								(?:
+									\r?\n
+									(?:\\1|[ ]{2,})
+									[ ]*
+									.+?)*
+								$/mx';
+		
+		if ( !preg_match_all($regex, $text, $lists) )
+			return array();
+		
+		$types = array(
+				'*' => 'unordered',
+				'#' => 'ordered',
+				':' => 'indent'
+			);
+		
+		$pieces = array();
+		foreach ( $lists[0] as $i => $list )
+		{
+			$token = $lists[1][$i];
+			$piece = array(
+					'type' => $types[$token],
+					'items' => array()
+				);
+			
+			// convert windows newlines to unix
+			$list = str_replace("\r\n", "\n", $list);
+			$items_pre = explode("\n", $list);
+			$items = array();
+			// first pass, go through and combine items that are newlined
+			foreach ( $items_pre as $item )
+			{
+				if ( substr($item, 0, 1) == $token )
+				{
+					$items[] = $item;
+				}
+				else
+				{
+					// it's a continuation of the previous LI. Don't need to worry about
+					// undefined indices here since the regex should filter out all invalid
+					// markup. Just append this line to the previous.
+					$items[ count($items) - 1 ] .= "\n" . trim($item);
+				}
+			}
+			
+			// second pass, separate items and tokens
+			unset($items_pre);
+			foreach ( $items as $item )
+			{
+				// get the depth
+				$itemtoken = preg_replace('/^([#:\*]+).*$/s', '$1', $item);
+				// get the text
+				$itemtext = trim(substr($item, strlen($itemtoken)));
+				$piece['items'][] = array(
+						// depth starts at 1
+						'depth' => strlen($itemtoken),
+						'text' => $itemtext
+					);
+			}
+			$pieces[] = $piece;
+		}
+		
+		$text = Carpenter::tokenize($text, $lists[0]);
+		
+		return $pieces;
+	}
+	
+	public function blockquote(&$text)
+	{
+		$rand_id = hexencode(AESCrypt::randkey(16), '', '');
+		
+		while ( preg_match_all('/^(?:(>+) *.+(?:\r?\n|$))+/m', $text, $quotes) )
+		{
+			foreach ( $quotes[0] as $quote )
+			{
+				$piece = trim(preg_replace('/^> */m', '', $quote));
+				$text = str_replace_once($quote, "{blockquote:$rand_id}\n$piece\n{/blockquote:$rand_id}\n", $text);
+			}
+		}
+		//die('<pre>' . htmlspecialchars($text) . '</pre>');
+		
+		$this->blockquote_rand_id = $rand_id;
+	}
+	
+	public function blockquotepost(&$text)
+	{
+		return $this->blockquote_rand_id;
+	}
+	
+	public function paragraph(&$text)
+	{
+		// The trick with paragraphs is to not turn things into them when a block level element already wraps the block of text.
+		// First we need a list of block level elements (http://htmlhelp.com/reference/html40/block.html + some Enano extensions)
+		$blocklevel = 'address|blockquote|center|code|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|li|ol|p|pre|table|ul|tr|td|th|tbody|thead|tfoot';
+		
+		// Wrap all block level tags
+		RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
+		
+		// Find all opening and closing tags
+		
+		$regex = ";(<(?:/(?:$blocklevel)|(?:$blocklevel)(?: [^>]*?)?)>);s";
+								
+		// oh. and we're using this tokens thing because for identical matches, the first match will
+		// get wrapped X number of times instead of all matches getting wrapped once; replacing each
+		// with a unique token id remedies this
+		
+		$tokens = array();
+		$rand_id = sha1(microtime() . mt_rand());
+		$tag_stack = array();
+		
+		if ( $text_split = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE) )
+		{
+			$text = '';
+			// go through the text, extract tag names, and push them to a stack.
+			foreach ( $text_split as $splitpart )
+			{
+				if ( preg_match(";^<(/)?($blocklevel)( |>);i", $splitpart, $match) )
+				{
+					$tagname = $match[2];
+					if ( $match[1] == '/' )
+					{
+						// closing tag
+						if ( $tagname != ($top = array_pop($tag_stack)) )
+						{
+							// invalid - push back
+							array_push($tag_stack, $top);
+						}
+						else
+						{
+							// valid - if stack's at zero, add a </_paragraph_bypass>
+							if ( count($tag_stack) == 0 )
+								$splitpart .= '</_paragraph_bypass>';
+						}
+					}
+					else
+					{
+						// push
+						array_push($tag_stack, $tagname);
+						if ( count($tag_stack) == 1 )
+							$splitpart = '<_paragraph_bypass>' . $splitpart;
+					}
+				}
+				$text .= $splitpart;
+			}
+			//echo '<pre>' . htmlspecialchars(print_r($text, true)) . '</pre>';
+		}
+		
+		// All things that should be para-bypassed now are surrounded by _paragraph_bypass tags.
+		
+		// die('<pre>' . htmlspecialchars($text) . '</pre>');
+	
+		RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw, true);
+		
+		// This is potentially a hack. It allows the parser to stick in <_paragraph_bypass> tags
+		// to prevent the paragraph parser from interfering with pretty HTML generated elsewhere.
+		RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
+		
+		$startcond = "(?!(?:[\\r\\n]|\{_paragraph_bypass:[a-f0-9]{32}:[0-9]+\}|[ ]*<\/?(?:$blocklevel)(?: .+>|>)))";
+		$regex = "/^
+								$startcond        # line start condition - do not match if the line starts with the condition above
+								.+?               # body text
+								(?:
+									\\n             # additional lines
+									$startcond      # make sure of only one newline in a row, and end the paragraph if a new line fails the start condition
+									.*?
+								)*                # keep going until it fails
+							$
+							/mx";
+		
+		if ( !preg_match_all($regex, $text, $matches) )
+		{
+			RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
+			return array();
+		}
+		
+		// Debugging :)
+		// die('<pre>' . htmlspecialchars($text) . "\n-----------------------------------------------------------\n" . htmlspecialchars(print_r($matches, true)) . '</pre>');
+		
+		// restore stripped
+		RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
+		
+		// tokenize
+		$text = Carpenter::tokenize($text, $matches[0]);
+		
+		return $matches[0];
+	}
 }
 
 function parser_mediawiki_xhtml_image($text)
 {
-  $text = RenderMan::process_image_tags($text, $taglist);
-  $text = RenderMan::process_imgtags_stage2($text, $taglist);
-  return $text;
+	$text = RenderMan::process_image_tags($text, $taglist);
+	$text = RenderMan::process_imgtags_stage2($text, $taglist);
+	return $text;
 }
 
 function parser_mediawiki_xhtml_tables($text)
 {
-  return process_tables($text);
+	return process_tables($text);
 }