includes/wikiengine/parse_mediawiki.php
changeset 1227 bdac73ed481e
parent 1217 feeb49aa6270
child 1231 4797a4a88533
equal deleted inserted replaced
1226:de56132c008d 1227:bdac73ed481e
    11  * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
    11  * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
    12  */
    12  */
    13 
    13 
    /**
     * MediaWiki-flavoured parse rules for the Carpenter wikitext engine.
     *
     * Simple inline markup is described declaratively in $rules as PCRE patterns.
     * Constructs that need real logic are implemented as methods named after the
     * rule. Each such method receives the page text by reference, may rewrite it
     * in place, and returns whatever data was extracted for that rule.
     */
    class Carpenter_Parse_MediaWiki
    {
        /**
         * Regex-only rules, keyed by rule name. Each value is a complete PCRE
         * pattern (delimiters and modifiers included).
         * @var array
         */
        public $rules = array(
            'bold'   => "/'''(.+?)'''/",
            'italic' => "/''(.+?)''/",
            'underline' => '/__(.+?)__/',
            'externalwithtext' => '#\[((?:https?|irc|ftp)://.+?) (.+?)\]#',
            'externalnotext' => '#\[((?:https?|irc|ftp)://.+?)\]#',
            'mailtonotext' => '#\[mailto:([^ \]]+?)\]#',
            'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#',
            'hr' => '/^[-]{4,} *$/m',
            'code' => '/^(?:<code>(?:\r?\n)?|<pre>)(.+?)(?:<\/pre>|(?:\r?\n)?<\/code>)$/mis'
        );

        /**
         * Identifier embedded in the {blockquote:ID} markers written by
         * blockquote(); handed back to the caller by blockquotepost().
         * @var string
         */
        private $blockquote_rand_id;

        /**
         * Resolve <lang code="xx">...</lang> (or id="xx") sections.
         *
         * A section whose code equals the global $lang->lang_code is replaced by
         * its inner text; every other section is removed.
         *
         * @param string $text Page text, modified in place.
         * @return array Always an empty array.
         */
        public function lang(&$text)
        {
            global $lang;

            preg_match_all('/<lang (?:code|id)="([a-z0-9_-]+)">([\w\W]+?)<\/lang>/', $text, $langmatch);
            foreach ( $langmatch[0] as $i => $match )
            {
                if ( $langmatch[1][$i] == $lang->lang_code )
                {
                    // section for the active language: keep only its body
                    $text = str_replace_once($match, $langmatch[2][$i], $text);
                }
                else
                {
                    // section for some other language: drop it entirely
                    $text = str_replace_once($match, '', $text);
                }
            }

            return array();
        }

        /**
         * Expand {{template}} inclusions.
         *
         * While the text still looks like it contains a template call,
         * RenderMan::include_templates() is applied again. The counter stops the
         * loop before a fifth pass, so at most four passes are made.
         *
         * @param string $text Page text, modified in place.
         * @return array Always an empty array.
         */
        public function templates(&$text)
        {
            // NOTE(review): the class [A-z0-9] also spans the ASCII punctuation that
            // sits between 'Z' and 'a'. Left unchanged because this pattern only
            // decides whether another expansion pass is attempted.
            $template_regex = "/\{\{(.+)((\n|\|[ ]*([A-z0-9]+)[ ]*=[ ]*(.+))*)\}\}/isU";
            $i = 0;
            while ( preg_match($template_regex, $text, $match) )
            {
                $i++;
                if ( $i == 5 )
                    break;
                $text = RenderMan::include_templates($text);
            }

            return array();
        }

        /**
         * Extract "== Heading ==" lines (one to six equals signs on each side).
         *
         * @param string $text Page text; on a match it is replaced by the result of
         *                     Carpenter::tokenize() for the matched lines.
         * @return array One entry per heading: array('level' => int, 'text' => string).
         *               Empty when the text has no headings.
         */
        public function heading(&$text)
        {
            if ( !preg_match_all('/^(={1,6}) *(.+?) *\\1 *$/m', $text, $results) )
                return array();

            $headings = array();
            foreach ( $results[0] as $i => $match )
            {
                $headings[] = array(
                        // level is the number of '=' characters in the opening run
                        'level' => strlen($results[1][$i]),
                        'text' => $results[2][$i]
                    );
            }

            $text = Carpenter::tokenize($text, $results[0]);

            return $headings;
        }

        /**
         * Extract bulleted (*), numbered (#) and indented (:) lists.
         *
         * @param string $text Page text; on a match it is replaced by the result of
         *                     Carpenter::tokenize() for the matched lists.
         * @return array One entry per list:
         *               array('type' => 'unordered'|'ordered'|'indent',
         *                     'items' => array(array('depth' => int, 'text' => string), ...)).
         *               Empty when the text has no lists.
         */
        public function multilist(&$text)
        {
            // Match entire lists. Because the capture group is repeated, \1 (and
            // $lists[1]) hold the LAST delimiter character of the opening run.
            $regex = '/^
                ([:#\*])+     # Initial list delimiter
                [ ]*
                .+?
                (?:
                    \r?\n
                    (?:\\1|[ ]{2,})
                    [ ]*
                    .+?)*
                $/mx';

            if ( !preg_match_all($regex, $text, $lists) )
                return array();

            $types = array(
                    '*' => 'unordered',
                    '#' => 'ordered',
                    ':' => 'indent'
                );

            $pieces = array();
            foreach ( $lists[0] as $i => $list )
            {
                $token = $lists[1][$i];
                $piece = array(
                        'type' => $types[$token],
                        'items' => array()
                    );

                // convert windows newlines to unix
                $list = str_replace("\r\n", "\n", $list);
                $items_pre = explode("\n", $list);
                $items = array();
                // first pass, go through and combine items that are newlined
                foreach ( $items_pre as $item )
                {
                    // The very first line always opens an item. With a mixed prefix
                    // such as "*#" its first character differs from $token, and
                    // treating it as a continuation would write to $items[-1].
                    if ( substr($item, 0, 1) == $token || count($items) == 0 )
                    {
                        $items[] = $item;
                    }
                    else
                    {
                        // continuation of the previous item: append the trimmed line
                        $items[ count($items) - 1 ] .= "\n" . trim($item);
                    }
                }

                // second pass, separate items and tokens
                unset($items_pre);
                foreach ( $items as $item )
                {
                    // the leading run of delimiter characters gives the depth
                    $itemtoken = preg_replace('/^([#:\*]+).*$/s', '$1', $item);
                    // everything after that run is the item text
                    $itemtext = trim(substr($item, strlen($itemtoken)));
                    $piece['items'][] = array(
                            // depth starts at 1
                            'depth' => strlen($itemtoken),
                            'text' => $itemtext
                        );
                }
                $pieces[] = $piece;
            }

            $text = Carpenter::tokenize($text, $lists[0]);

            return $pieces;
        }

        /**
         * Rewrite "> quoted" line runs into {blockquote:ID} ... {/blockquote:ID}
         * markers, where ID is a fresh identifier generated per call.
         *
         * Each pass strips one leading '>' from every quoted line, and the loop
         * repeats until no quoted lines are left, so nested quotes become nested
         * marker pairs. The identifier is stored for blockquotepost().
         *
         * @param string $text Page text, modified in place.
         * @return void
         */
        public function blockquote(&$text)
        {
            $rand_id = hexencode(AESCrypt::randkey(16), '', '');

            while ( preg_match_all('/^(?:(>+) *.+(?:\r?\n|$))+/m', $text, $quotes) )
            {
                foreach ( $quotes[0] as $quote )
                {
                    // remove one quoting level from every line of this run
                    $piece = trim(preg_replace('/^> */m', '', $quote));
                    $text = str_replace_once($quote, "{blockquote:$rand_id}\n$piece\n{/blockquote:$rand_id}\n", $text);
                }
            }

            $this->blockquote_rand_id = $rand_id;
        }

        /**
         * Return the identifier stored by the most recent blockquote() call.
         *
         * @param string $text Page text (not read or modified here).
         * @return string|null The stored identifier, or null if blockquote() has not run.
         */
        public function blockquotepost(&$text)
        {
            return $this->blockquote_rand_id;
        }

        /**
         * Find the runs of text that should become paragraphs.
         *
         * Top-level block elements are first wrapped in <_paragraph_bypass> tags and
         * passed through RenderMan::tag_strip(), so that the paragraph pattern does
         * not wrap text that already sits inside a block element.
         *
         * @param string $text Page text; on a match it is replaced by the result of
         *                     Carpenter::tokenize() for the matched paragraphs.
         * @return array The matched paragraph strings; empty when nothing matched.
         */
        public function paragraph(&$text)
        {
            // The trick with paragraphs is to not turn things into them when a block level element already wraps the block of text.
            // First we need a list of block level elements (http://htmlhelp.com/reference/html40/block.html + some Enano extensions)
            $blocklevel = 'address|blockquote|center|code|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|li|ol|p|pre|table|ul|tr|td|th|tbody|thead|tfoot';

            // Strip bypass sections that are already present in the text
            RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);

            // Find all opening and closing block-level tags
            $regex = ";(<(?:/(?:$blocklevel)|(?:$blocklevel)(?: [^>]*?)?)>);s";

            // names of the block-level tags that are currently open
            $tag_stack = array();

            if ( $text_split = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE) )
            {
                $text = '';
                // go through the text, extract tag names, and push them to a stack.
                foreach ( $text_split as $splitpart )
                {
                    if ( preg_match(";^<(/)?($blocklevel)( |>);i", $splitpart, $match) )
                    {
                        $tagname = $match[2];
                        if ( $match[1] == '/' )
                        {
                            // Closing tag. A stray one with nothing open is ignored:
                            // array_pop() would return NULL, and pushing that NULL back
                            // would corrupt the depth count for the rest of the page.
                            if ( count($tag_stack) > 0 )
                            {
                                $top = array_pop($tag_stack);
                                if ( $tagname != $top )
                                {
                                    // does not close the innermost open tag - push back
                                    array_push($tag_stack, $top);
                                }
                                else if ( count($tag_stack) == 0 )
                                {
                                    // closed the outermost element - end the bypass section
                                    $splitpart .= '</_paragraph_bypass>';
                                }
                            }
                        }
                        else
                        {
                            // opening tag; the outermost one starts a bypass section
                            array_push($tag_stack, $tagname);
                            if ( count($tag_stack) == 1 )
                                $splitpart = '<_paragraph_bypass>' . $splitpart;
                        }
                    }
                    $text .= $splitpart;
                }
            }

            // All things that should be para-bypassed now are surrounded by _paragraph_bypass tags.

            RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw, true);

            // This is potentially a hack. It allows the parser to stick in <_paragraph_bypass> tags
            // to prevent the paragraph parser from interfering with pretty HTML generated elsewhere.
            RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);

            // NOTE(review): $startcond is only used inside the /x pattern below, where
            // the unbracketed space in "(?: .+>|>)" is ignored as pattern whitespace.
            // Left unchanged because tags written like "<hr/>" depend on that.
            $startcond = "(?!(?:[\\r\\n]|\{_paragraph_bypass:[a-f0-9]{32}:[0-9]+\}|[ ]*<\/?(?:$blocklevel)(?: .+>|>)))";
            $regex = "/^
                $startcond        # line start condition - do not match if the line starts with the condition above
                .+?               # body text
                (?:
                    \\n             # additional lines
                    $startcond      # make sure of only one newline in a row, and end the paragraph if a new line fails the start condition
                    .*?
                )*                # keep going until it fails
                $
                /mx";

            if ( !preg_match_all($regex, $text, $matches) )
            {
                RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
                return array();
            }

            // restore stripped
            RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);

            // tokenize
            $text = Carpenter::tokenize($text, $matches[0]);

            return $matches[0];
        }
    }
   278 
   278 
   /**
    * Run both image-tag processing stages over the text.
    *
    * $taglist is handed to RenderMan::process_image_tags() first and then to
    * RenderMan::process_imgtags_stage2() together with the first stage's result.
    *
    * @param string $text Text to process.
    * @return string Result of the second stage.
    */
   function parser_mediawiki_xhtml_image($text)
   {
       $after_stage1 = RenderMan::process_image_tags($text, $taglist);
       $after_stage2 = RenderMan::process_imgtags_stage2($after_stage1, $taglist);

       return $after_stage2;
   }
   285 
   285 
   /**
    * Pass the text through process_tables() and return its result.
    *
    * @param string $text Text to process.
    * @return mixed Whatever process_tables() returns for $text.
    */
   function parser_mediawiki_xhtml_tables($text)
   {
       $with_tables = process_tables($text);

       return $with_tables;
   }
   290 
   290