includes/wikiengine/TagSanitizer.php
changeset 1227 bdac73ed481e
parent 1127 4b858862c35c
child 1382 78fbedb876f3
equal deleted inserted replaced
1226:de56132c008d 1227:bdac73ed481e
    23  |&\#x([0-9A-Za-z]+);
    23  |&\#x([0-9A-Za-z]+);
    24  |&\#X([0-9A-Za-z]+);
    24  |&\#X([0-9A-Za-z]+);
    25  |(&)/x' );
    25  |(&)/x' );
    26 
    26 
    27 define( 'MW_ATTRIBS_REGEX',
    27 define( 'MW_ATTRIBS_REGEX',
    28   "/(?:^|$space)($attrib+)
    28 	"/(?:^|$space)($attrib+)
    29     ($space*=$space*
    29 		($space*=$space*
    30     (?:
    30 		(?:
    31      # The attribute value: quoted or alone
    31  		# The attribute value: quoted or alone
    32       \"([^<\"]*)\"
    32 			\"([^<\"]*)\"
    33      | '([^<']*)'
    33  		| '([^<']*)'
    34      |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
    34  		|  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
    35      |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
    35  		|  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
    36                # colors are specified like this.
    36  							# colors are specified like this.
    37                # We'll be normalizing it.
    37  							# We'll be normalizing it.
    38     )
    38 		)
    39      )?(?=$space|\$)/sx" );
    39  		)?(?=$space|\$)/sx" );
    40 
    40 
    41 /**
    41 /**
    42  * Take a tag soup fragment listing an HTML element's attributes
    42  * Take a tag soup fragment listing an HTML element's attributes
    43  * and normalize it to well-formed XML, discarding unwanted attributes.
    43  * and normalize it to well-formed XML, discarding unwanted attributes.
    44  * Output is safe for further wikitext processing, with escaping of
    44  * Output is safe for further wikitext processing, with escaping of
    56  * @param string $text
    56  * @param string $text
    57  * @param string $element
    57  * @param string $element
    58  * @return string
    58  * @return string
    59  */
    59  */
    60 function fixTagAttributes( $text, $element ) {
    60 function fixTagAttributes( $text, $element ) {
    61   if( trim( $text ) == '' ) {
    61 	if( trim( $text ) == '' ) {
    62     return '';
    62 		return '';
    63   }
    63 	}
    64   
    64 	
    65   $stripped = validateTagAttributes(
    65 	$stripped = validateTagAttributes(
    66     decodeTagAttributes( $text ), $element );
    66 		decodeTagAttributes( $text ), $element );
    67   
    67 	
    68   $attribs = array();
    68 	$attribs = array();
    69   foreach( $stripped as $attribute => $value ) {
    69 	foreach( $stripped as $attribute => $value ) {
    70     $encAttribute = htmlspecialchars( $attribute );
    70 		$encAttribute = htmlspecialchars( $attribute );
    71     $encValue = safeEncodeAttribute( $value );
    71 		$encValue = safeEncodeAttribute( $value );
    72     
    72 		
    73     $attribs[] = "$encAttribute=".'"'."$encValue".'"'.""; // "
    73 		$attribs[] = "$encAttribute=".'"'."$encValue".'"'.""; // "
    74   }
    74 	}
    75   return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
    75 	return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
    76 }
    76 }
    77 
    77 
    78 /**
    78 /**
    79  * Encode an attribute value for HTML tags, with extra armoring
    79  * Encode an attribute value for HTML tags, with extra armoring
    80  * against further wiki processing.
    80  * against further wiki processing.
    81  * @param $text
    81  * @param $text
    82  * @return HTML-encoded text fragment
    82  * @return HTML-encoded text fragment
    83  */
    83  */
    84 function safeEncodeAttribute( $text ) {
    84 function safeEncodeAttribute( $text ) {
    85   $encValue= encodeAttribute( $text );
    85 	$encValue= encodeAttribute( $text );
    86   
    86 	
    87   # Templates and links may be expanded in later parsing,
    87 	# Templates and links may be expanded in later parsing,
    88   # creating invalid or dangerous output. Suppress this.
    88 	# creating invalid or dangerous output. Suppress this.
    89   $encValue = strtr( $encValue, array(
    89 	$encValue = strtr( $encValue, array(
    90     '<'    => '&lt;',   // This should never happen,
    90 		'<'    => '&lt;',   // This should never happen,
    91     '>'    => '&gt;',   // we've received invalid input
    91 		'>'    => '&gt;',   // we've received invalid input
    92     '"'    => '&quot;', // which should have been escaped.
    92 		'"'    => '&quot;', // which should have been escaped.
    93     '{'    => '&#123;',
    93 		'{'    => '&#123;',
    94     '['    => '&#91;',
    94 		'['    => '&#91;',
    95     "''"   => '&#39;&#39;',
    95 		"''"   => '&#39;&#39;',
    96     'ISBN' => '&#73;SBN',
    96 		'ISBN' => '&#73;SBN',
    97     'RFC'  => '&#82;FC',
    97 		'RFC'  => '&#82;FC',
    98     'PMID' => '&#80;MID',
    98 		'PMID' => '&#80;MID',
    99     '|'    => '&#124;',
    99 		'|'    => '&#124;',
   100     '__'   => '&#95;_',
   100 		'__'   => '&#95;_',
   101   ) );
   101 	) );
   102 
   102 
   103   return $encValue;
   103 	return $encValue;
   104 }
   104 }
   105 
   105 
   106 /**
   106 /**
   107  * Encode an attribute value for HTML output.
   107  * Encode an attribute value for HTML output.
   108  * @param $text
   108  * @param $text
   109  * @return HTML-encoded text fragment
   109  * @return HTML-encoded text fragment
   110  */
   110  */
   111 function encodeAttribute( $text ) {
   111 function encodeAttribute( $text ) {
   112   
   112 	
   113   // In Enano 1.0.3, added this cheapo hack to keep ampersands
   113 	// In Enano 1.0.3, added this cheapo hack to keep ampersands
   114   // from being double-sanitized. Thanks to markybob from #deluge.
   114 	// from being double-sanitized. Thanks to markybob from #deluge.
   115   
   115 	
   116   // htmlspecialchars() the "manual" way
   116 	// htmlspecialchars() the "manual" way
   117   $encValue = strtr( $text, array(
   117 	$encValue = strtr( $text, array(
   118     '&amp;'  => '&',
   118 		'&amp;'  => '&',
   119     '&quot;' => '"',
   119 		'&quot;' => '"',
   120     '&lt;'   => '<',
   120 		'&lt;'   => '<',
   121     '&gt;'   => '>',
   121 		'&gt;'   => '>',
   122     '&#039;' => "'"
   122 		'&#039;' => "'"
   123   ) );
   123 	) );
   124   
   124 	
   125   $encValue = strtr( $text, array(
   125 	$encValue = strtr( $text, array(
   126     '&' => '&amp;',
   126 		'&' => '&amp;',
   127     '"' => '&quot;',
   127 		'"' => '&quot;',
   128     '<' => '&lt;',
   128 		'<' => '&lt;',
   129     '>' => '&gt;',
   129 		'>' => '&gt;',
   130     "'" => '&#039;'
   130 		"'" => '&#039;'
   131   ) );
   131 	) );
   132   
   132 	
   133   
   133 	
   134   // Whitespace is normalized during attribute decoding,
   134 	// Whitespace is normalized during attribute decoding,
   135   // so if we've been passed non-spaces we must encode them
   135 	// so if we've been passed non-spaces we must encode them
   136   // ahead of time or they won't be preserved.
   136 	// ahead of time or they won't be preserved.
   137   $encValue = strtr( $encValue, array(
   137 	$encValue = strtr( $encValue, array(
   138     "\n" => '&#10;',
   138 		"\n" => '&#10;',
   139     "\r" => '&#13;',
   139 		"\r" => '&#13;',
   140     "\t" => '&#9;',
   140 		"\t" => '&#9;',
   141   ) );
   141 	) );
   142   
   142 	
   143   return $encValue;
   143 	return $encValue;
   144 }
   144 }
   145 
   145 
   146 function unstripForHTML( $text ) {
   146 function unstripForHTML( $text ) {
   147   global $mStripState;
   147 	global $mStripState;
   148   $text = unstrip( $text, $mStripState );
   148 	$text = unstrip( $text, $mStripState );
   149   $text = unstripNoWiki( $text, $mStripState );
   149 	$text = unstripNoWiki( $text, $mStripState );
   150   return $text;
   150 	return $text;
   151 }
   151 }
   152 
   152 
   153 /**
   153 /**
   154  * Always call this after unstrip() to preserve the order
   154  * Always call this after unstrip() to preserve the order
   155  *
   155  *
   156  * @private
   156  * @private
   157  */
   157  */
   158 function unstripNoWiki( $text, &$state ) {
   158 function unstripNoWiki( $text, &$state ) {
   159   if ( !isset( $state['nowiki'] ) ) {
   159 	if ( !isset( $state['nowiki'] ) ) {
   160     return $text;
   160 		return $text;
   161   }
   161 	}
   162 
   162 
   163   # TODO: good candidate for FSS
   163 	# TODO: good candidate for FSS
   164   $text = strtr( $text, $state['nowiki'] );
   164 	$text = strtr( $text, $state['nowiki'] );
   165   
   165 	
   166   return $text;
   166 	return $text;
   167 }
   167 }
   168 
   168 
   169 /**
   169 /**
   170  * Take an array of attribute names and values and normalize or discard
   170  * Take an array of attribute names and values and normalize or discard
   171  * illegal values for the given element type.
   171  * illegal values for the given element type.
   179  *
   179  *
   180  * @todo Check for legal values where the DTD limits things.
   180  * @todo Check for legal values where the DTD limits things.
   181  * @todo Check for unique id attribute :P
   181  * @todo Check for unique id attribute :P
   182  */
   182  */
   183 function validateTagAttributes( $attribs, $element ) {
   183 function validateTagAttributes( $attribs, $element ) {
   184   $whitelist = array_flip( attributeWhitelist( $element ) );
   184 	$whitelist = array_flip( attributeWhitelist( $element ) );
   185   $out = array();
   185 	$out = array();
   186   foreach( $attribs as $attribute => $value ) {
   186 	foreach( $attribs as $attribute => $value ) {
   187     if( !isset( $whitelist[$attribute] ) ) {
   187 		if( !isset( $whitelist[$attribute] ) ) {
   188       continue;
   188 			continue;
   189     }
   189 		}
   190     # Strip javascript "expression" from stylesheets.
   190 		# Strip javascript "expression" from stylesheets.
   191     # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
   191 		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
   192     if( $attribute == 'style' ) {
   192 		if( $attribute == 'style' ) {
   193       $value = checkCss( $value );
   193 			$value = checkCss( $value );
   194       if( $value === false ) {
   194 			if( $value === false ) {
   195         # haxx0r
   195 				# haxx0r
   196         continue;
   196 				continue;
   197       }
   197 			}
   198     }
   198 		}
   199 
   199 
   200     if ( $attribute === 'id' )
   200 		if ( $attribute === 'id' )
   201       $value = escapeId( $value );
   201 			$value = escapeId( $value );
   202 
   202 
   203     // If this attribute was previously set, override it.
   203 		// If this attribute was previously set, override it.
   204     // Output should only have one attribute of each name.
   204 		// Output should only have one attribute of each name.
   205     $out[$attribute] = $value;
   205 		$out[$attribute] = $value;
   206   }
   206 	}
   207   return $out;
   207 	return $out;
   208 }
   208 }
   209 
   209 
   210 /**
   210 /**
   211  * Pick apart some CSS and check it for forbidden or unsafe structures.
   211  * Pick apart some CSS and check it for forbidden or unsafe structures.
   212  * Returns a sanitized string, or false if it was just too evil.
   212  * Returns a sanitized string, or false if it was just too evil.
   215  *
   215  *
   216  * @param string $value
   216  * @param string $value
   217  * @return mixed
   217  * @return mixed
   218  */
   218  */
   219 function checkCss( $value ) {
   219 function checkCss( $value ) {
   220   $stripped = decodeCharReferences( $value );
   220 	$stripped = decodeCharReferences( $value );
   221 
   221 
   222   // Remove any comments; IE gets token splitting wrong
   222 	// Remove any comments; IE gets token splitting wrong
   223   $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped );
   223 	$stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped );
   224   $value = $stripped;
   224 	$value = $stripped;
   225 
   225 
   226   // ... and continue checks
   226 	// ... and continue checks
   227   $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
   227 	$stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
   228     'codepointToUtf8(hexdec("$1"))', $stripped );
   228 		'codepointToUtf8(hexdec("$1"))', $stripped );
   229   $stripped = str_replace( '\\', '', $stripped );
   229 	$stripped = str_replace( '\\', '', $stripped );
   230   if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
   230 	if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
   231       $stripped ) ) {
   231 			$stripped ) ) {
   232     # haxx0r
   232 		# haxx0r
   233     return false;
   233 		return false;
   234   }
   234 	}
   235   
   235 	
   236   return $value;
   236 	return $value;
   237 }
   237 }
   238 
   238 
   239 /**
   239 /**
   240  * Decode any character references, numeric or named entities,
   240  * Decode any character references, numeric or named entities,
   241  * in the text and return a UTF-8 string.
   241  * in the text and return a UTF-8 string.
   244  * @return string
   244  * @return string
   245  * @access public
   245  * @access public
   246  * @static
   246  * @static
   247  */
   247  */
   248 function decodeCharReferences( $text ) {
   248 function decodeCharReferences( $text ) {
   249   return preg_replace_callback(
   249 	return preg_replace_callback(
   250     MW_CHAR_REFS_REGEX,
   250 		MW_CHAR_REFS_REGEX,
   251     'decodeCharReferencesCallback',
   251 		'decodeCharReferencesCallback',
   252     $text );
   252 		$text );
   253 }
   253 }
   254 
   254 
   255 /**
   255 /**
   256  * Fetch the whitelist of acceptable attributes for a given
   256  * Fetch the whitelist of acceptable attributes for a given
   257  * element name.
   257  * element name.
   258  *
   258  *
   259  * @param string $element
   259  * @param string $element
   260  * @return array
   260  * @return array
   261  */
   261  */
   262 function attributeWhitelist( $element ) {
   262 function attributeWhitelist( $element ) {
   263   static $list;
   263 	static $list;
   264   if( !isset( $list ) ) {
   264 	if( !isset( $list ) ) {
   265     $list = setupAttributeWhitelist();
   265 		$list = setupAttributeWhitelist();
   266   }
   266 	}
   267   return isset( $list[$element] )
   267 	return isset( $list[$element] )
   268     ? $list[$element]
   268 		? $list[$element]
   269     : array();
   269 		: array();
   270 }
   270 }
   271 
   271 
   272 /**
   272 /**
   273  * @todo Document it a bit
   273  * @todo Document it a bit
   274  * @return array
   274  * @return array
   275  */
   275  */
   276 function setupAttributeWhitelist() {
   276 function setupAttributeWhitelist() {
   277   global $db, $session, $paths, $template, $plugins;
   277 	global $db, $session, $paths, $template, $plugins;
   278   $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
   278 	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
   279   $block = array_merge( $common, array( 'align' ) );
   279 	$block = array_merge( $common, array( 'align' ) );
   280   $tablealign = array( 'align', 'char', 'charoff', 'valign' );
   280 	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
   281   $tablecell = array( 'abbr',
   281 	$tablecell = array( 'abbr',
   282                       'axis',
   282 											'axis',
   283                       'headers',
   283 											'headers',
   284                       'scope',
   284 											'scope',
   285                       'rowspan',
   285 											'rowspan',
   286                       'colspan',
   286 											'colspan',
   287                       'nowrap', # deprecated
   287 											'nowrap', # deprecated
   288                       'width',  # deprecated
   288 											'width',  # deprecated
   289                       'height', # deprecated
   289 											'height', # deprecated
   290                       'bgcolor' # deprecated
   290 											'bgcolor' # deprecated
   291                       );
   291 											);
   292 
   292 
   293   # Numbers refer to sections in HTML 4.01 standard describing the element.
   293 	# Numbers refer to sections in HTML 4.01 standard describing the element.
   294   # See: http://www.w3.org/TR/html4/
   294 	# See: http://www.w3.org/TR/html4/
   295   $whitelist = array (
   295 	$whitelist = array (
   296     # 7.5.4
   296 		# 7.5.4
   297     'div'        => $block,
   297 		'div'        => $block,
   298     'center'     => $common, # deprecated
   298 		'center'     => $common, # deprecated
   299     'span'       => $block, # ??
   299 		'span'       => $block, # ??
   300 
   300 
   301     # 7.5.5
   301 		# 7.5.5
   302     'h1'         => $block,
   302 		'h1'         => $block,
   303     'h2'         => $block,
   303 		'h2'         => $block,
   304     'h3'         => $block,
   304 		'h3'         => $block,
   305     'h4'         => $block,
   305 		'h4'         => $block,
   306     'h5'         => $block,
   306 		'h5'         => $block,
   307     'h6'         => $block,
   307 		'h6'         => $block,
   308 
   308 
   309     # 7.5.6
   309 		# 7.5.6
   310     # address
   310 		# address
   311 
   311 
   312     # 8.2.4
   312 		# 8.2.4
   313     # bdo
   313 		# bdo
   314 
   314 
   315     # 9.2.1
   315 		# 9.2.1
   316     'em'         => $common,
   316 		'em'         => $common,
   317     'strong'     => $common,
   317 		'strong'     => $common,
   318     'cite'       => $common,
   318 		'cite'       => $common,
   319     # dfn
   319 		# dfn
   320     'code'       => $common,
   320 		'code'       => $common,
   321     # samp
   321 		# samp
   322     # kbd
   322 		# kbd
   323     'var'        => $common,
   323 		'var'        => $common,
   324     # abbr
   324 		# abbr
   325     # acronym
   325 		# acronym
   326 
   326 
   327     # 9.2.2
   327 		# 9.2.2
   328     'blockquote' => array_merge( $common, array( 'cite' ) ),
   328 		'blockquote' => array_merge( $common, array( 'cite' ) ),
   329     # q
   329 		# q
   330 
   330 
   331     # 9.2.3
   331 		# 9.2.3
   332     'sub'        => $common,
   332 		'sub'        => $common,
   333     'sup'        => $common,
   333 		'sup'        => $common,
   334 
   334 
   335     # 9.3.1
   335 		# 9.3.1
   336     'p'          => $block,
   336 		'p'          => $block,
   337 
   337 
   338     # 9.3.2
   338 		# 9.3.2
   339     'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
   339 		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
   340 
   340 
   341     # 9.3.4
   341 		# 9.3.4
   342     'pre'        => array_merge( $common, array( 'width' ) ),
   342 		'pre'        => array_merge( $common, array( 'width' ) ),
   343 
   343 
   344     # 9.4
   344 		# 9.4
   345     'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
   345 		'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
   346     'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
   346 		'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
   347 
   347 
   348     # 10.2
   348 		# 10.2
   349     'ul'         => array_merge( $common, array( 'type' ) ),
   349 		'ul'         => array_merge( $common, array( 'type' ) ),
   350     'ol'         => array_merge( $common, array( 'type', 'start' ) ),
   350 		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
   351     'li'         => array_merge( $common, array( 'type', 'value' ) ),
   351 		'li'         => array_merge( $common, array( 'type', 'value' ) ),
   352 
   352 
   353     # 10.3
   353 		# 10.3
   354     'dl'         => $common,
   354 		'dl'         => $common,
   355     'dd'         => $common,
   355 		'dd'         => $common,
   356     'dt'         => $common,
   356 		'dt'         => $common,
   357 
   357 
   358     # 11.2.1
   358 		# 11.2.1
   359     'table'      => array_merge( $common,
   359 		'table'      => array_merge( $common,
   360               array( 'summary', 'width', 'border', 'frame',
   360 							array( 'summary', 'width', 'border', 'frame',
   361                   'rules', 'cellspacing', 'cellpadding',
   361 									'rules', 'cellspacing', 'cellpadding',
   362                   'align', 'bgcolor',
   362 									'align', 'bgcolor',
   363               ) ),
   363 							) ),
   364 
   364 
   365     # 11.2.2
   365 		# 11.2.2
   366     'caption'    => array_merge( $common, array( 'align' ) ),
   366 		'caption'    => array_merge( $common, array( 'align' ) ),
   367 
   367 
   368     # 11.2.3
   368 		# 11.2.3
   369     'thead'      => array_merge( $common, $tablealign ),
   369 		'thead'      => array_merge( $common, $tablealign ),
   370     'tfoot'      => array_merge( $common, $tablealign ),
   370 		'tfoot'      => array_merge( $common, $tablealign ),
   371     'tbody'      => array_merge( $common, $tablealign ),
   371 		'tbody'      => array_merge( $common, $tablealign ),
   372 
   372 
   373     # 11.2.4
   373 		# 11.2.4
   374     'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
   374 		'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
   375     'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
   375 		'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
   376 
   376 
   377     # 11.2.5
   377 		# 11.2.5
   378     'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
   378 		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
   379 
   379 
   380     # 11.2.6
   380 		# 11.2.6
   381     'td'         => array_merge( $common, $tablecell, $tablealign ),
   381 		'td'         => array_merge( $common, $tablecell, $tablealign ),
   382     'th'         => array_merge( $common, $tablecell, $tablealign ),
   382 		'th'         => array_merge( $common, $tablecell, $tablealign ),
   383     
   383 		
   384     # 12.2
   384 		# 12.2
   385     # added by dan
   385 		# added by dan
   386     'a'          => array_merge( $common, array( 'href', 'name' ) ),
   386 		'a'          => array_merge( $common, array( 'href', 'name' ) ),
   387     
   387 		
   388     # 13.2
   388 		# 13.2
   389     # added by dan
   389 		# added by dan
   390     'img'        => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),
   390 		'img'        => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),
   391 
   391 
   392     # 15.2.1
   392 		# 15.2.1
   393     'tt'         => $common,
   393 		'tt'         => $common,
   394     'b'          => $common,
   394 		'b'          => $common,
   395     'i'          => $common,
   395 		'i'          => $common,
   396     'big'        => $common,
   396 		'big'        => $common,
   397     'small'      => $common,
   397 		'small'      => $common,
   398     'strike'     => $common,
   398 		'strike'     => $common,
   399     's'          => $common,
   399 		's'          => $common,
   400     'u'          => $common,
   400 		'u'          => $common,
   401 
   401 
   402     # 15.2.2
   402 		# 15.2.2
   403     'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
   403 		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
   404     # basefont
   404 		# basefont
   405 
   405 
   406     # 15.3
   406 		# 15.3
   407     'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
   407 		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
   408 
   408 
   409     # XHTML Ruby annotation text module, simple ruby only.
   409 		# XHTML Ruby annotation text module, simple ruby only.
   410     # http://www.w3c.org/TR/ruby/
   410 		# http://www.w3c.org/TR/ruby/
   411     'ruby'       => $common,
   411 		'ruby'       => $common,
   412     # rbc
   412 		# rbc
   413     # rtc
   413 		# rtc
   414     'rb'         => $common,
   414 		'rb'         => $common,
   415     'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
   415 		'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
   416     'rp'         => $common,
   416 		'rp'         => $common,
   417     
   417 		
   418     # For compatibility with the XHTML parser.
   418 		# For compatibility with the XHTML parser.
   419     'nowiki'     => array(),
   419 		'nowiki'     => array(),
   420     'noinclude'  => array(),
   420 		'noinclude'  => array(),
   421     'nodisplay'  => array(),
   421 		'nodisplay'  => array(),
   422     'lang'       => array('code'),
   422 		'lang'       => array('code'),
   423     
   423 		
   424     # XHTML stuff
   424 		# XHTML stuff
   425     'acronym'    => $common
   425 		'acronym'    => $common
   426     );
   426 		);
   427   
   427 	
   428   // custom tags can be added by plugins
   428 	// custom tags can be added by plugins
   429   $code = $plugins->setHook('html_attribute_whitelist');
   429 	$code = $plugins->setHook('html_attribute_whitelist');
   430   foreach ( $code as $cmd )
   430 	foreach ( $code as $cmd )
   431   {
   431 	{
   432     eval($cmd);
   432 		eval($cmd);
   433   }
   433 	}
   434   
   434 	
   435   return $whitelist;
   435 	return $whitelist;
   436 }
   436 }
   437 
   437 
   438 /**
   438 /**
   439  * Given a value escape it so that it can be used in an id attribute and
   439  * Given a value escape it so that it can be used in an id attribute and
   440  * return it, this does not validate the value however (see first link)
   440  * return it, this does not validate the value however (see first link)
   450  *
   450  *
   451  * @param string $id
   451  * @param string $id
   452  * @return string
   452  * @return string
   453  */
   453  */
   454 function escapeId( $id ) {
   454 function escapeId( $id ) {
   455   static $replace = array(
   455 	static $replace = array(
   456     '%3A' => ':',
   456 		'%3A' => ':',
   457     '%' => '.'
   457 		'%' => '.'
   458   );
   458 	);
   459 
   459 
   460   $id = urlencode( decodeCharReferences( strtr( $id, ' ', '_' ) ) );
   460 	$id = urlencode( decodeCharReferences( strtr( $id, ' ', '_' ) ) );
   461 
   461 
   462   return str_replace( array_keys( $replace ), array_values( $replace ), $id );
   462 	return str_replace( array_keys( $replace ), array_values( $replace ), $id );
   463 }
   463 }
   464 
   464 
   465 /**
   465 /**
   466  * More or less "markup-safe" explode()
   466  * More or less "markup-safe" explode()
   467  * Ignores any instances of the separator inside <...>
   467  * Ignores any instances of the separator inside <...>
   468  * @param string $separator
   468  * @param string $separator
   469  * @param string $text
   469  * @param string $text
   470  * @return array
   470  * @return array
   471  */
   471  */
   472 function wfExplodeMarkup( $separator, $text ) {
   472 function wfExplodeMarkup( $separator, $text ) {
   473   $placeholder = "\x00";
   473 	$placeholder = "\x00";
   474   
   474 	
   475   // Just in case...
   475 	// Just in case...
   476   $text = str_replace( $placeholder, '', $text );
   476 	$text = str_replace( $placeholder, '', $text );
   477   
   477 	
   478   // Trim stuff
   478 	// Trim stuff
   479   $replacer = new ReplacerCallback( $separator, $placeholder );
   479 	$replacer = new ReplacerCallback( $separator, $placeholder );
   480   $cleaned = preg_replace_callback( '/(<.*?>)/', array( $replacer, 'go' ), $text );
   480 	$cleaned = preg_replace_callback( '/(<.*?>)/', array( $replacer, 'go' ), $text );
   481   
   481 	
   482   $items = explode( $separator, $cleaned );
   482 	$items = explode( $separator, $cleaned );
   483   foreach( $items as $i => $str ) {
   483 	foreach( $items as $i => $str ) {
   484     $items[$i] = str_replace( $placeholder, $separator, $str );
   484 		$items[$i] = str_replace( $placeholder, $separator, $str );
   485   }
   485 	}
   486   
   486 	
   487   return $items;
   487 	return $items;
   488 }
   488 }
   489 
   489 
   490 class ReplacerCallback {
   490 class ReplacerCallback {
   491   function ReplacerCallback( $from, $to ) {
   491 	function ReplacerCallback( $from, $to ) {
   492     $this->from = $from;
   492 		$this->from = $from;
   493     $this->to = $to;
   493 		$this->to = $to;
   494   }
   494 	}
   495   
   495 	
   496   function go( $matches ) {
   496 	function go( $matches ) {
   497     return str_replace( $this->from, $this->to, $matches[1] );
   497 		return str_replace( $this->from, $this->to, $matches[1] );
   498   }
   498 	}
   499 }
   499 }
   500 
   500 
   501 /**
   501 /**
   502  * Return an associative array of attribute names and values from
   502  * Return an associative array of attribute names and values from
   503  * a partial tag string. Attribute names are forces to lowercase,
   503  * a partial tag string. Attribute names are forces to lowercase,
   505  *
   505  *
   506  * @param string
   506  * @param string
   507  * @return array
   507  * @return array
   508  */
   508  */
   509 function decodeTagAttributes( $text ) {
   509 function decodeTagAttributes( $text ) {
   510   $attribs = array();
   510 	$attribs = array();
   511 
   511 
   512   if( trim( $text ) == '' ) {
   512 	if( trim( $text ) == '' ) {
   513     return $attribs;
   513 		return $attribs;
   514   }
   514 	}
   515 
   515 
   516   $pairs = array();
   516 	$pairs = array();
   517   if( !preg_match_all(
   517 	if( !preg_match_all(
   518     MW_ATTRIBS_REGEX,
   518 		MW_ATTRIBS_REGEX,
   519     $text,
   519 		$text,
   520     $pairs,
   520 		$pairs,
   521     PREG_SET_ORDER ) ) {
   521 		PREG_SET_ORDER ) ) {
   522     return $attribs;
   522 		return $attribs;
   523   }
   523 	}
   524 
   524 
   525   foreach( $pairs as $set ) {
   525 	foreach( $pairs as $set ) {
   526     $attribute = strtolower( $set[1] );
   526 		$attribute = strtolower( $set[1] );
   527     $value = getTagAttributeCallback( $set );
   527 		$value = getTagAttributeCallback( $set );
   528     
   528 		
   529     // Normalize whitespace
   529 		// Normalize whitespace
   530     $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
   530 		$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
   531     $value = trim( $value );
   531 		$value = trim( $value );
   532     
   532 		
   533     // Decode character references
   533 		// Decode character references
   534     $attribs[$attribute] = decodeCharReferences( $value );
   534 		$attribs[$attribute] = decodeCharReferences( $value );
   535   }
   535 	}
   536   return $attribs;
   536 	return $attribs;
   537 }
   537 }
   538 
   538 
   /**
    * Select the attribute value out of a single match set produced by
    * MW_ATTRIBS_REGEX.
    *
    * The value capture slots are probed from the highest index downwards and
    * the first one that is set wins. When the match carries no "= value" part
    * at all, the attribute name itself is used as the value.
    *
    * @param array $set One match set (index 1 holds the attribute name)
    * @return string
    * @access private
    */
   function getTagAttributeCallback( $set ) {
       // Value slots, probed in this order:
       //   6 = illegal unquoted #XXXXXX color, 5 = unquoted value,
       //   4 = single-quoted value,            3 = double-quoted value
       foreach( array( 6, 5, 4, 3 ) as $slot ) {
           if( isset( $set[$slot] ) ) {
               return $set[$slot];
           }
       }

       if( !isset( $set[2] ) ) {
           # In XHTML, attributes must have a value.
           # For the 'reduced' form, hand back the attribute name explicitly.
           return $set[1];
       }

       // An "= value" part matched but none of the value slots is set.
       die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
   }
   568 
   568 
   /**
    * Strips <nowiki> and <gallery> sections and HTML comments out of $text,
    * replacing each one with a unique marker and recording the rendered
    * replacement for that marker in $state.
    *
    * A fresh marker prefix is generated on every call and published through
    * the global $wgRandomKey.
    *
    * @param string $text          Text to process
    * @param array  $state         Strip state, passed by reference. Output for
    *                              nowiki/html sections is stored under
    *                              $state['nowiki'][marker]; everything else
    *                              under $state['general'][marker].
    * @param bool   $stripcomments When false (the default), comments are put
    *                              back into the returned text. When true they
    *                              stay replaced by markers and their text is
    *                              kept in $state['general'].
    * @param array  $dontstrip     Tag names that must not be stripped;
    *                              used to prevent stripping of <gallery> when
    *                              saving (fixes bug 2700)
    * @return string The text with stripped sections replaced by markers
    *
    * @access private
    */
   function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) {
       // $wgRawHtml is consulted by the 'html' case below; it has to be imported
       // here or it would be an undefined local.
       global $wgRandomKey, $wgRawHtml;
       $render = true;

       $wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff));
       $uniq_prefix =& $wgRandomKey;
       $commentState = array();

       $elements = array( 'nowiki', 'gallery' );

       # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
       foreach ( $elements as $k => $v ) {
           if ( in_array( $v, $dontstrip ) ) {
               unset( $elements[$k] );
           }
       }

       $matches = array();
       $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

       foreach( $matches as $marker => $data ) {
           // Third slot (decoded attributes) is not needed here.
           list( $element, $content, , $tag ) = $data;

           // Tag matching is case-insensitive, so every decision below is made
           // on the lower-cased name.
           $tagName = strtolower( $element );

           // Reset per iteration: a tag without a renderer must not inherit the
           // output produced for the previous marker.
           $output = '';

           if( $render ) {
               switch( $tagName ) {
               case '!--':
                   // Comment
                   if( substr( $tag, -3 ) == '-->' ) {
                       $output = $tag;
                   } else {
                       // Unclosed comment in input.
                       // Close it so later stripping can remove it
                       $output = "$tag-->";
                   }
                   break;
               case 'html':
                   // NOTE(review): 'html' is not in $elements above, so this
                   // case is not reached with the current tag list.
                   if( !empty( $wgRawHtml ) ) {
                       $output = $content;
                       break;
                   }
                   // Shouldn't happen otherwise. :)
                   // Deliberate fall-through: treat as nowiki.
               case 'nowiki':
                   $output = wfEscapeHTMLTagsOnly( $content );
                   break;
               default:
                   // No renderer for this tag here (e.g. gallery); the marker
                   // resolves to an empty string.
               }
           } else {
               // Just stripping tags; keep the source
               $output = $tag;
           }

           // Unstrip the output, because unstrip() is no longer recursive so
           // it won't do it itself
           $output = unstrip( $output, $state );

           if( !$stripcomments && $tagName == '!--' ) {
               $commentState[$marker] = $output;
           } elseif ( $tagName == 'html' || $tagName == 'nowiki' ) {
               $state['nowiki'][$marker] = $output;
           } else {
               $state['general'][$marker] = $output;
           }
       }

       # Unstrip comments unless explicitly told otherwise.
       # (The comments are always stripped prior to this point, so as to
       # not invoke any extension tags / parser hooks contained within
       # a comment.)
       if ( !$stripcomments ) {
           // Put them all back and forget them
           $text = strtr( $text, $commentState );
       }

       return $text;
   }
   659 
   659 
   /**
    * Replaces all occurrences of HTML-style comments and the given tags
    * in the text with a unique marker and returns the new text.
    *
    * For every replaced section an entry is stored in $matches, keyed by the
    * marker that now stands in its place:
    *   array( element name as written,
    *          inner content (null for an empty-element tag such as <tag />),
    *          attributes as returned by decodeTagAttributes(),
    *          the complete original source of the section )
    *
    * A tag or comment without a closing counterpart runs to the end of the
    * text.
    *
    * @param array  $elements    Tag names to extract (matched case-insensitively)
    * @param string $text        Source text
    * @param array  $matches     Output, passed by reference; reset on every call
    * @param string $uniq_prefix Prefix placed in front of every marker
    * @return string The text with the extracted sections replaced by markers
    *
    * @access private
    * @static
    */
   function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
       // Marker counter; shared across calls so markers never repeat in a request.
       static $n = 1;
       $stripped = '';
       $matches = array();

       // Tag names are literal text, not regex fragments.
       $quoted = array();
       foreach( $elements as $name ) {
           $quoted[] = preg_quote( $name, '/' );
       }
       // With no tag names an empty alternation would match "<>" as a tag with
       // an empty name. "(?!)" never matches, which leaves only comments while
       // keeping the capture-group numbering the loop below depends on.
       $taglist = count( $quoted ) ? implode( '|', $quoted ) : '(?!)';
       $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\\/?>)|<(!--)/i";

       while ( '' != $text ) {
           $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
           $stripped .= $p[0];
           if( count( $p ) < 5 ) {
               // Nothing left to extract.
               break;
           }
           if( count( $p ) > 5 ) {
               // comment: the three tag groups are empty, group 4 is "!--"
               $element    = $p[4];
               $attributes = '';
               $close      = '';
               $inside     = $p[5];
           } else {
               // tag
               $element    = $p[1];
               $attributes = $p[2];
               $close      = $p[3];
               $inside     = $p[4];
           }

           $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
           $stripped .= $marker;

           if ( $close === '/>' ) {
               // Empty element tag, <tag />
               $content = null;
               $text = $inside;
               $tail = null;
           } else {
               if( $element == '!--' ) {
                   $end = '/(-->)/';
               } else {
                   $end = '/(<\\/' . preg_quote( $element, '/' ) . '\\s*>)/i';
               }
               $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
               $content = $q[0];
               if( count( $q ) < 3 ) {
                   # No end tag -- let it run out to the end of the text.
                   $tail = '';
                   $text = '';
               } else {
                   $tail = $q[1];
                   $text = $q[2];
               }
           }

           $matches[$marker] = array( $element,
               $content,
               decodeTagAttributes( $attributes ),
               "<$element$attributes$close$content$tail" );
       }
       return $stripped;
   }
   739 
   739 
   /**
    * Escape the characters that can open, close or quote inside an HTML tag.
    * Replaces " > and < with their entities (&quot;, &gt;, &lt;); ampersands
    * and single quotes are left untouched.
    *
    * @param $in String: text that might contain HTML tags.
    * @return string Escaped string
    */
   function wfEscapeHTMLTagsOnly( $in ) {
       $entityFor = array(
           '"' => '&quot;',
           '>' => '&gt;',
           '<' => '&lt;',
       );
       return str_replace( array_keys( $entityFor ), array_values( $entityFor ), $in );
   }
   753 
   753 
   /**
    * Puts back the sections recorded under $state['general'] by replacing
    * each marker found in $text with its stored output. If the state has no
    * 'general' bucket the text is returned untouched.
    *
    * always call unstripNoWiki() after this one
    *
    * @param string $text
    * @param array  $state Strip state (passed by reference, not modified here)
    * @return string
    * @private
    */
   function unstrip( $text, &$state ) {
       # TODO: good candidate for FSS
       return isset( $state['general'] )
           ? strtr( $text, $state['general'] )
           : $text;
   }
   770 
   770 
   /**
    * Turn a numeric code point into its UTF-8 string when
    * validateCodepoint() accepts it; any other value yields
    * UTF8_REPLACEMENT (U+FFFD REPLACEMENT CHARACTER).
    *
    * @param int $codepoint
    * @return string
    * @private
    */
   function decodeChar( $codepoint ) {
       return validateCodepoint( $codepoint )
           ? codepointToUtf8( $codepoint )
           : UTF8_REPLACEMENT;
   }
   785 
   785 
   /**
    * Resolve a named entity through the global $wgHtmlEntities table.
    * A known name is returned as the UTF-8 encoding of its character; an
    * unknown name is handed back as the reference text "&name;".
    *
    * @param string $name Entity name without the leading "&" and trailing ";"
    * @return string
    */
   function decodeEntity( $name ) {
       global $wgHtmlEntities;

       if( !isset( $wgHtmlEntities[$name] ) ) {
           // Not a known entity: rebuild the reference as it was written.
           return "&$name;";
       }
       return codepointToUtf8( $wgHtmlEntities[$name] );
   }
   802 
   802 
   /**
    * Tells whether a Unicode code point is a valid character in XML:
    * tab, line feed, carriage return, or a value inside one of the ranges
    * 0x20-0xD7FF, 0xE000-0xFFFD, 0x10000-0x10FFFF.
    *
    * @param int $codepoint
    * @return bool
    */
   function validateCodepoint( $codepoint ) {
       // The only control characters that are allowed.
       if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
           return true;
       }

       // Inclusive ranges; the gaps are the surrogates, 0xFFFE and 0xFFFF.
       $allowedRanges = array(
           array(    0x20,   0xd7ff ),
           array(  0xe000,   0xfffd ),
           array( 0x10000, 0x10ffff ),
       );
       foreach( $allowedRanges as $range ) {
           if( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
               return true;
           }
       }
       return false;
   }
   816   
   816 	
   817 /**
   817 /**
   818  * Return UTF-8 sequence for a given Unicode code point.
   818  * Return UTF-8 sequence for a given Unicode code point.
   819  * May die if fed out of range data.
   819  * May die if fed out of range data.
   820  *
   820  *
   821  * @param $codepoint Integer:
   821  * @param $codepoint Integer:
   /**
    * Decode one matched character reference.
    *
    * Dispatches on the first non-empty capture group:
    *   1 - passed to decodeEntity()
    *   2 - converted with intval() and passed to decodeChar()
    *   3, 4 - converted with hexdec() and passed to decodeChar()
    * If none of them is filled the whole match is returned unchanged
    * (should be an ampersand by itself).
    *
    * @param array $matches Match array: full match at index 0, capture groups after it
    * @return string
    */
   function decodeCharReferencesCallback( $matches ) {
       if( $matches[1] != '' ) {
           return decodeEntity( $matches[1] );
       }
       if( $matches[2] != '' ) {
           return decodeChar( intval( $matches[2] ) );
       }
       // Both remaining groups carry hexadecimal digits.
       foreach( array( 3, 4 ) as $hexGroup ) {
           if( $matches[$hexGroup] != '' ) {
               return decodeChar( hexdec( $matches[$hexGroup] ) );
           }
       }
       # Last case should be an ampersand by itself
       return $matches[0];
   }
   858 
   858