includes/wikiengine/TagSanitizer.php
author Dan Fuhry <dan@enanocms.org>
Wed, 29 Dec 2010 13:25:32 -0500
changeset 1332 12286b3ee214
parent 1227 bdac73ed481e
child 1382 78fbedb876f3
permissions -rw-r--r--
Added some more hooks to the page editing pipeline. It should now be possible to add controls to the page editor, send the data from them out to the server, and process them on the server side.

<?php

/*
 * Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
 * Copyright (C) 2006-2009 Dan Fuhry
 *
 * This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
 *
 * This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under
 * the GPLv2 or later; see the file GPL included with this package for details.
 */

// Character class for one character of an attribute name; interpolated into
// MW_ATTRIBS_REGEX below.
$attrib = '[a-zA-Z0-9]';
// Character class for one whitespace character (tab, LF, CR or space);
// interpolated into MW_ATTRIBS_REGEX below.
$space = '[\x09\x0a\x0d\x20]';

// Matches one character reference or a bare ampersand (extended syntax, so
// the line breaks inside the pattern are ignored). Capture groups:
//   1 = entity name          (&name;)
//   2 = decimal code point   (&#NNN;)
//   3 = hex code point       (&#xHHH;)
//   4 = hex code point       (&#XHHH;)
//   5 = a lone ampersand
// Consumed by decodeCharReferences() / decodeCharReferencesCallback().
define( 'MW_CHAR_REFS_REGEX',
'/&([A-Za-z0-9]+);
 |&\#([0-9]+);
 |&\#x([0-9A-Za-z]+);
 |&\#X([0-9A-Za-z]+);
 |(&)/x' );

// Matches one attribute inside a tag's attribute string. Capture groups:
//   1 = attribute name
//   2 = the whole optional "= value" part
//   3 = double-quoted value
//   4 = single-quoted value
//   5 = unquoted value
//   6 = unquoted "#hex" colour value
// getTagAttributeCallback() picks the value out of these groups.
define( 'MW_ATTRIBS_REGEX',
	"/(?:^|$space)($attrib+)
		($space*=$space*
		(?:
 		# The attribute value: quoted or alone
			\"([^<\"]*)\"
 		| '([^<']*)'
 		|  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
 		|  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 							# colors are specified like this.
 							# We'll be normalizing it.
		)
 		)?(?=$space|\$)/sx" );

/**
 * Normalize the raw attribute portion of an HTML tag into well-formed,
 * whitelisted XML attributes.
 *
 * The text is decoded into name/value pairs, filtered against the whitelist
 * for $element, and re-serialized so that:
 *
 * - attribute names are lowercase
 * - attributes not allowed on $element are dropped
 * - broken or invalid entities become plain text
 * - every value is double-quoted and armored against later wikitext parsing
 * - valueless attributes get their own name as value
 * - a repeated attribute appears only once
 * - unsafe style attributes are dropped
 *
 * @param string $text    Raw attribute text as found inside the tag
 * @param string $element Name of the element the attributes belong to
 * @return string Empty string if nothing survives, otherwise the attributes
 *                joined by single spaces and preceded by one space
 */
function fixTagAttributes( $text, $element ) {
	// Nothing but whitespace: no attributes at all.
	if ( trim( $text ) == '' ) {
		return '';
	}

	$decoded = decodeTagAttributes( $text );
	$allowed = validateTagAttributes( $decoded, $element );

	// Each surviving attribute contributes ` name="value"`, which yields the
	// leading space and the single-space separators in one go.
	$serialized = '';
	foreach ( $allowed as $name => $rawValue ) {
		$serialized .= ' ' . htmlspecialchars( $name ) . '="' . safeEncodeAttribute( $rawValue ) . '"';
	}

	return $serialized;
}

/**
 * HTML-encode an attribute value and additionally neutralize character
 * sequences that later wikitext passes could interpret (templates, links,
 * quote markup, magic link keywords, table pipes, double underscores).
 *
 * @param string $text Decoded attribute value
 * @return string HTML-encoded, wikitext-armored value
 */
function safeEncodeAttribute( $text ) {
	# Sequences that later parsing stages might expand, mapped to harmless
	# character references. The first three should already have been escaped
	# by encodeAttribute(); they are repeated here as a safety net.
	$armor = array(
		'<'    => '&lt;',
		'>'    => '&gt;',
		'"'    => '&quot;',
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	);

	return strtr( encodeAttribute( $text ), $armor );
}

/**
 * Encode an attribute value for HTML output.
 *
 * The value is first run through a reverse htmlspecialchars() pass so that
 * entities which are already present (&amp;, &quot;, &lt;, &gt;, &#039;) are
 * not escaped a second time, then escaped, and finally tab/CR/LF are turned
 * into numeric character references.
 *
 * @param string $text Attribute value, raw or already HTML-escaped
 * @return string HTML-encoded text fragment
 */
function encodeAttribute( $text ) {
	
	// In Enano 1.0.3, added this cheapo hack to keep ampersands
	// from being double-sanitized. Thanks to markybob from #deluge.
	// Undo any escaping that is already there...
	$encValue = strtr( $text, array(
		'&amp;'  => '&',
		'&quot;' => '"',
		'&lt;'   => '<',
		'&gt;'   => '>',
		'&#039;' => "'"
	) );
	
	// ...then htmlspecialchars() the "manual" way. This must operate on the
	// decoded value from the step above; feeding $text in again would throw
	// the decode away and turn "&amp;" into "&amp;amp;".
	$encValue = strtr( $encValue, array(
		'&' => '&amp;',
		'"' => '&quot;',
		'<' => '&lt;',
		'>' => '&gt;',
		"'" => '&#039;'
	) );
	
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$encValue = strtr( $encValue, array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	) );
	
	return $encValue;
}

/**
 * Restore stripped content in $text using the global strip state:
 * first the 'general' markers via unstrip(), then the 'nowiki' markers via
 * unstripNoWiki().
 *
 * @param string $text
 * @return string
 */
function unstripForHTML( $text ) {
	global $mStripState;

	// Order matters: unstripNoWiki() must run on the result of unstrip().
	return unstripNoWiki( unstrip( $text, $mStripState ), $mStripState );
}

/**
 * Put back the content recorded under $state['nowiki'].
 *
 * Always call this after unstrip() to preserve the order.
 *
 * @param string $text  Text containing strip markers
 * @param array  $state Strip state; taken by reference but not modified here
 * @return string $text with every 'nowiki' marker replaced, or $text
 *                unchanged when the state has no 'nowiki' entry
 * @private
 */
function unstripNoWiki( $text, &$state ) {
	# TODO: good candidate for FSS
	return isset( $state['nowiki'] )
		? strtr( $text, $state['nowiki'] )
		: $text;
}

/**
 * Filter a decoded attribute array down to what is legal for an element.
 *
 * - Attributes not on the whitelist for $element are dropped
 * - 'style' values are passed through checkCss() and dropped if rejected
 * - 'id' values are passed through escapeId()
 *
 * @param array  $attribs Attribute name => value
 * @param string $element Element name used to look up the whitelist
 * @return array Attribute name => sanitized value
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element ) {
	// Flip so that allowed names become keys and can be tested with isset().
	$allowed = array_flip( attributeWhitelist( $element ) );
	$clean = array();

	foreach ( $attribs as $name => $val ) {
		if ( !isset( $allowed[$name] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $name == 'style' && ( $val = checkCss( $val ) ) === false ) {
			# checkCss() rejected the value outright
			continue;
		}

		if ( $name === 'id' ) {
			$val = escapeId( $val );
		}

		// A later duplicate of the same attribute overrides the earlier one,
		// so the output has at most one attribute of each name.
		$clean[$name] = $val;
	}

	return $clean;
}

/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * Currently URL references, 'expression', 'tps' are forbidden.
 *
 * The returned string is the input with character references decoded and
 * CSS comments removed. The forbidden-token check runs on a further
 * normalized copy in which CSS hex escapes are decoded and backslashes are
 * dropped, so that escaped spellings of the forbidden tokens are caught too.
 *
 * @param string $value Raw value of a style attribute
 * @return mixed Sanitized CSS string, or false if the value must be discarded
 */
function checkCss( $value ) {
	$stripped = decodeCharReferences( $value );

	// Remove any comments; IE gets token splitting wrong.
	// The "s" modifier lets a comment span line breaks.
	$stripped = preg_replace( '!/\\*.*?\\*/!sS', '', $stripped );
	$value = $stripped;

	// ... and continue checks.
	// Decode CSS hex escapes (\41, \000041 ...). This used to rely on the
	// "e" modifier of preg_replace(), which no longer exists in PHP 7+: there
	// the call returned NULL and the check below never saw the input.
	$stripped = preg_replace_callback(
		'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
		'checkCssDecodeEscape',
		$stripped );
	$stripped = str_replace( '\\', '', $stripped );
	if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
			$stripped ) ) {
		# haxx0r
		return false;
	}
	
	return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): turn one CSS hex escape into
 * the UTF-8 character it stands for.
 *
 * Code points above U+10FFFF are dropped instead of being passed on, because
 * codepointToUtf8() terminates the script for out-of-range values.
 *
 * @param array $matches Match data; $matches[1] holds the hex digits
 * @return string
 * @access private
 */
function checkCssDecodeEscape( $matches ) {
	$codepoint = hexdec( $matches[1] );
	if ( $codepoint > 0x10ffff ) {
		return '';
	}
	return codepointToUtf8( $codepoint );
}

/**
 * Replace every character reference in $text (named entity, decimal or
 * hexadecimal numeric reference) with its decoded form, as produced by
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @access public
 * @static
 */
function decodeCharReferences( $text ) {
	$pattern = MW_CHAR_REFS_REGEX;
	$decoded = preg_replace_callback( $pattern, 'decodeCharReferencesCallback', $text );
	return $decoded;
}

/**
 * Look up the attributes allowed on a given element.
 *
 * The full whitelist is built once by setupAttributeWhitelist() and kept in
 * a static variable for subsequent calls.
 *
 * @param string $element Element name
 * @return array List of allowed attribute names; empty for unknown elements
 */
function attributeWhitelist( $element ) {
	static $cache;

	if ( !isset( $cache ) ) {
		$cache = setupAttributeWhitelist();
	}

	if ( !isset( $cache[$element] ) ) {
		return array();
	}

	return $cache[$element];
}

/**
 * Build the map of element name => list of attribute names allowed on it.
 *
 * The table is assembled from a few shared attribute groups ($common,
 * $block, $tablealign, $tablecell). Afterwards the code strings returned by
 * the 'html_attribute_whitelist' plugin hook are eval()'d inside this
 * function, so hook code runs in this scope and can read or modify
 * $whitelist and the other local variables. Because of that, the local
 * variable names here are effectively part of the plugin interface.
 *
 * NOTE(review): eval() executes whatever the hook returns; this presumably
 * relies on installed plugins being trusted code -- confirm.
 *
 * @return array Element name => array of attribute names
 */
function setupAttributeWhitelist() {
	// Only $plugins is used directly below; the other globals are presumably
	// imported so that eval()'d hook code can reach them -- TODO confirm.
	global $db, $session, $paths, $template, $plugins;
	// Attributes accepted on nearly every element.
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	// $common plus 'align'.
	$block = array_merge( $common, array( 'align' ) );
	// Alignment attributes shared by table rows, cells and column groups.
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	// Attributes specific to table cells (td / th).
	$tablecell = array( 'abbr',
											'axis',
											'headers',
											'scope',
											'rowspan',
											'colspan',
											'nowrap', # deprecated
											'width',  # deprecated
											'height', # deprecated
											'bgcolor' # deprecated
											);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	# Element names that appear only as comments below are not whitelisted.
	$whitelist = array (
		# 7.5.4
		'div'        => $block,
		'center'     => $common, # deprecated
		'span'       => $block, # ??

		# 7.5.5
		'h1'         => $block,
		'h2'         => $block,
		'h3'         => $block,
		'h4'         => $block,
		'h5'         => $block,
		'h6'         => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em'         => $common,
		'strong'     => $common,
		'cite'       => $common,
		# dfn
		'code'       => $common,
		# samp
		# kbd
		'var'        => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		# q

		# 9.2.3
		'sub'        => $common,
		'sup'        => $common,

		# 9.3.1
		'p'          => $block,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul'         => array_merge( $common, array( 'type' ) ),
		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
		'li'         => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $common,
		'dd'         => $common,
		'dt'         => $common,

		# 11.2.1
		'table'      => array_merge( $common,
							array( 'summary', 'width', 'border', 'frame',
									'rules', 'cellspacing', 'cellpadding',
									'align', 'bgcolor',
							) ),

		# 11.2.2
		'caption'    => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead'      => array_merge( $common, $tablealign ),
		'tfoot'      => array_merge( $common, $tablealign ),
		'tbody'      => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td'         => array_merge( $common, $tablecell, $tablealign ),
		'th'         => array_merge( $common, $tablecell, $tablealign ),
		
		# 12.2
		# added by dan
		'a'          => array_merge( $common, array( 'href', 'name' ) ),
		
		# 13.2
		# added by dan
		'img'        => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),

		# 15.2.1
		'tt'         => $common,
		'b'          => $common,
		'i'          => $common,
		'big'        => $common,
		'small'      => $common,
		'strike'     => $common,
		's'          => $common,
		'u'          => $common,

		# 15.2.2
		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby'       => $common,
		# rbc
		# rtc
		'rb'         => $common,
		'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp'         => $common,
		
		# For compatibility with the XHTML parser.
		'nowiki'     => array(),
		'noinclude'  => array(),
		'nodisplay'  => array(),
		'lang'       => array('code'),
		
		# XHTML stuff
		'acronym'    => $common
		);
	
	// custom tags can be added by plugins
	// Each hook entry is a string of PHP code that is executed right here,
	// with access to $whitelist and the attribute groups defined above.
	$code = $plugins->setHook('html_attribute_whitelist');
	foreach ( $code as $cmd )
	{
		eval($cmd);
	}
	
	return $whitelist;
}

/**
 * Escape a value so that it can be used in an id attribute. This does not
 * validate the value (see first link).
 *
 * Spaces become underscores, character references are decoded, the result
 * is URL-encoded, and finally "%3A" is turned back into ":" and every
 * remaining "%" into ".".
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id ) {
	$underscored = strtr( $id, ' ', '_' );
	$encoded = urlencode( decodeCharReferences( $underscored ) );

	// Order matters: restore colons before the generic "%" => "." rewrite.
	return str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );
}

/**
 * More or less "markup-safe" explode().
 *
 * Occurrences of the separator inside <...> are temporarily swapped for a
 * NUL byte so that explode() does not split there, and swapped back in each
 * resulting piece. NUL bytes already present in $text are removed first.
 *
 * @param string $separator
 * @param string $text
 * @return array
 */
function wfExplodeMarkup( $separator, $text ) {
	$nul = "\x00";

	// Make sure the stand-in character cannot already occur in the input.
	$text = str_replace( $nul, '', $text );

	// Hide separators that sit inside tags.
	$masker = new ReplacerCallback( $separator, $nul );
	$masked = preg_replace_callback( '/(<.*?>)/', array( $masker, 'go' ), $text );

	// Split, then put the hidden separators back piece by piece.
	$pieces = explode( $separator, $masked );
	foreach ( $pieces as &$piece ) {
		$piece = str_replace( $nul, $separator, $piece );
	}
	unset( $piece );

	return $pieces;
}

/**
 * Small callback object for preg_replace_callback(): replaces one fixed
 * string with another inside the first capture group of each match.
 * Used by wfExplodeMarkup().
 */
class ReplacerCallback {
	/** @var string Text to search for */
	public $from;
	/** @var string Text to substitute */
	public $to;

	/**
	 * A method named after the class is no longer treated as a constructor
	 * in PHP 8, so the constructor has to be __construct() for $from/$to to
	 * be set by `new ReplacerCallback( $from, $to )`.
	 *
	 * @param string $from Text to search for
	 * @param string $to   Text to substitute
	 */
	function __construct( $from, $to ) {
		$this->from = $from;
		$this->to = $to;
	}
	
	/**
	 * @param array $matches Match data; $matches[1] is the text to process
	 * @return string $matches[1] with every $from replaced by $to
	 */
	function go( $matches ) {
		return str_replace( $this->from, $this->to, $matches[1] );
	}
}

/**
 * Parse the attribute part of a tag into an associative array.
 *
 * Attribute names are forced to lowercase; in each value, runs of
 * whitespace are collapsed to a single space, the ends are trimmed and
 * character references are decoded. When a name occurs more than once the
 * last occurrence wins.
 *
 * @param string $text Partial tag string holding the attributes
 * @return array Attribute name => decoded value; empty if nothing matched
 */
function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$matchSets = array();
	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $matchSets, PREG_SET_ORDER );
	if ( !$found ) {
		return array();
	}

	$result = array();
	foreach ( $matchSets as $matchSet ) {
		$name = strtolower( $matchSet[1] );
		$raw = getTagAttributeCallback( $matchSet );

		// Collapse whitespace runs and strip the ends before decoding.
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		$result[$name] = decodeCharReferences( $collapsed );
	}

	return $result;
}

/**
 * Extract the attribute value from one MW_ATTRIBS_REGEX match set.
 *
 * The value groups are probed from the highest index down: 6 (unquoted
 * "#xxxxxx" colour), 5 (unquoted), 4 (single-quoted), 3 (double-quoted).
 * If none is set and there is no "=value" part at all (group 2), the
 * attribute name itself is returned, since XHTML attributes must have a
 * value. Any other combination is reported through die_friendly().
 *
 * @param array $set One match set from preg_match_all( ..., PREG_SET_ORDER )
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
	foreach ( array( 6, 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# 'Reduced' form such as <hr noshade>: use the attribute name.
		return $set[1];
	}

	die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
}

/**
 * Strip <nowiki>, <gallery> and HTML comments out of the text, replacing
 * each with a unique marker, and record what unstrip()/unstripNoWiki() need
 * to put back later.
 *
 * - <nowiki> content is HTML-escaped and stored under $state['nowiki'].
 * - Comments are put straight back into the text unless $stripcomments is
 *   set, in which case they are stored under $state['general'].
 * - Any other extracted element (currently only <gallery>) has no renderer
 *   here and is stored under $state['general'] as an empty string.
 *
 * A fresh marker prefix is generated on every call and published in the
 * global $wgRandomKey. If $state already holds entries, new ones are added.
 *
 * @param string $text
 * @param array  $state         Strip state, filled by reference
 * @param bool   $stripcomments when set, HTML comments <!-- like this -->
 *  will be stripped in addition to other tags. This is important
 *  for section editing, where these comments cause confusion when
 *  counting the sections in the wikisource
 * @param array  $dontstrip     tags which should not be stripped;
 *  used to prevent stripping of <gallery> when saving (fixes bug 2700)
 * @return string Text with the stripped parts replaced by markers
 *
 * @access private
 */
function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) {
	global $wgRandomKey;
	$render = true;

	$wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff));
	$uniq_prefix =& $wgRandomKey;
	$commentState = array();
	
	$elements = array( 'nowiki', 'gallery' );
	
	# Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
	foreach ( $elements AS $k => $v ) {
		if ( !in_array ( $v , $dontstrip ) ) continue;
		unset ( $elements[$k] );
	}
	
	$matches = array();
	$text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

	foreach( $matches as $marker => $data ) {
		list( $element, $content, $params, $tag ) = $data;
		if( $render ) {
			$tagName = strtolower( $element );
			switch( $tagName ) {
			case '!--':
				// Comment
				if( substr( $tag, -3 ) == '-->' ) {
					$output = $tag;
				} else {
					// Unclosed comment in input.
					// Close it so later stripping can remove it
					$output = "$tag-->";
				}
				break;
			case 'html':
				// Raw HTML pass-through was never active here (the
				// $wgRawHtml switch it depended on was not in scope), so
				// <html> is escaped exactly like <nowiki>.
			case 'nowiki':
				$output = wfEscapeHTMLTagsOnly( $content );
				break;
			default:
				// No renderer for this element. Assign explicitly so that
				// $output is neither undefined nor left over from the
				// previous loop iteration.
				$output = '';
			}
		} else {
			// Just stripping tags; keep the source
			$output = $tag;
		}

		// Unstrip the output, because unstrip() is no longer recursive so 
		// it won't do it itself
		$output = unstrip( $output, $state );

		if( !$stripcomments && $element == '!--' ) {
			$commentState[$marker] = $output;
		} elseif ( $element == 'html' || $element == 'nowiki' ) {
			$state['nowiki'][$marker] = $output;
		} else {
			$state['general'][$marker] = $output;
		}
	}

	# Unstrip comments unless explicitly told otherwise.
	# (The comments are always stripped prior to this point, so as to
	# not invoke any extension tags / parser hooks contained within
	# a comment.)
	if ( !$stripcomments ) {
		// Put them all back and forget them
		$text = strtr( $text, $commentState );
	}

	return $text;
}

/**
 * Replaces all occurrences of HTML-style comments and the given tags
 * in the text with a unique marker and returns the new text. The output
 * parameter $matches will be an associative array, keyed by marker, filled
 * with data in the form:
 *   'UNIQ-xxxxx' => array(
 *     'element',
 *     'tag content',
 *     array( 'param' => 'x' ),
 *     '<element param="x">tag content</element>' ) )
 *
 * Markers have the form "$uniq_prefix-$element-NNNNNNNN-QINU", where
 * NNNNNNNN is a hexadecimal counter that keeps increasing across calls.
 *
 * @param $elements list of element names. Comments are always extracted.
 * @param $text Source text string.
 * @param $matches Output array, reset and filled by reference (see above).
 * @param $uniq_prefix String prepended to every marker.
 * @return string The text with each extracted span replaced by its marker.
 *
 * @access private
 * @static
 */
function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
	// Marker counter; static, so it is shared by all calls.
	static $n = 1;
	$stripped = '';
	$matches = array();

	// Element names are inserted into the pattern as-is (no preg_quote()).
	$taglist = implode( '|', $elements );
	// Alternative 1: opening tag of a listed element (groups 1-3).
	// Alternative 2: start of a comment (group 4).
	$start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

	while ( '' != $text ) {
		// Split at the first opening tag / comment start only (limit 2).
		$p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
		// Everything before the match is copied through unchanged.
		$stripped .= $p[0];
		if( count( $p ) < 5 ) {
			// Nothing (more) to extract.
			break;
		}
		if( count( $p ) > 5 ) {
			// comment
			$element    = $p[4];
			$attributes = '';
			$close      = '';
			$inside     = $p[5];
		} else {
			// tag
			$element    = $p[1];
			$attributes = $p[2];
			$close      = $p[3];
			$inside     = $p[4];
		}

		$marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
		$stripped .= $marker;

		if ( $close === '/>' ) {
			// Empty element tag, <tag />
			$content = null;
			$text = $inside;
			$tail = null;
		} else {
			// Look for the matching terminator in what follows the opener.
			if( $element == '!--' ) {
				$end = '/(-->)/';
			} else {
				$end = "/(<\\/$element\\s*>)/i";
			}
			$q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
			$content = $q[0];
			if( count( $q ) < 3 ) {
				# No end tag -- let it run out to the end of the text.
				$tail = '';
				$text = '';
			} else {
				// $q[1] is the terminator itself, $q[2] the text after it,
				// which becomes the input of the next loop iteration.
				$tail = $q[1];
				$text = $q[2];
			}
		}
		
		// Record element name, inner content, decoded attributes and the
		// reassembled original source of the extracted span.
		$matches[$marker] = array( $element,
			$content,
			decodeTagAttributes( $attributes ),
			"<$element$attributes$close$content$tail" );
	}
	return $stripped;
}

/**
 * Escape the characters that form HTML tags: " > and < are replaced with
 * &quot; &gt; and &lt; respectively. Ampersands are left untouched.
 *
 * @param $in String: text that might contain HTML tags.
 * @return string Escaped string
 */
function wfEscapeHTMLTagsOnly( $in ) {
	$entities = array(
		'"' => '&quot;',
		'>' => '&gt;',
		'<' => '&lt;',
	);

	return strtr( $in, $entities );
}

/**
 * Put back the content recorded under $state['general'].
 *
 * Always call unstripNoWiki() after this one.
 *
 * @param string $text  Text containing strip markers
 * @param array  $state Strip state; taken by reference but not modified here
 * @return string $text with every 'general' marker replaced, or $text
 *                unchanged when the state has no 'general' entry
 * @private
 */
function unstrip( $text, &$state ) {
	# TODO: good candidate for FSS
	return isset( $state['general'] )
		? strtr( $text, $state['general'] )
		: $text;
}

/**
 * Convert a numeric character reference to UTF-8.
 *
 * @param int $codepoint
 * @return string The UTF-8 encoding of $codepoint if validateCodepoint()
 *                accepts it, otherwise UTF8_REPLACEMENT
 * @private
 */
function decodeChar( $codepoint ) {
	return validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}

/**
 * Decode a named entity.
 *
 * Names found in the global $wgHtmlEntities table are converted to the
 * UTF-8 encoding of the code point stored there. Unknown names are handed
 * back as pseudo-entity source (eg &foo;).
 *
 * @param string $name Entity name without the leading "&" and trailing ";"
 * @return string
 */
function decodeEntity( $name ) {
	global $wgHtmlEntities;

	if ( !isset( $wgHtmlEntities[$name] ) ) {
		return '&' . $name . ';';
	}

	return codepointToUtf8( $wgHtmlEntities[$name] );
}

/**
 * Tell whether a Unicode codepoint is accepted as a character.
 *
 * Accepted are tab, line feed, carriage return, and the ranges
 * 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	// The three individually permitted control characters.
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}

	// From space up to just below 0xD800.
	if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}

	// From 0xE000 up to 0xFFFD (0xFFFE and 0xFFFF are excluded).
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}

	// Everything from 0x10000 up to the top of the Unicode range.
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
	
/**
 * Return the UTF-8 byte sequence for a Unicode code point.
 *
 * Code points of 0x110000 and above are not handled: a message is printed
 * and the script is terminated with die( -1 ).
 *
 * @param $codepoint Integer:
 * @return String
 * @public
 */
function codepointToUtf8( $codepoint ) {
	// Below 0x80 the code point is its own single byte.
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}

	// Pick the lead byte and the number of continuation bytes by range.
	if ( $codepoint < 0x800 ) {
		$lead = ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0;
		$continuations = 1;
	} elseif ( $codepoint < 0x10000 ) {
		$lead = ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0;
		$continuations = 2;
	} elseif ( $codepoint < 0x110000 ) {
		$lead = ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0;
		$continuations = 3;
	} else {
		echo "Asked for code outside of range ($codepoint)\n";
		die( -1 );
	}

	// Each continuation byte carries six bits, most significant group first.
	$utf8 = chr( $lead );
	for ( $shift = 6 * ( $continuations - 1 ); $shift >= 0; $shift -= 6 ) {
		$utf8 .= chr( ( ( $codepoint >> $shift ) & 0x3f ) | 0x80 );
	}

	return $utf8;
}

/**
 * preg_replace_callback() handler for MW_CHAR_REFS_REGEX.
 *
 * Dispatches on whichever capture group is non-empty: group 1 is a named
 * entity, group 2 a decimal reference, groups 3 and 4 hexadecimal
 * references. If none is, the matched text is returned unchanged.
 *
 * @param array $matches Match data from MW_CHAR_REFS_REGEX
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return decodeEntity( $matches[1] );
	}

	if ( $matches[2] != '' ) {
		return decodeChar( intval( $matches[2] ) );
	}

	// &#x...; and &#X...; are handled identically.
	foreach ( array( 3, 4 ) as $hexGroup ) {
		if ( $matches[$hexGroup] != '' ) {
			return decodeChar( hexdec( $matches[$hexGroup] ) );
		}
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}