includes/wikiengine/TagSanitizer.php
author Dan Fuhry <dan@enanocms.org>
Fri, 01 Apr 2016 10:26:18 -0400
changeset 1382 78fbedb876f3
parent 1227 bdac73ed481e
permissions -rw-r--r--
Rename constructors named from class name to __construct

<?php

/*
 * Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
 * Copyright (C) 2006-2009 Dan Fuhry
 *
 * This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
 *
 * This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under
 * the GPLv2 or later; see the file GPL included with this package for details.
 */

// Character class for one character of an attribute name: ASCII letters
// and digits only. Interpolated into MW_ATTRIBS_REGEX below.
$attrib = '[a-zA-Z0-9]';
// Character class for one whitespace character: tab, LF, CR or space.
// Interpolated into MW_ATTRIBS_REGEX below.
$space = '[\x09\x0a\x0d\x20]';

// Matches a single character reference, or a stray ampersand.
// Written in extended (/x) mode, so the line breaks inside the pattern are
// insignificant. Capture groups, as read by decodeCharReferencesCallback():
//   1 - name of a named entity        (&name;)
//   2 - decimal code point            (&#123;)
//   3 - hex code point, lowercase "x" (&#x7b;)
//   4 - hex code point, uppercase "X" (&#X7b;)
//   5 - a bare "&" that does not start any of the above
// Note that the hex groups accept any ASCII letter, not just a-f.
define( 'MW_CHAR_REFS_REGEX',
'/&([A-Za-z0-9]+);
 |&\#([0-9]+);
 |&\#x([0-9A-Za-z]+);
 |&\#X([0-9A-Za-z]+);
 |(&)/x' );

// Matches one attribute (name plus optional value) inside a tag's attribute
// text. Extended (/x) mode: whitespace and the "#" comments inside the
// pattern are ignored by PCRE. Capture groups, as read by
// getTagAttributeCallback():
//   1 - attribute name
//   2 - the whole "= value" part (absent for a value-less attribute)
//   3 - value given in double quotes
//   4 - value given in single quotes
//   5 - unquoted value
//   6 - unquoted "#hex" colour value
// The trailing lookahead requires whitespace or end of input after the
// attribute.
define( 'MW_ATTRIBS_REGEX',
	"/(?:^|$space)($attrib+)
		($space*=$space*
		(?:
 		# The attribute value: quoted or alone
			\"([^<\"]*)\"
 		| '([^<']*)'
 		|  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
 		|  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 							# colors are specified like this.
 							# We'll be normalizing it.
		)
 		)?(?=$space|\$)/sx" );

/**
 * Turn the raw attribute portion of an HTML tag ("tag soup") into a
 * well-formed, whitelisted attribute string that is safe to hand on to
 * further wikitext processing.
 *
 * What happens to the input:
 * - attribute names are lowercased and values decoded (decodeTagAttributes)
 * - attributes not whitelisted for $element, and unsafe style values,
 *   are dropped (validateTagAttributes)
 * - when a name occurs more than once only one value survives
 * - value-less attributes get their own name as value
 * - every value is emitted double-quoted and armored (safeEncodeAttribute)
 * - each emitted attribute is preceded by a single space, so the result
 *   can be appended directly after the element name
 *
 * @param string $text    Raw attribute text as found inside the tag
 * @param string $element Element name the attributes belong to
 * @return string Normalized attribute string, or '' if nothing survives
 */
function fixTagAttributes( $text, $element ) {
	// Whitespace-only input carries no attributes at all.
	if( trim( $text ) == '' ) {
		return '';
	}

	$decoded = decodeTagAttributes( $text );
	$allowed = validateTagAttributes( $decoded, $element );

	// Build the result incrementally; the leading space on every pair
	// doubles as both the separator and the required prefix.
	$rendered = '';
	foreach( $allowed as $name => $rawValue ) {
		$safeName  = htmlspecialchars( $name );
		$safeValue = safeEncodeAttribute( $rawValue );
		$rendered .= ' ' . $safeName . '="' . $safeValue . '"';
	}
	return $rendered;
}

/**
 * HTML-encode an attribute value and additionally neutralize character
 * sequences that later wiki parsing stages could pick up (templates,
 * links, quote markup, magic words such as ISBN/RFC/PMID, table pipes and
 * double underscores).
 *
 * @param string $text Decoded attribute value
 * @return string HTML-encoded, wiki-armored text fragment
 */
function safeEncodeAttribute( $text ) {
	# Sequences that could be expanded by later parsing, creating invalid
	# or dangerous output, mapped to harmless character references.
	$armor = array(
		'<'    => '&lt;',   // This should never happen,
		'>'    => '&gt;',   // we've received invalid input
		'"'    => '&quot;', // which should have been escaped.
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	);

	return strtr( encodeAttribute( $text ), $armor );
}

/**
 * Encode an attribute value for HTML output.
 *
 * The value is first un-escaped for the five basic entities and then
 * escaped again, so text that already contains e.g. "&amp;" comes out as
 * "&amp;" rather than "&amp;amp;". Literal tab, CR and LF characters are
 * turned into numeric character references afterwards.
 *
 * @param string $text Attribute value, escaped or not
 * @return string HTML-encoded text fragment
 */
function encodeAttribute( $text ) {
	
	// In Enano 1.0.3, added this cheapo hack to keep ampersands
	// from being double-sanitized. Thanks to markybob from #deluge.
	// Step 1: undo any basic escaping that is already present.
	$encValue = strtr( $text, array(
		'&amp;'  => '&',
		'&quot;' => '"',
		'&lt;'   => '<',
		'&gt;'   => '>',
		'&#039;' => "'"
	) );
	
	// Step 2: htmlspecialchars() the "manual" way. This must operate on
	// the un-escaped result of step 1, not on the raw input; otherwise
	// step 1 has no effect and entities get encoded twice.
	$encValue = strtr( $encValue, array(
		'&' => '&amp;',
		'"' => '&quot;',
		'<' => '&lt;',
		'>' => '&gt;',
		"'" => '&#039;'
	) );
	
	
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$encValue = strtr( $encValue, array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	) );
	
	return $encValue;
}

/**
 * Restore all stripped content in $text using the global strip state
 * $mStripState: first the 'general' markers (unstrip), then the 'nowiki'
 * markers (unstripNoWiki), in that order.
 *
 * @param string $text Text containing strip markers
 * @return string Text with the markers replaced
 */
function unstripForHTML( $text ) {
	global $mStripState;
	return unstripNoWiki( unstrip( $text, $mStripState ), $mStripState );
}

/**
 * Replace the markers recorded in $state['nowiki'] with their stored
 * content. Always call this after unstrip() to preserve the order.
 *
 * @param string $text  Text containing strip markers
 * @param array  $state Strip state; left untouched
 * @return string $text unchanged when no 'nowiki' state exists, otherwise
 *                the text with the markers substituted
 *
 * @private
 */
function unstripNoWiki( $text, &$state ) {
	# TODO: good candidate for FSS
	return isset( $state['nowiki'] )
		? strtr( $text, $state['nowiki'] )
		: $text;
}

/**
 * Filter an attribute map down to what is acceptable for one element.
 *
 * - Attributes that are not on the element's whitelist are dropped.
 * - A 'style' value is run through checkCss(); if that rejects it the
 *   attribute is dropped, otherwise the sanitized value is used.
 * - An 'id' value is passed through escapeId().
 *
 * @param array  $attribs Map of attribute name => value
 * @param string $element Element name used to look up the whitelist
 * @return array Map of the surviving attribute names => values
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element ) {
	$allowed = array_flip( attributeWhitelist( $element ) );
	$clean = array();
	foreach( $attribs as $name => $val ) {
		if( isset( $allowed[$name] ) ) {
			$keep = true;
			if( $name == 'style' ) {
				# Strip javascript "expression" from stylesheets.
				# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
				$val = checkCss( $val );
				// false means the CSS was rejected outright.
				$keep = ( $val !== false );
			}
			if( $keep ) {
				// Keyed by name, so a repeated attribute simply
				// replaces the earlier entry.
				$clean[$name] = ( $name === 'id' ) ? escapeId( $val ) : $val;
			}
		}
	}
	return $clean;
}

/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * The returned string is the input with character references decoded and
 * CSS comments removed. The safety check itself runs on a further
 * normalized copy in which CSS hex escapes are decoded and remaining
 * backslashes are dropped. Forbidden are 'expression', anything matching
 * 'tp://' / 'tps://' (so http, https, ftp ... URLs) and 'url('.
 *
 * If any of the regular expression steps fails, the value is rejected
 * rather than passed through unchecked.
 *
 * @param string $value Raw value of a style attribute
 * @return mixed Sanitized string, or false when the value is rejected
 */
function checkCss( $value ) {
	$stripped = decodeCharReferences( $value );

	// Remove any comments; IE gets token splitting wrong.
	// The "s" flag lets a comment span line breaks, which can reappear
	// here after character references such as &#10; have been decoded.
	$stripped = preg_replace( '!/\\*.*?\\*/!sS', '', $stripped );
	if( $stripped === null ) {
		// PCRE failure: fail closed.
		return false;
	}
	$value = $stripped;

	// ... and continue checks
	// Decode CSS hex escapes (backslash + 1-6 hex digits + optional
	// whitespace). A callback is used because the /e modifier is gone
	// as of PHP 7, where it makes preg_replace() return NULL and would
	// silently disable the check below.
	$stripped = preg_replace_callback(
		'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
		function ( $m ) {
			$codepoint = hexdec( $m[1] );
			// Six hex digits can exceed the Unicode range, which
			// codepointToUtf8() refuses by terminating the script;
			// substitute U+FFFD REPLACEMENT CHARACTER instead.
			if( $codepoint > 0x10ffff ) {
				return "\xef\xbf\xbd";
			}
			return codepointToUtf8( $codepoint );
		},
		$stripped );
	if( $stripped === null ) {
		// PCRE failure: fail closed.
		return false;
	}
	$stripped = str_replace( '\\', '', $stripped );
	if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
			$stripped ) ) {
		# haxx0r
		return false;
	}
	
	return $value;
}

/**
 * Decode every character reference in the text (named entities, decimal
 * and hexadecimal numeric references) by running
 * decodeCharReferencesCallback() over each MW_CHAR_REFS_REGEX match.
 *
 * @param string $text Text possibly containing character references
 * @return string Text with the references replaced
 * @access public
 * @static
 */
function decodeCharReferences( $text ) {
	$decoder = 'decodeCharReferencesCallback';
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $decoder, $text );
}

/**
 * Look up the attribute names that are acceptable on a given element.
 * The full table is built once by setupAttributeWhitelist() and cached
 * in a static variable for subsequent calls.
 *
 * @param string $element Element name
 * @return array List of allowed attribute names; empty for an element
 *               that has no entry in the table
 */
function attributeWhitelist( $element ) {
	static $table;
	if( !isset( $table ) ) {
		$table = setupAttributeWhitelist();
	}
	if( isset( $table[$element] ) ) {
		return $table[$element];
	}
	return array();
}

/**
 * Build the table of allowed attributes per HTML element.
 *
 * The table is assembled from a few shared attribute groups ($common,
 * $block, $tablealign, $tablecell) and then offered to plugins through
 * the 'html_attribute_whitelist' hook: every code string returned by
 * $plugins->setHook() is eval()'d inside this function.
 *
 * NOTE(review): because the hook code runs in this function's local
 * scope, the local variable names here (notably $whitelist) are visible
 * to it and should be treated as part of the plugin interface -- do not
 * rename them without checking existing plugins. The eval() itself runs
 * whatever the plugin system supplies.
 *
 * @return array Map of element name => list of allowed attribute names
 */
function setupAttributeWhitelist() {
	// Only $plugins is referenced directly below; the other globals are
	// presumably imported so that eval()'d hook code can use them.
	global $db, $session, $paths, $template, $plugins;
	// Attributes accepted on (almost) every element.
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	// Common attributes plus 'align'.
	$block = array_merge( $common, array( 'align' ) );
	// Alignment attributes shared by the table row/column/cell elements.
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	// Attributes specific to table cells (td/th).
	$tablecell = array( 'abbr',
											'axis',
											'headers',
											'scope',
											'rowspan',
											'colspan',
											'nowrap', # deprecated
											'width',  # deprecated
											'height', # deprecated
											'bgcolor' # deprecated
											);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	# Element names that appear only as comments have no entry and
	# therefore get no attributes at all from attributeWhitelist().
	$whitelist = array (
		# 7.5.4
		'div'        => $block,
		'center'     => $common, # deprecated
		'span'       => $block, # ??

		# 7.5.5
		'h1'         => $block,
		'h2'         => $block,
		'h3'         => $block,
		'h4'         => $block,
		'h5'         => $block,
		'h6'         => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em'         => $common,
		'strong'     => $common,
		'cite'       => $common,
		# dfn
		'code'       => $common,
		# samp
		# kbd
		'var'        => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		# q

		# 9.2.3
		'sub'        => $common,
		'sup'        => $common,

		# 9.3.1
		'p'          => $block,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul'         => array_merge( $common, array( 'type' ) ),
		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
		'li'         => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $common,
		'dd'         => $common,
		'dt'         => $common,

		# 11.2.1
		'table'      => array_merge( $common,
							array( 'summary', 'width', 'border', 'frame',
									'rules', 'cellspacing', 'cellpadding',
									'align', 'bgcolor',
							) ),

		# 11.2.2
		'caption'    => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead'      => array_merge( $common, $tablealign ),
		'tfoot'      => array_merge( $common, $tablealign ),
		'tbody'      => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td'         => array_merge( $common, $tablecell, $tablealign ),
		'th'         => array_merge( $common, $tablecell, $tablealign ),
		
		# 12.2
		# added by dan
		'a'          => array_merge( $common, array( 'href', 'name' ) ),
		
		# 13.2
		# added by dan
		'img'        => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),

		# 15.2.1
		'tt'         => $common,
		'b'          => $common,
		'i'          => $common,
		'big'        => $common,
		'small'      => $common,
		'strike'     => $common,
		's'          => $common,
		'u'          => $common,

		# 15.2.2
		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby'       => $common,
		# rbc
		# rtc
		'rb'         => $common,
		'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp'         => $common,
		
		# For compatibility with the XHTML parser.
		'nowiki'     => array(),
		'noinclude'  => array(),
		'nodisplay'  => array(),
		'lang'       => array('code'),
		
		# XHTML stuff
		'acronym'    => $common
		);
	
	// custom tags can be added by plugins
	// Each hook entry is a string of PHP code; it is executed here so it
	// can add to or change $whitelist before the table is returned.
	$code = $plugins->setHook('html_attribute_whitelist');
	foreach ( $code as $cmd )
	{
		eval($cmd);
	}
	
	return $whitelist;
}

/**
 * Escape a value so that it can be used in an id attribute. This does
 * not validate the value, however (see the first link).
 *
 * Spaces become underscores, character references are decoded, the
 * result is URL-encoded, and finally "%3A" is turned back into ":" and
 * every remaining "%" into ".".
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id Raw id value
 * @return string Escaped id
 */
function escapeId( $id ) {
	$underscored = str_replace( ' ', '_', $id );
	$encoded = urlencode( decodeCharReferences( $underscored ) );

	// Order matters: restore colons before the generic "%" rewrite,
	// otherwise "%3A" would end up as ".3A".
	$encoded = str_replace( '%3A', ':', $encoded );
	return str_replace( '%', '.', $encoded );
}

/**
 * explode() variant that is more or less "markup-safe": occurrences of
 * the separator inside <...> do not split the text.
 *
 * Works by temporarily swapping the separator inside every <...> run for
 * a NUL byte, splitting, and swapping it back in each piece. NUL bytes
 * already present in the input are removed first.
 *
 * @param string $separator
 * @param string $text
 * @return array The pieces of $text
 */
function wfExplodeMarkup( $separator, $text ) {
	$mask = "\x00";

	// Just in case the input already contains the mask character.
	$text = str_replace( $mask, '', $text );

	// Hide separators that sit inside <...>.
	$hider = new ReplacerCallback( $separator, $mask );
	$masked = preg_replace_callback( '/(<.*?>)/', array( $hider, 'go' ), $text );

	// Split on the separators that are left, then unmask each piece.
	$pieces = explode( $separator, $masked );
	foreach( $pieces as $idx => $piece ) {
		$pieces[$idx] = str_replace( $mask, $separator, $piece );
	}

	return $pieces;
}

/**
 * Small callback object for preg_replace_callback(): replaces one fixed
 * string with another inside the first capture group of each match.
 */
class ReplacerCallback {
	/** @var string Text to search for */
	public $from;

	/** @var string Text to substitute for $from */
	public $to;

	/**
	 * @param string $from Text to search for
	 * @param string $to   Replacement text
	 */
	public function __construct( $from, $to ) {
		// Properties are declared above; creating them dynamically
		// is deprecated as of PHP 8.2.
		$this->from = $from;
		$this->to = $to;
	}
	
	/**
	 * Callback entry point.
	 *
	 * @param array $matches Match array; only capture group 1 is used
	 * @return string Group 1 with every $from replaced by $to
	 */
	public function go( $matches ) {
		return str_replace( $this->from, $this->to, $matches[1] );
	}
}

/**
 * Parse a partial tag string into an associative array of attribute
 * names and values. Attribute names are forced to lowercase; in values,
 * runs of whitespace are collapsed to one space, the ends are trimmed
 * and character references are decoded.
 *
 * When a name occurs more than once, the last occurrence wins.
 *
 * @param string $text Attribute portion of a tag
 * @return array Map of attribute name => decoded value; empty when the
 *               text is blank or nothing matches
 */
function decodeTagAttributes( $text ) {
	$result = array();

	if( trim( $text ) == '' ) {
		return $result;
	}

	$found = array();
	$matched = preg_match_all( MW_ATTRIBS_REGEX, $text, $found, PREG_SET_ORDER );
	if( !$matched ) {
		return $result;
	}

	foreach( $found as $match ) {
		$name = strtolower( $match[1] );

		// Pick the value, then normalize its whitespace.
		$raw = getTagAttributeCallback( $match );
		$raw = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Character references are decoded last.
		$result[$name] = decodeCharReferences( $raw );
	}
	return $result;
}

/**
 * Pick the attribute value out of one MW_ATTRIBS_REGEX match set.
 *
 * The value alternatives are probed from the highest capture group down:
 * 6 (unquoted #colour), 5 (unquoted), 4 (single-quoted), 3 (double-
 * quoted). If none is present and there is no "= value" part at all
 * (group 2), the attribute name itself is returned, as XHTML requires
 * every attribute to have a value.
 *
 * @param array $set One match set from MW_ATTRIBS_REGEX
 * @return string The attribute value
 * @access private
 */
function getTagAttributeCallback( $set ) {
	// 6: illegal #XXXXXX color with no quotes, 5: no quotes,
	// 4: single-quoted, 3: double-quoted.
	foreach( array( 6, 5, 4, 3 ) as $group ) {
		if( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if( isset( $set[2] ) ) {
		// A "= value" part without any recognized value form.
		die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
		return;
	}

	# In XHTML, attributes must have a value.
	# For 'reduced' form, return explicitly the attribute name here.
	return $set[1];
}

/**
 * Strips nowiki and gallery sections (and HTML comments) out of the text,
 * replacing each with a unique marker, and records the content each
 * marker stands for so that unstrip() / unstripNoWiki() can restore it.
 * If $state is already a valid strip state, entries are added to it.
 *
 * Content of nowiki sections is stored HTML-escaped under
 * $state['nowiki']; everything else goes to $state['general']. Elements
 * without a rendering rule here (currently 'gallery') are stored as an
 * empty string.
 *
 * A fresh random marker prefix is generated on every call and published
 * through the global $wgRandomKey.
 *
 * @param string $text Source text
 * @param array $state Strip state, filled by reference
 * @param bool $stripcomments when set, HTML comments <!-- like this -->
 *  will be stripped in addition to other tags. This is important
 *  for section editing, where these comments cause confusion when
 *  counting the sections in the wikisource
 * 
 * @param array $dontstrip contains tags which should not be stripped;
 *  used to prevent stripping of <gallery> when saving (fixes bug 2700)
 *
 * @return string The text with stripped sections replaced by markers
 *
 * @access private
 */
function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) {
	global $wgRandomKey;
	// Rendering is always on in this port; the non-rendering branch
	// below is kept for parity with the code this was derived from.
	$render = true;

	$wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff));
	$uniq_prefix =& $wgRandomKey;
	$commentState = array();
	
	$elements = array( 'nowiki', 'gallery' );
	
	# Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
	foreach ( $elements AS $k => $v ) {
		if ( in_array ( $v , $dontstrip ) ) {
			unset ( $elements[$k] );
		}
	}
	
	$matches = array();
	$text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

	foreach( $matches as $marker => $data ) {
		list( $element, $content, $params, $tag ) = $data;

		// Start every marker from a defined, empty output. Without this
		// an element that has no case below (e.g. gallery) would either
		// use an undefined variable or inherit the previous marker's
		// output.
		$output = '';

		if( $render ) {
			$tagName = strtolower( $element );
			switch( $tagName ) {
			case '!--':
				// Comment
				if( substr( $tag, -3 ) == '-->' ) {
					$output = $tag;
				} else {
					// Unclosed comment in input.
					// Close it so later stripping can remove it
					$output = "$tag-->";
				}
				break;
			case 'html':
				// NOTE(review): 'html' is not in $elements, so this case
				// is not reached at present. $wgRawHtml is also never
				// brought into scope in this function, so even if it were
				// reached the test is false and control falls through to
				// the nowiki escaping below. Left as found.
				if( $wgRawHtml ) {
					$output = $content;
					break;
				}
				// Shouldn't happen otherwise. :)
			case 'nowiki':
				$output = wfEscapeHTMLTagsOnly( $content );
				break;
			default:
				// No rendering rule for this element: keep the empty
				// default assigned above.
			}
		} else {
			// Just stripping tags; keep the source
			$output = $tag;
		}

		// Unstrip the output, because unstrip() is no longer recursive so 
		// it won't do it itself
		$output = unstrip( $output, $state );

		if( !$stripcomments && $element == '!--' ) {
			$commentState[$marker] = $output;
		} elseif ( $element == 'html' || $element == 'nowiki' ) {
			$state['nowiki'][$marker] = $output;
		} else {
			$state['general'][$marker] = $output;
		}
	}

	# Unstrip comments unless explicitly told otherwise.
	# (The comments are always stripped prior to this point, so as to
	# not invoke any extension tags / parser hooks contained within
	# a comment.)
	if ( !$stripcomments ) {
		// Put them all back and forget them
		$text = strtr( $text, $commentState );
	}

	return $text;
}

/**
 * Replaces all occurrences of HTML-style comments and the given tags
 * in the text with a unique marker and returns the new text. The output
 * parameter $matches will be an associative array filled with data in
 * the form:
 *   'UNIQ-xxxxx' => array(
 *     'element',
 *     'tag content',
 *     array( 'param' => 'x' ),
 *     '<element param="x">tag content</element>' ) )
 *
 * Each marker has the form "<prefix>-<element>-<8 hex digits>-QINU". The
 * hex number comes from a static counter, so it keeps increasing across
 * calls within one request.
 *
 * An element or comment without a closing tag runs to the end of the text.
 *
 * @param $elements list of element names. Comments are always extracted.
 * @param $text Source text string.
 * @param $matches Output: marker => data array as described above.
 *                 Any previous content is discarded.
 * @param $uniq_prefix Prefix used for every generated marker.
 * @return string The text with every extracted section replaced by its marker
 *
 * @access private
 * @static
 */
function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
	// Marker sequence number; static so it is shared by all calls.
	static $n = 1;
	$stripped = '';
	$matches = array();

	// Opening pattern: either one of the requested tags (groups 1-3:
	// name, attribute text, ">" or "/>") or the start of a comment (group 4).
	$taglist = implode( '|', $elements );
	$start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

	while ( '' != $text ) {
		// Split at the first opening tag/comment only (limit 2), keeping
		// the captured groups in the result.
		$p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
		$stripped .= $p[0];
		if( count( $p ) < 5 ) {
			// Nothing (more) to extract.
			break;
		}
		// The number of pieces tells the two alternatives apart: a comment
		// match yields one more piece than a tag match.
		if( count( $p ) > 5 ) {
			// comment
			$element    = $p[4];
			$attributes = '';
			$close      = '';
			$inside     = $p[5];
		} else {
			// tag
			$element    = $p[1];
			$attributes = $p[2];
			$close      = $p[3];
			$inside     = $p[4];
		}

		$marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
		$stripped .= $marker;

		if ( $close === '/>' ) {
			// Empty element tag, <tag />
			$content = null;
			$text = $inside;
			$tail = null;
		} else {
			// Look for the matching terminator in what follows.
			if( $element == '!--' ) {
				$end = '/(-->)/';
			} else {
				$end = "/(<\\/$element\\s*>)/i";
			}
			$q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
			$content = $q[0];
			if( count( $q ) < 3 ) {
				# No end tag -- let it run out to the end of the text.
				$tail = '';
				$text = '';
			} else {
				// $q[1] is the terminator itself, $q[2] the text still
				// to be scanned.
				$tail = $q[1];
				$text = $q[2];
			}
		}
		
		// Record element name, inner content, decoded attributes and the
		// complete original source of the section.
		$matches[$marker] = array( $element,
			$content,
			decodeTagAttributes( $attributes ),
			"<$element$attributes$close$content$tail" );
	}
	return $stripped;
}

/**
 * Escape HTML tags by replacing the characters " > and < with their
 * entities (&quot;, &gt;, &lt;). Ampersands are left alone.
 *
 * @param $in String: text that might contain HTML tags.
 * @return string Escaped string
 */
function wfEscapeHTMLTagsOnly( $in ) {
	$entities = array(
		'"' => '&quot;',
		'>' => '&gt;',
		'<' => '&lt;',
	);
	return strtr( $in, $entities );
}

/**
 * Replace the markers recorded in $state['general'] with their stored
 * content.
 *
 * always call unstripNoWiki() after this one
 *
 * @param string $text  Text containing strip markers
 * @param array  $state Strip state; left untouched
 * @return string $text unchanged when no 'general' state exists,
 *                otherwise the text with the markers substituted
 * @private
 */
function unstrip( $text, &$state ) {
	# TODO: good candidate for FSS
	return isset( $state['general'] )
		? strtr( $text, $state['general'] )
		: $text;
}

/**
 * Convert a numeric code point to UTF-8 if validateCodepoint() accepts
 * it; otherwise yield UTF8_REPLACEMENT (U+FFFD REPLACEMENT CHARACTER).
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint ) {
	return validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}

/**
 * Resolve a named entity through the global $wgHtmlEntities table.
 * A known name yields the UTF-8 encoding of its code point; an unknown
 * name is handed back as pseudo-entity source (eg &foo;).
 *
 * @param string $name Entity name without the surrounding "&" and ";"
 * @return string
 */
function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if( !isset( $wgHtmlEntities[$name] ) ) {
		// Not a known entity: reproduce the source text.
		return "&$name;";
	}
	return codepointToUtf8( $wgHtmlEntities[$name] );
}

/**
 * Tell whether a Unicode code point is a valid character in XML:
 * tab, LF, CR, or anything in U+0020-U+D7FF, U+E000-U+FFFD or
 * U+10000-U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	// The three permitted control characters.
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}

	// Permitted ranges, as inclusive (low, high) pairs.
	$ranges = array(
		array(    0x20,   0xd7ff ),
		array(  0xe000,   0xfffd ),
		array( 0x10000, 0x10ffff ),
	);
	foreach( $ranges as $range ) {
		if( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}
	return false;
}
	
/**
 * Return the UTF-8 byte sequence for a given Unicode code point.
 * For a code point of 0x110000 or above a message is printed and the
 * script terminates.
 *
 * @param $codepoint Integer:
 * @return String
 * @public
 */
function codepointToUtf8( $codepoint ) {
	// Single byte: plain ASCII.
	if( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}

	// Work out the lead byte and how many continuation bytes follow.
	if( $codepoint < 0x800 ) {
		$lead = ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0;
		$trailing = 1;
	} elseif( $codepoint < 0x10000 ) {
		$lead = ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0;
		$trailing = 2;
	} elseif( $codepoint < 0x110000 ) {
		$lead = ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0;
		$trailing = 3;
	} else {
		echo "Asked for code outside of range ($codepoint)\n";
		die( -1 );
	}

	// Each continuation byte carries six bits, most significant first.
	$bytes = chr( $lead );
	for( $shift = 6 * ( $trailing - 1 ); $shift >= 0; $shift -= 6 ) {
		$bytes .= chr( ( ( $codepoint >> $shift ) & 0x3f ) | 0x80 );
	}
	return $bytes;
}

/**
 * preg_replace_callback() handler for MW_CHAR_REFS_REGEX: decodes one
 * matched character reference.
 *
 * @param array $matches Match array; group 1 is a named entity, group 2
 *                       a decimal number, groups 3 and 4 a hex number
 * @return string The decoded text, or the matched text itself when no
 *                group is filled (a lone ampersand)
 */
function decodeCharReferencesCallback( $matches ) {
	// Named entity.
	if( $matches[1] != '' ) {
		return decodeEntity( $matches[1] );
	}

	// Decimal numeric reference.
	if( $matches[2] != '' ) {
		return  decodeChar( intval( $matches[2] ) );
	}

	// Hexadecimal numeric reference, written with "x" (3) or "X" (4).
	foreach( array( 3, 4 ) as $hexGroup ) {
		if( $matches[$hexGroup] != '' ) {
			return  decodeChar( hexdec( $matches[$hexGroup] ) );
		}
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}