includes/wikiengine/TagSanitizer.php
author Dan
Thu, 20 Aug 2009 20:01:55 -0400
changeset 1081 745200a9cc2a
parent 1027 98c052fc3337
child 1127 4b858862c35c
permissions -rw-r--r--
Fixed some upgrade bugs; added support for choosing one's own date/time formats; rebrand as 1.1.7

<?php

/*
 * Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
 * Copyright (C) 2006-2009 Dan Fuhry
 *
 * This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
 *
 * This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under
 * the GPLv2 or later; see the file GPL included with this package for details.
 */

// Character classes shared by the attribute regex below. Both are interpolated
// into the double-quoted MW_ATTRIBS_REGEX string, so they have to be assigned
// before that define() call is executed.
$attrib = '[a-zA-Z0-9]';
$space = '[\x09\x0a\x0d\x20]';

// Matches a single character reference or a bare ampersand (extended "x"
// syntax, so the line breaks inside the pattern are ignored by PCRE).
// Capture groups, as consumed by decodeCharReferencesCallback():
//   1 = named entity            (&name;)
//   2 = decimal codepoint       (&#123;)
//   3 = hex codepoint after "x" (&#x7b;)
//   4 = hex codepoint after "X" (&#X7B;)
//   5 = a lone "&"
// NOTE(review): groups 3 and 4 accept [0-9A-Za-z], i.e. letters beyond a-f as
// well; the result is handed to hexdec() -- confirm this laxness is intended.
define( 'MW_CHAR_REFS_REGEX',
'/&([A-Za-z0-9]+);
 |&\#([0-9]+);
 |&\#x([0-9A-Za-z]+);
 |&\#X([0-9A-Za-z]+);
 |(&)/x' );

// Matches one attribute (name plus optional value) inside a tag fragment.
// Capture groups, as consumed by getTagAttributeCallback():
//   1 = attribute name
//   2 = the whole "= value" part (absent for value-less attributes)
//   3 = double-quoted value
//   4 = single-quoted value
//   5 = unquoted value
//   6 = unquoted "#hex" colour value
// The "#" comments inside the literal are part of the pattern (x modifier),
// not PHP comments.
define( 'MW_ATTRIBS_REGEX',
  "/(?:^|$space)($attrib+)
    ($space*=$space*
    (?:
     # The attribute value: quoted or alone
      ".'"'."([^<".'"'."]*)".'"'."
     | '([^<']*)'
     |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
     |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
               # colors are specified like this.
               # We'll be normalizing it.
    )
     )?(?=$space|\$)/sx" );

/**
 * Turn the raw attribute portion of an HTML tag into a normalized,
 * well-formed attribute string that is safe to hand to later wikitext passes.
 *
 * Processing steps:
 * - the fragment is parsed by decodeTagAttributes(), which lowercases the
 *   names and lets a later duplicate overwrite an earlier one
 * - validateTagAttributes() drops everything that is not whitelisted for
 *   $element, as well as style values rejected by checkCss()
 * - every surviving value is escaped with safeEncodeAttribute() and wrapped
 *   in double quotes
 * - attributes written without a value receive their own name as value
 *
 * @param string $text    Attribute text taken from inside a tag
 * @param string $element Element name used to look up the whitelist
 * @return string Empty string when nothing survives; otherwise the attributes
 *                separated by single spaces and prefixed with one space
 */
function fixTagAttributes( $text, $element ) {
  if( trim( $text ) == '' ) {
    return '';
  }

  $decoded = decodeTagAttributes( $text );
  $allowed = validateTagAttributes( $decoded, $element );

  // Each pair is emitted with its own leading space, which yields both the
  // separators and the single prefix space in one go.
  $serialized = '';
  foreach( $allowed as $name => $rawValue ) {
    $safeName  = htmlspecialchars( $name );
    $safeValue = safeEncodeAttribute( $rawValue );
    $serialized .= ' ' . $safeName . '="' . $safeValue . '"';
  }

  return $serialized;
}

/**
 * HTML-encode an attribute value and additionally neutralize character
 * sequences that later wiki parsing stages could still act upon.
 *
 * @param string $text Raw attribute value
 * @return string HTML-encoded, wiki-armored text fragment
 */
function safeEncodeAttribute( $text ) {
  // Replacement table applied on top of encodeAttribute(). The first three
  // entries should never match (encodeAttribute() already escapes them); the
  // remainder keeps templates, links, magic links and behaviour switches from
  // being expanded inside the attribute later on.
  $wikiArmor = array(
    '<'    => '&lt;',
    '>'    => '&gt;',
    '"'    => '&quot;',
    '{'    => '&#123;',
    '['    => '&#91;',
    "''"   => '&#39;&#39;',
    'ISBN' => '&#73;SBN',
    'RFC'  => '&#82;FC',
    'PMID' => '&#80;MID',
    '|'    => '&#124;',
    '__'   => '&#95;_',
  );

  return strtr( encodeAttribute( $text ), $wikiArmor );
}

/**
 * Encode an attribute value for HTML output.
 *
 * The value is first un-escaped and then escaped again, so that entities
 * which are already present (e.g. "&amp;") are not escaped a second time
 * ("&amp;amp;"). Tabs and line breaks are emitted as numeric references
 * because plain whitespace would be collapsed when the attribute is decoded
 * again.
 *
 * @param string $text Raw (possibly partially escaped) attribute value
 * @return string HTML-encoded text fragment
 */
function encodeAttribute( $text ) {
  
  // In Enano 1.0.3, added this cheapo hack to keep ampersands
  // from being double-sanitized. Thanks to markybob from #deluge.
  $decoded = strtr( $text, array(
    '&amp;'  => '&',
    '&quot;' => '"',
    '&lt;'   => '<',
    '&gt;'   => '>',
    '&#039;' => "'"
  ) );
  
  // htmlspecialchars() the "manual" way. This has to operate on the decoded
  // text from above; encoding the untouched input again would throw the
  // decoding step away and double-escape existing entities.
  $encValue = strtr( $decoded, array(
    '&' => '&amp;',
    '"' => '&quot;',
    '<' => '&lt;',
    '>' => '&gt;',
    "'" => '&#039;'
  ) );
  
  // Whitespace is normalized during attribute decoding,
  // so if we've been passed non-spaces we must encode them
  // ahead of time or they won't be preserved.
  $encValue = strtr( $encValue, array(
    "\n" => '&#10;',
    "\r" => '&#13;',
    "\t" => '&#9;',
  ) );
  
  return $encValue;
}

/**
 * Put all stripped sections back into $text, using the global strip state
 * ($mStripState). General sections are restored first and nowiki sections
 * second, which is the order unstripNoWiki() asks for.
 *
 * @param string $text
 * @return string
 */
function unstripForHTML( $text ) {
  global $mStripState;
  return unstripNoWiki( unstrip( $text, $mStripState ), $mStripState );
}

/**
 * Replace the nowiki markers in $text with the content recorded for them
 * in $state['nowiki']. When the state has no such entry the text is
 * returned untouched.
 *
 * Always call this after unstrip() to preserve the order.
 *
 * @param string $text
 * @param array  $state Strip state (taken by reference, not modified here)
 * @return string
 * @private
 */
function unstripNoWiki( $text, &$state ) {
  # TODO: good candidate for FSS
  return isset( $state['nowiki'] )
    ? strtr( $text, $state['nowiki'] )
    : $text;
}

/**
 * Filter a name => value attribute array for the given element type.
 *
 * - Attributes missing from the element's whitelist are dropped
 * - A style attribute is passed through checkCss() and dropped when that
 *   function rejects it
 * - An id attribute is normalized with escapeId()
 *
 * @param array $attribs  Attribute names mapped to their values
 * @param string $element Element name used for the whitelist lookup
 * @return array The attributes that survived, in their original order
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element ) {
  // Flipped so that membership can be tested with isset()
  $allowed = array_flip( attributeWhitelist( $element ) );

  $clean = array();
  foreach( $attribs as $name => $val ) {
    if( isset( $allowed[$name] ) ) {
      $keep = true;

      # Strip javascript "expression" from stylesheets.
      # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
      if( $name == 'style' ) {
        $val = checkCss( $val );
        $keep = ( $val !== false );
      }

      if( $keep ) {
        if ( $name === 'id' ) {
          $val = escapeId( $val );
        }
        // A name that shows up twice simply overwrites the earlier entry,
        // so the output carries each attribute at most once.
        $clean[$name] = $val;
      }
    }
  }

  return $clean;
}

/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * Currently URL references, 'expression', 'tps' are forbidden.
 *
 * The returned string is the input with character references decoded and
 * CSS comments removed. The forbidden-token check itself runs on a copy in
 * which CSS hex escapes have been resolved and backslashes removed, so that
 * escaped spellings of the forbidden tokens are caught as well.
 *
 * If one of the regex passes fails (PCRE error), the value is rejected
 * rather than waved through unchecked.
 *
 * @param string $value Raw style attribute value
 * @return mixed Sanitized string, or false when the value must be discarded
 */
function checkCss( $value ) {
  $stripped = decodeCharReferences( $value );

  // Remove any comments; IE gets token splitting wrong
  $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped );
  if( !is_string( $stripped ) ) {
    # Could not be processed, so it cannot be vouched for
    return false;
  }
  $value = $stripped;

  // ... and continue checks
  // Resolve CSS hex escapes (backslash + 1-6 hex digits + optional blank).
  // This used to rely on the /e modifier of preg_replace(), which no longer
  // exists in PHP 7+: there the call returns NULL and the check below would
  // silently pass everything. A named callback works on old and new PHP.
  $stripped = preg_replace_callback(
    '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
    'checkCssEscapeCallback',
    $stripped );
  if( !is_string( $stripped ) ) {
    return false;
  }

  $stripped = str_replace( '\\', '', $stripped );
  if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
      $stripped ) ) {
    # haxx0r
    return false;
  }
  
  return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): converts one CSS hex
 * escape (captured hex digits in $matches[1]) to its UTF-8 character.
 *
 * @param array $matches
 * @return string
 * @access private
 */
function checkCssEscapeCallback( $matches ) {
  return codepointToUtf8( hexdec( $matches[1] ) );
}

/**
 * Decode the character references (named entities as well as decimal and
 * hexadecimal numeric references) found in $text.
 *
 * Every match of MW_CHAR_REFS_REGEX is handed to
 * decodeCharReferencesCallback(), which decides on the replacement.
 *
 * @param string $text
 * @return string
 * @access public
 * @static
 */
function decodeCharReferences( $text ) {
  $handler = 'decodeCharReferencesCallback';
  return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}

/**
 * Look up the attributes that are acceptable on a given element.
 *
 * The complete table is built by setupAttributeWhitelist() on first use and
 * then kept in a static variable for the rest of the request.
 *
 * @param string $element Element name
 * @return array List of attribute names; empty for unknown elements
 */
function attributeWhitelist( $element ) {
  static $table;

  if( !isset( $table ) ) {
    $table = setupAttributeWhitelist();
  }

  if( isset( $table[$element] ) ) {
    return $table[$element];
  }
  return array();
}

/**
 * Build the table that maps an element name to the list of attribute names
 * allowed on that element.
 *
 * The built-in table follows the HTML 4.01 element sections noted inline.
 * Afterwards every code string registered for the 'html_attribute_whitelist'
 * plugin hook is eval()'d in this function's scope, so hook code can read and
 * modify the local variables ($whitelist, $common, $block, $tablealign,
 * $tablecell). Those local names are therefore an implicit interface and
 * must not be renamed.
 *
 * @return array Element name => array of allowed attribute names
 */
function setupAttributeWhitelist() {
  // Only $plugins is used directly below.
  // NOTE(review): the other globals are presumably imported so that eval()'d
  // hook code can reach them -- confirm before removing any.
  global $db, $session, $paths, $template, $plugins;
  // Reusable attribute groups
  $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
  $block = array_merge( $common, array( 'align' ) );
  $tablealign = array( 'align', 'char', 'charoff', 'valign' );
  $tablecell = array( 'abbr',
                      'axis',
                      'headers',
                      'scope',
                      'rowspan',
                      'colspan',
                      'nowrap', # deprecated
                      'width',  # deprecated
                      'height', # deprecated
                      'bgcolor' # deprecated
                      );

  # Numbers refer to sections in HTML 4.01 standard describing the element.
  # See: http://www.w3.org/TR/html4/
  $whitelist = array (
    # 7.5.4
    'div'        => $block,
    'center'     => $common, # deprecated
    'span'       => $block, # ??

    # 7.5.5
    'h1'         => $block,
    'h2'         => $block,
    'h3'         => $block,
    'h4'         => $block,
    'h5'         => $block,
    'h6'         => $block,

    # 7.5.6
    # address

    # 8.2.4
    # bdo

    # 9.2.1
    'em'         => $common,
    'strong'     => $common,
    'cite'       => $common,
    # dfn
    'code'       => $common,
    # samp
    # kbd
    'var'        => $common,
    # abbr
    # acronym

    # 9.2.2
    'blockquote' => array_merge( $common, array( 'cite' ) ),
    # q

    # 9.2.3
    'sub'        => $common,
    'sup'        => $common,

    # 9.3.1
    'p'          => $block,

    # 9.3.2
    'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

    # 9.3.4
    'pre'        => array_merge( $common, array( 'width' ) ),

    # 9.4
    'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
    'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

    # 10.2
    'ul'         => array_merge( $common, array( 'type' ) ),
    'ol'         => array_merge( $common, array( 'type', 'start' ) ),
    'li'         => array_merge( $common, array( 'type', 'value' ) ),

    # 10.3
    'dl'         => $common,
    'dd'         => $common,
    'dt'         => $common,

    # 11.2.1
    'table'      => array_merge( $common,
              array( 'summary', 'width', 'border', 'frame',
                  'rules', 'cellspacing', 'cellpadding',
                  'align', 'bgcolor',
              ) ),

    # 11.2.2
    'caption'    => array_merge( $common, array( 'align' ) ),

    # 11.2.3
    'thead'      => array_merge( $common, $tablealign ),
    'tfoot'      => array_merge( $common, $tablealign ),
    'tbody'      => array_merge( $common, $tablealign ),

    # 11.2.4
    'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
    'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

    # 11.2.5
    'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

    # 11.2.6
    'td'         => array_merge( $common, $tablecell, $tablealign ),
    'th'         => array_merge( $common, $tablecell, $tablealign ),
    
    # 12.2
    # added by dan
    'a'          => array_merge( $common, array( 'href', 'name' ) ),
    
    # 13.2
    # added by dan
    'img'        => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),

    # 15.2.1
    'tt'         => $common,
    'b'          => $common,
    'i'          => $common,
    'big'        => $common,
    'small'      => $common,
    'strike'     => $common,
    's'          => $common,
    'u'          => $common,

    # 15.2.2
    'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
    # basefont

    # 15.3
    'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

    # XHTML Ruby annotation text module, simple ruby only.
    # http://www.w3c.org/TR/ruby/
    'ruby'       => $common,
    # rbc
    # rtc
    'rb'         => $common,
    'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
    'rp'         => $common,
    
    # For compatibility with the XHTML parser.
    'nowiki'     => array(),
    'noinclude'  => array(),
    'nodisplay'  => array(),
    'lang'       => array('code'),
    
    # XHTML stuff
    'acronym'    => $common
    );
  
  // custom tags can be added by plugins
  // WARNING(review): the hook payload is executed with eval(). It must only
  // ever contain code registered by trusted plugins, never user input.
  $code = $plugins->setHook('html_attribute_whitelist');
  foreach ( $code as $cmd )
  {
    eval($cmd);
  }
  
  return $whitelist;
}

/**
 * Escape a value so that it can be used in an id attribute. The value is
 * only escaped, not validated (see first link).
 *
 * Spaces become underscores, character references are decoded, the result
 * is URL-encoded, and finally "%3A" is turned back into ":" while every
 * remaining "%" becomes ".".
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id ) {
  $underscored = strtr( $id, ' ', '_' );
  $encoded = urlencode( decodeCharReferences( $underscored ) );

  // Order matters: the encoded colon has to be restored before the
  // remaining percent signs are rewritten.
  $encoded = str_replace( '%3A', ':', $encoded );
  return str_replace( '%', '.', $encoded );
}

/**
 * More or less "markup-safe" explode(): splits $text at $separator, but
 * leaves separators that occur inside <...> alone.
 *
 * Separators inside tags are temporarily swapped for a NUL byte (any NUL
 * already present in the input is removed first), the text is split, and the
 * NUL bytes are then turned back into the separator in every piece.
 *
 * @param string $separator
 * @param string $text
 * @return array
 */
function wfExplodeMarkup( $separator, $text ) {
  $nul = "\x00";

  // Just in case the input already carries the placeholder byte
  $text = str_replace( $nul, '', $text );

  // Hide separators that sit inside <...>
  $masker = new ReplacerCallback( $separator, $nul );
  $masked = preg_replace_callback( '/(<.*?>)/', array( $masker, 'go' ), $text );

  // Split, then bring the hidden separators back
  $pieces = explode( $separator, $masked );
  foreach( $pieces as $idx => $piece ) {
    $pieces[$idx] = str_replace( $nul, $separator, $piece );
  }

  return $pieces;
}

/**
 * Small callback object for preg_replace_callback(): replaces every
 * occurrence of one string with another inside the first capture group.
 */
class ReplacerCallback {
  /** @var string Text to search for */
  var $from;
  /** @var string Text to put in its place */
  var $to;

  /**
   * A method named after the class is no longer treated as the constructor
   * as of PHP 8, so the real initialisation lives here.
   *
   * @param string $from Text to search for
   * @param string $to   Replacement text
   */
  function __construct( $from, $to ) {
    $this->from = $from;
    $this->to = $to;
  }

  /**
   * Legacy PHP 4 style constructor, kept so that very old runtimes and any
   * explicit callers keep working. Must stay below __construct().
   *
   * @param string $from
   * @param string $to
   */
  function ReplacerCallback( $from, $to ) {
    $this->__construct( $from, $to );
  }
  
  /**
   * @param array $matches Match array; only the first capture group is used
   * @return string That group with $from replaced by $to
   */
  function go( $matches ) {
    return str_replace( $this->from, $this->to, $matches[1] );
  }
}

/**
 * Parse a partial tag string into an associative array of attribute names
 * and values.
 *
 * Attribute names are forced to lowercase, runs of whitespace inside a value
 * are collapsed to a single space, the value is trimmed, and character
 * references in it are decoded. When a name occurs more than once the last
 * occurrence wins.
 *
 * @param string $text Attribute portion of a tag
 * @return array Attribute name => decoded value; empty if nothing matched
 */
function decodeTagAttributes( $text ) {
  if( trim( $text ) == '' ) {
    return array();
  }

  $matchSets = array();
  $found = preg_match_all( MW_ATTRIBS_REGEX, $text, $matchSets, PREG_SET_ORDER );
  if( !$found ) {
    return array();
  }

  $result = array();
  foreach( $matchSets as $match ) {
    $name = strtolower( $match[1] );

    // Normalize whitespace, then decode character references
    $raw = getTagAttributeCallback( $match );
    $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

    $result[$name] = decodeCharReferences( $collapsed );
  }

  return $result;
}

/**
 * Pick the attribute value out of one MW_ATTRIBS_REGEX match set.
 *
 * The value groups are probed from the highest index down:
 * 6 (unquoted #XXXXXX colour), 5 (unquoted), 4 (single-quoted),
 * 3 (double-quoted). If there is no value part at all (group 2 missing) the
 * attribute name itself is returned, since XHTML attributes need a value.
 *
 * @param array $set One match set (PREG_SET_ORDER)
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
  foreach( array( 6, 5, 4, 3 ) as $group ) {
    if( isset( $set[$group] ) ) {
      return $set[$group];
    }
  }

  if( !isset( $set[2] ) ) {
    # 'Reduced' form: the attribute name doubles as its value.
    return $set[1];
  }

  // A value part without any value group should be impossible.
  die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
}

/**
 * Strips <nowiki> and <gallery> sections as well as HTML comments out of
 * the text, replacing each one with a unique marker.
 * Returns the text, and fills an array with data needed in unstrip()
 * If the $state is already a valid strip state, it adds to the state
 *
 * Rendered output is stored per marker: html/nowiki content goes to
 * $state['nowiki'], everything else to $state['general']. Elements that have
 * no renderer here (currently 'gallery') are stored as an empty string.
 *
 * A fresh random marker prefix is generated on every call and published in
 * the global $wgRandomKey.
 *
 * @param string $text Source text
 * @param array $state Strip state, extended in place
 * @param bool $stripcomments when set, HTML comments <!-- like this -->
 *  will be stripped in addition to other tags. This is important
 *  for section editing, where these comments cause confusion when
 *  counting the sections in the wikisource
 * 
 * @param array dontstrip contains tags which should not be stripped;
 *  used to prevent stipping of <gallery> when saving (fixes bug 2700)
 *
 * @return string Text with the stripped sections replaced by markers
 * @access private
 */
function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) {
  global $wgRandomKey;
  $render = true;

  $wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff));
  $uniq_prefix =& $wgRandomKey;
  $commentState = array();
  
  $elements = array( 'nowiki', 'gallery' );
  
  # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
  foreach ( $elements AS $k => $v ) {
    if ( !in_array ( $v , $dontstrip ) ) continue;
    unset ( $elements[$k] );
  }
  
  $matches = array();
  $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

  foreach( $matches as $marker => $data ) {
    list( $element, $content, $params, $tag ) = $data;

    // Start every tag from a clean slate. Without this, an element that has
    // no renderer below would either hit an undefined variable or silently
    // inherit the output of the previously processed tag.
    $output = '';

    if( $render ) {
      $tagName = strtolower( $element );
      switch( $tagName ) {
      case '!--':
        // Comment
        if( substr( $tag, -3 ) == '-->' ) {
          $output = $tag;
        } else {
          // Unclosed comment in input.
          // Close it so later stripping can remove it
          $output = "$tag-->";
        }
        break;
      case 'html':
        // NOTE(review): $wgRawHtml is never brought into this scope, so raw
        // HTML pass-through is effectively disabled; empty() keeps that
        // behaviour without reading an undefined variable.
        if( !empty( $wgRawHtml ) ) {
          $output = $content;
          break;
        }
        // Shouldn't happen otherwise. :)
      case 'nowiki':
        $output = wfEscapeHTMLTagsOnly( $content );
        break;
      default:
        // No renderer for this element: $output stays empty.
      }
    } else {
      // Just stripping tags; keep the source
      $output = $tag;
    }

    // Unstrip the output, because unstrip() is no longer recursive so 
    // it won't do it itself
    $output = unstrip( $output, $state );

    if( !$stripcomments && $element == '!--' ) {
      $commentState[$marker] = $output;
    } elseif ( $element == 'html' || $element == 'nowiki' ) {
      $state['nowiki'][$marker] = $output;
    } else {
      $state['general'][$marker] = $output;
    }
  }

  # Unstrip comments unless explicitly told otherwise.
  # (The comments are always stripped prior to this point, so as to
  # not invoke any extension tags / parser hooks contained within
  # a comment.)
  if ( !$stripcomments ) {
    // Put them all back and forget them
    $text = strtr( $text, $commentState );
  }

  return $text;
}

/**
 * Replaces all occurrences of HTML-style comments and the given tags
 * in the text with a random marker and returns the new text. The output
 * parameter $matches will be an associative array filled with data in
 * the form:
 *   'UNIQ-xxxxx' => array(
 *     'element',
 *     'tag content',
 *     array( 'param' => 'x' ),
 *     '<element param="x">tag content</element>' ) )
 *
 * Markers have the form "<prefix>-<element>-<8 hex digits>-QINU". The
 * counter behind the hex digits is static, so it keeps increasing across
 * calls.
 *
 * @param $elements list of element names. Comments are always extracted.
 * @param $text Source text string.
 * @param $matches Output array, reset and filled as described above.
 * @param $uniq_prefix
 * @return string The text with every extracted section replaced by its marker
 *
 * @access private
 * @static
 */
function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
  // Running marker number, shared by all calls
  static $n = 1;
  $stripped = '';
  $matches = array();

  // Groups 1-3 describe an opening tag (name, attributes, "/>" or ">"),
  // group 4 the start of a comment.
  // NOTE(review): element names go into the pattern unquoted; the callers
  // visible in this file only pass plain names.
  $taglist = implode( '|', $elements );
  $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

  while ( '' != $text ) {
    // Split once at the next opening tag / comment start. The number of
    // pieces tells which alternative matched (see the count checks below).
    $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
    $stripped .= $p[0];
    if( count( $p ) < 5 ) {
      // Nothing left to extract
      break;
    }
    if( count( $p ) > 5 ) {
      // comment
      $element    = $p[4];
      $attributes = '';
      $close      = '';
      $inside     = $p[5];
    } else {
      // tag
      $element    = $p[1];
      $attributes = $p[2];
      $close      = $p[3];
      $inside     = $p[4];
    }

    $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
    $stripped .= $marker;

    if ( $close === '/>' ) {
      // Empty element tag, <tag />
      $content = null;
      $text = $inside;
      $tail = null;
    } else {
      // Find the matching terminator: "-->" for comments, otherwise the
      // closing tag of the same element (case-insensitive).
      if( $element == '!--' ) {
        $end = '/(-->)/';
      } else {
        $end = "/(<\\/$element\\s*>)/i";
      }
      $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
      $content = $q[0];
      if( count( $q ) < 3 ) {
        # No end tag -- let it run out to the end of the text.
        $tail = '';
        $text = '';
      } else {
        $tail = $q[1];
        $text = $q[2];
      }
    }
    
    // element, inner content, decoded attributes, and the full source text
    // of the extracted section
    $matches[$marker] = array( $element,
      $content,
      decodeTagAttributes( $attributes ),
      "<$element$attributes$close$content$tail" );
  }
  return $stripped;
}

/**
 * Escape the characters that make up HTML tags: double quote, greater-than
 * and less-than are replaced by &quot;, &gt; and &lt;. Ampersands are left
 * as they are.
 *
 * @param $in String: text that might contain HTML tags.
 * @return string Escaped string
 */
function wfEscapeHTMLTagsOnly( $in ) {
  $entities = array(
    '"' => '&quot;',
    '>' => '&gt;',
    '<' => '&lt;',
  );

  return str_replace( array_keys( $entities ), array_values( $entities ), $in );
}

/**
 * Replace the general markers in $text with the content recorded for them
 * in $state['general']. When the state has no such entry the text is
 * returned untouched.
 *
 * always call unstripNoWiki() after this one
 *
 * @param string $text
 * @param array  $state Strip state (taken by reference, not modified here)
 * @return string
 * @private
 */
function unstrip( $text, &$state ) {
  # TODO: good candidate for FSS
  return isset( $state['general'] )
    ? strtr( $text, $state['general'] )
    : $text;
}

/**
 * Convert a numeric character reference to UTF-8.
 *
 * Codepoints accepted by validateCodepoint() are encoded with
 * codepointToUtf8(); everything else yields UTF8_REPLACEMENT
 * (U+FFFD REPLACEMENT CHARACTER).
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint ) {
  return validateCodepoint( $codepoint )
    ? codepointToUtf8( $codepoint )
    : UTF8_REPLACEMENT;
}

/**
 * Resolve a named entity through the global $wgHtmlEntities table.
 *
 * A known name is returned as the UTF-8 encoding of its codepoint; an
 * unknown name is handed back as pseudo-entity source (eg &foo;).
 *
 * @param string $name Entity name without the surrounding "&" and ";"
 * @return string
 */
function decodeEntity( $name ) {
  global $wgHtmlEntities;

  if( !isset( $wgHtmlEntities[$name] ) ) {
    return "&$name;";
  }

  return codepointToUtf8( $wgHtmlEntities[$name] );
}

/**
 * Tell whether a Unicode codepoint is a valid character in XML: tab, line
 * feed, carriage return, or anything in the ranges 0x20-0xD7FF,
 * 0xE000-0xFFFD and 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
  // The three permitted control characters
  if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
    return true;
  }

  // Basic plane below the surrogate block
  if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
    return true;
  }

  // Basic plane above the surrogate block, up to U+FFFD
  if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
    return true;
  }

  // Supplementary planes
  return ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
  
/**
 * Encode a Unicode code point as a UTF-8 byte sequence (one to four bytes).
 * A value of 0x110000 or above makes the function print a message and
 * terminate the script.
 *
 * @param $codepoint Integer:
 * @return String
 * @public
 */
function codepointToUtf8( $codepoint ) {
	// One byte: plain ASCII
	if( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}

	// Two bytes
	if( $codepoint < 0x800 ) {
		$lead = ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0;
		$last = ( $codepoint & 0x3f ) | 0x80;
		return chr( $lead ) . chr( $last );
	}

	// Three bytes
	if( $codepoint < 0x10000 ) {
		$lead = ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0;
		$mid  = ( ( $codepoint >> 6 ) & 0x3f ) | 0x80;
		$last = ( $codepoint & 0x3f ) | 0x80;
		return chr( $lead ) . chr( $mid ) . chr( $last );
	}

	// Four bytes
	if( $codepoint < 0x110000 ) {
		$lead  = ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0;
		$upper = ( ( $codepoint >> 12 ) & 0x3f ) | 0x80;
		$lower = ( ( $codepoint >> 6 ) & 0x3f ) | 0x80;
		$last  = ( $codepoint & 0x3f ) | 0x80;
		return chr( $lead ) . chr( $upper ) . chr( $lower ) . chr( $last );
	}

	echo "Asked for code outside of range ($codepoint)\n";
	die( -1 );
}

/**
 * preg_replace_callback() handler for MW_CHAR_REFS_REGEX.
 *
 * Group 1 (named entity) goes to decodeEntity(), group 2 (decimal) and
 * groups 3/4 (hexadecimal) go to decodeChar(). If none of them is filled
 * the match is a lone ampersand and is returned unchanged.
 *
 * @param array $matches Match array of one character reference
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
  if( $matches[1] != '' ) {
    return decodeEntity( $matches[1] );
  }

  if( $matches[2] != '' ) {
    return decodeChar( intval( $matches[2] ) );
  }

  // Both hexadecimal spellings (&#x..; and &#X..;) are handled alike
  foreach( array( 3, 4 ) as $hexGroup ) {
    if( $matches[$hexGroup] != '' ) {
      return decodeChar( hexdec( $matches[$hexGroup] ) );
    }
  }

  # Last case should be an ampersand by itself
  return $matches[0];
}