author | Dan |
Tue, 18 Sep 2007 17:03:03 -0400 | |
changeset 136 | f2ee42f026f7 |
parent 73 | 0a74676a2f2f |
child 142 | ca9118d9c0f2 |
permissions | -rw-r--r-- |
<?php
/**
 * Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
 * Version 1.0.1 (Loch Ness)
 * Copyright (C) 2006-2007 Dan Fuhry
 *
 * This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
 *
 * This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under
 * the GPLv2; see the file GPL included with this package for details.
 *
 * We're using the MW parser because the Text_Wiki version simply refused to work under PHP 5.2.0. Porting this was
 * _not_ easy. <leaves to get cup of coffee>
 */

global $mStripState, $wgRandomKey;
$mStripState = Array();

// Building blocks interpolated into MW_ATTRIBS_REGEX below.
$attrib = '[a-zA-Z0-9]';
$space = '[\x09\x0a\x0d\x20]';

// Matches named, decimal and hexadecimal character references, plus a lone '&'.
// Written in /x mode, so the line breaks inside the pattern are not significant.
define( 'MW_CHAR_REFS_REGEX',
  '/&([A-Za-z0-9]+);
   |&\#([0-9]+);
   |&\#x([0-9A-Za-z]+);
   |&\#X([0-9A-Za-z]+);
   |(&)/x' );

// Matches one attribute (name plus optional value) inside a tag fragment.
// The pattern uses /x comments, which run to the end of the line: the line
// breaks inside this string literal ARE significant and must be preserved.
define( 'MW_ATTRIBS_REGEX',
  "/(?:^|$space)($attrib+)
    ($space*=$space*
    (?:
     # The attribute value: quoted or alone
      ".'"'."([^<".'"'."]*)".'"'."
     | '([^<']*)'
     |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
     |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                         # colors are specified like this.
                         # We'll be normalizing it.
    )
     )?(?=$space|\$)/sx" );

/**
 * emulate mediawiki parser, including stripping, etc.
 *
 * Only the table pass is active; the strip/unstrip stages that the parent
 * MediaWiki parser would normally run around it are not performed here.
 *
 * @param string $text the text to parse
 * @return string
 * @access public
 */

function process_tables( $text )
{
  // parse the text
  $text = doTableStuff( $text );

  return $text;
}

/**
 * parse the wiki syntax used to render tables
 *
 * Every generated HTML tag is wrapped in <nowiki>...</nowiki>. The stacks below
 * hold one entry per currently open (possibly nested) table.
 *
 * @param string $t the text to parse
 * @return string
 * @access private
 */

function doTableStuff( $t )
{
  $t = explode ( "\n" , $t ) ;
  $td = array () ;            # Is currently a td tag open?
  $ltd = array () ;           # Was it TD or TH?
  $tr = array () ;            # Is currently a tr tag open?
  $ltr = array () ;           # tr attributes
  $has_opened_tr = array();   # Did this table open a <tr> element?
  $indent_level = 0;          # indent level of the table

  foreach ( $t AS $k => $x )
  {
    $x = trim ( $x ) ;
    $fc = substr ( $x , 0 , 1 ) ;
    if ( preg_match( '/^(:*)\{\|(.*)$/', $x, $matches ) )
    {
      # Table start: "{|", optionally preceded by ':' indentation
      $indent_level = strlen( $matches[1] );

      $attributes = unstripForHTML( $matches[2] );

      $t[$k] = str_repeat( '<dl><dd>', $indent_level ) .
        '<nowiki><table' . fixTagAttributes( $attributes, 'table' ) . '></nowiki>' ;
      array_push ( $td , false ) ;
      array_push ( $ltd , '' ) ;
      array_push ( $tr , false ) ;
      array_push ( $ltr , '' ) ;
      array_push ( $has_opened_tr, false );
    }
    else if ( count ( $td ) == 0 )
    {
      # Not inside a table: don't do any of the following
    }
    else if ( '|}' == substr ( $x , 0 , 2 ) )
    {
      # Table end. $z is built back to front: each open element's closing tag
      # is prepended in front of </table>.
      $z = "<nowiki></table></nowiki>" . substr ( $x , 2);
      $l = array_pop ( $ltd ) ;
      if ( !array_pop ( $has_opened_tr ) ) $z = "<nowiki><tr><td></td></tr></nowiki>" . $z ;
      if ( array_pop ( $tr ) ) $z = '<nowiki></tr></nowiki>' . $z ;
      if ( array_pop ( $td ) ) $z = '<nowiki></'.$l.'></nowiki>' . $z ;
      array_pop ( $ltr ) ;
      $t[$k] = $z . str_repeat( '<nowiki></dd></dl></nowiki>', $indent_level );
    }
    else if ( '|-' == substr ( $x , 0 , 2 ) )
    {
      # Row separator. Allows for |---------------
      $x = substr ( $x , 1 ) ;
      while ( $x != '' && substr ( $x , 0 , 1 ) == '-' ) $x = substr ( $x , 1 ) ;
      $z = '' ;
      $l = array_pop ( $ltd ) ;
      array_pop ( $has_opened_tr );
      array_push ( $has_opened_tr , true ) ;
      if ( array_pop ( $tr ) ) $z = '<nowiki></tr></nowiki>' . $z ;
      if ( array_pop ( $td ) ) $z = '<nowiki></'.$l.'></nowiki>' . $z ;
      array_pop ( $ltr ) ;
      $t[$k] = $z ;
      array_push ( $tr , false ) ;
      array_push ( $td , false ) ;
      array_push ( $ltd , '' ) ;
      # The <tr> itself is emitted lazily by the first cell of the row; only
      # its attributes are remembered here.
      $attributes = unstripForHTML( $x );
      array_push ( $ltr , fixTagAttributes( $attributes, 'tr' ) ) ;
    }
    else if ( '|' == $fc || '!' == $fc || '|+' == substr ( $x , 0 , 2 ) )
    {
      # $x is a table row
      if ( '|+' == substr ( $x , 0 , 2 ) )
      {
        # Caption
        $fc = '+' ;
        $x = substr ( $x , 1 ) ;
      }
      $after = substr ( $x , 1 ) ;
      if ( $fc == '!' ) $after = str_replace ( '!!' , '||' , $after ) ;

      // Split up multiple cells on the same line.
      // FIXME: This can result in improper nesting of tags processed
      // by earlier parser steps, but should avoid splitting up eg
      // attribute values containing literal "||".
      $after = wfExplodeMarkup( '||', $after );

      $t[$k] = '' ;

      # Loop through each table cell
      foreach ( $after AS $theline )
      {
        $z = '' ;
        if ( $fc != '+' )
        {
          $tra = array_pop ( $ltr ) ;
          if ( !array_pop ( $tr ) ) $z = '<nowiki><tr'.$tra."></nowiki>\n" ;
          array_push ( $tr , true ) ;
          array_push ( $ltr , '' ) ;
          array_pop ( $has_opened_tr );
          array_push ( $has_opened_tr , true ) ;
        }

        # Close the previous cell, if one is still open
        $l = array_pop ( $ltd ) ;
        if ( array_pop ( $td ) ) $z = '<nowiki></'.$l.'></nowiki>' . $z ;
        if ( $fc == '|' ) $l = 'td' ;
        else if ( $fc == '!' ) $l = 'th' ;
        else if ( $fc == '+' ) $l = 'caption' ;
        else $l = '' ;
        array_push ( $ltd , $l ) ;

        # Cell parameters
        $y = explode ( '|' , $theline , 2 ) ;
        # Note that a '|' inside an invalid link should not
        # be mistaken as delimiting cell parameters
        if ( strpos( $y[0], '[[' ) !== false )
        {
          $y = array ($theline);
        }
        if ( count ( $y ) == 1 )
        {
          $y = "{$z}<nowiki><{$l}></nowiki>{$y[0]}" ;
        }
        else
        {
          $attributes = unstripForHTML( $y[0] );
          $y = "{$z}<nowiki><{$l}".fixTagAttributes($attributes, $l)."></nowiki>{$y[1]}" ;
        }
        $t[$k] .= $y ;
        array_push ( $td , true ) ;
      }
    }
  }

  # Closing open td, tr && table
  while ( count ( $td ) > 0 )
  {
    $l = array_pop ( $ltd ) ;
    # Close with the tag that was actually opened (td, th or caption), the
    # same way the "|}" branch does, rather than assuming td.
    if ( array_pop ( $td ) ) $t[] = '<nowiki></'.$l.'></nowiki>' ;
    if ( array_pop ( $tr ) ) $t[] = '<nowiki></tr></nowiki>' ;
    if ( !array_pop ( $has_opened_tr ) ) $t[] = "<nowiki><tr><td></td></tr></nowiki>" ;
    $t[] = '<nowiki></table></nowiki>' ;
  }

  $t = implode ( "\n" , $t ) ;

  # special case: don't return empty table
  if($t == "<nowiki><table></nowiki>\n<nowiki><tr><td></td></tr></nowiki>\n<nowiki></table></nowiki>")
    $t = '';
  return $t ;
}

/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 * Output is safe for further wikitext processing, with escaping of
 * values that could trigger problems.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text
 * @param string $element
 * @return string
 */
function fixTagAttributes( $text, $element )
{
  if( trim( $text ) == '' )
  {
    return '';
  }

  $stripped = validateTagAttributes(
    decodeTagAttributes( $text ), $element );

  $attribs = array();
  foreach( $stripped as $attribute => $value )
  {
    $encAttribute = htmlspecialchars( $attribute );
    $encValue     = safeEncodeAttribute( $value );

    $attribs[] = "$encAttribute=".'"'."$encValue".'"';
  }
  return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
}

/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 * @param $text
 * @return HTML-encoded text fragment
 */
function safeEncodeAttribute( $text )
{
  $encValue = encodeAttribute( $text );

  # Templates and links may be expanded in later parsing,
  # creating invalid or dangerous output. Suppress this.
  # Each trigger sequence is broken up by replacing (at least) its first
  # character with a numeric character reference.
  $encValue = strtr( $encValue, array(
    '<'    => '&lt;',   // This should never happen,
    '>'    => '&gt;',   // we've received invalid input
    '"'    => '&quot;', // which should have been escaped.
    '{'    => '&#123;',
    '['    => '&#91;',
    "''"   => '&#39;&#39;',
    'ISBN' => '&#73;SBN',
    'RFC'  => '&#82;FC',
    'PMID' => '&#80;MID',
    '|'    => '&#124;',
    '__'   => '&#95;_',
  ) );

  return $encValue;
}

/**
 * Encode an attribute value for HTML output.
 * @param $text
 * @return HTML-encoded text fragment
 */
function encodeAttribute( $text )
{
  $encValue = htmlspecialchars( $text );

  // Whitespace is normalized during attribute decoding,
  // so if we've been passed non-spaces we must encode them
  // ahead of time or they won't be preserved.
  $encValue = strtr( $encValue, array(
    "\n" => '&#10;',
    "\r" => '&#13;',
    "\t" => '&#9;',
  ) );

  return $encValue;
}

/**
 * Restore both kinds of stripped content (general, then nowiki) in $text
 * using the global strip state.
 *
 * @param string $text
 * @return string
 */
function unstripForHTML( $text )
{
  global $mStripState;
  $text = unstrip( $text, $mStripState );
  $text = unstripNoWiki( $text, $mStripState );
  return $text;
}

/**
 * Always call this after unstrip() to preserve the order
 *
 * Replaces the markers recorded in $state['nowiki'] with their stored text;
 * returns $text unchanged when no such markers were recorded.
 *
 * @private
 */
function unstripNoWiki( $text, &$state )
{
  if ( !isset( $state['nowiki'] ) )
  {
    return $text;
  }

  # TODO: good candidate for FSS
  $text = strtr( $text, $state['nowiki'] );

  return $text;
}

/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given element type.
 *
 * - Discards attributes not on a whitelist for the given element
 * - Unsafe style attributes are discarded
 *
 * @param array $attribs
 * @param string $element
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element )
{
  // Flip so that membership can be tested with isset()
  $whitelist = array_flip( attributeWhitelist( $element ) );
  $out = array();
  foreach( $attribs as $attribute => $value )
  {
    if( !isset( $whitelist[$attribute] ) )
    {
      continue;
    }
    # Strip javascript "expression" from stylesheets.
    # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
    if( $attribute == 'style' )
    {
      $value = checkCss( $value );
      if( $value === false )
      {
        # haxx0r
        continue;
      }
    }

    if ( $attribute === 'id' )
      $value = escapeId( $value );

    // If this attribute was previously set, override it.
    // Output should only have one attribute of each name.
    $out[$attribute] = $value;
  }
  return $out;
}

/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * Currently URL references, 'expression', 'tps' are forbidden.
 *
 * The returned string has character references decoded and comments removed;
 * CSS backslash escapes are only decoded for the purpose of the check.
 *
 * @param string $value
 * @return mixed
 */
function checkCss( $value )
{
  $stripped = decodeCharReferences( $value );

  // Remove any comments; IE gets token splitting wrong
  $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped );
  $value = $stripped;

  // ... and continue checks
  // Decode CSS hex escapes (\75 -> "u") so that escaped keywords are caught.
  // A callback is used instead of the /e (eval) modifier, which newer PHP
  // versions no longer support.
  $stripped = preg_replace_callback( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
    'checkCssDecodeEscape', $stripped );
  $stripped = str_replace( '\\', '', $stripped );
  if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
      $stripped ) )
  {
    # haxx0r
    return false;
  }

  return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): turn one CSS hex escape into
 * the UTF-8 character it denotes.
 *
 * @param array $matches $matches[1] holds the 1-6 hex digits of the escape
 * @return string
 * @access private
 */
function checkCssDecodeEscape( $matches )
{
  $codepoint = hexdec( $matches[1] );
  // Six hex digits can exceed U+10FFFF, and codepointToUtf8() terminates the
  // script for such values; substitute U+FFFD instead of dying on user input.
  if ( $codepoint > 0x10ffff )
  {
    return "\xef\xbf\xbd";
  }
  return codepointToUtf8( $codepoint );
}

/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * @param string $text
 * @return string
 * @access public
 * @static
 */
function decodeCharReferences( $text )
{
  return preg_replace_callback(
    MW_CHAR_REFS_REGEX,
    'decodeCharReferencesCallback',
    $text );
}

/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name.
 *
 * The table is built once per request and cached in a static variable.
 * Unknown elements get an empty list.
 *
 * @param string $element
 * @return array
 */
function attributeWhitelist( $element )
{
  static $list;
  if( !isset( $list ) )
  {
    $list = setupAttributeWhitelist();
  }
  return isset( $list[$element] )
    ? $list[$element]
    : array();
}

/**
 * Build the map of element name => list of permitted attribute names.
 *
 * @return array
 */
function setupAttributeWhitelist()
{
  $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
  $block = array_merge( $common, array( 'align' ) );
  $tablealign = array( 'align', 'char', 'charoff', 'valign' );
  $tablecell = array( 'abbr',
                      'axis',
                      'headers',
                      'scope',
                      'rowspan',
                      'colspan',
                      'nowrap', # deprecated
                      'width',  # deprecated
                      'height', # deprecated
                      'bgcolor' # deprecated
                      );

  # Numbers refer to sections in HTML 4.01 standard describing the element.
  # See: http://www.w3.org/TR/html4/
  $whitelist = array (
    # 7.5.4
    'div'        => $block,
    'center'     => $common, # deprecated
    'span'       => $block, # ??

    # 7.5.5
    'h1'         => $block,
    'h2'         => $block,
    'h3'         => $block,
    'h4'         => $block,
    'h5'         => $block,
    'h6'         => $block,

    # 7.5.6
    # address

    # 8.2.4
    # bdo

    # 9.2.1
    'em'         => $common,
    'strong'     => $common,
    'cite'       => $common,
    # dfn
    'code'       => $common,
    # samp
    # kbd
    'var'        => $common,
    # abbr
    # acronym

    # 9.2.2
    'blockquote' => array_merge( $common, array( 'cite' ) ),
    # q

    # 9.2.3
    'sub'        => $common,
    'sup'        => $common,

    # 9.3.1
    'p'          => $block,

    # 9.3.2
    'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

    # 9.3.4
    'pre'        => array_merge( $common, array( 'width' ) ),

    # 9.4
    'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
    'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

    # 10.2
    'ul'         => array_merge( $common, array( 'type' ) ),
    'ol'         => array_merge( $common, array( 'type', 'start' ) ),
    'li'         => array_merge( $common, array( 'type', 'value' ) ),

    # 10.3
    'dl'         => $common,
    'dd'         => $common,
    'dt'         => $common,

    # 11.2.1
    'table'      => array_merge( $common,
                      array( 'summary', 'width', 'border', 'frame',
                             'rules', 'cellspacing', 'cellpadding',
                             'align', 'bgcolor',
                      ) ),

    # 11.2.2
    'caption'    => array_merge( $common, array( 'align' ) ),

    # 11.2.3
    'thead'      => array_merge( $common, $tablealign ),
    'tfoot'      => array_merge( $common, $tablealign ),
    'tbody'      => array_merge( $common, $tablealign ),

    # 11.2.4
    'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
    'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

    # 11.2.5
    'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

    # 11.2.6
    'td'         => array_merge( $common, $tablecell, $tablealign ),
    'th'         => array_merge( $common, $tablecell, $tablealign ),

    # 12.2
    # added by dan
    'a'          => array_merge( $common, array( 'href', 'name' ) ),

    # 13.2
    # added by dan
    'img'        => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),

    # 15.2.1
    'tt'         => $common,
    'b'          => $common,
    'i'          => $common,
    'big'        => $common,
    'small'      => $common,
    'strike'     => $common,
    's'          => $common,
    'u'          => $common,

    # 15.2.2
    'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
    # basefont

    # 15.3
    'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

    # XHTML Ruby annotation text module, simple ruby only.
    # http://www.w3c.org/TR/ruby/
    'ruby'       => $common,
    # rbc
    # rtc
    'rb'         => $common,
    'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
    'rp'         => $common,

    # For compatibility with the XHTML parser.
    'nowiki'     => array(),
    'noinclude'  => array(),
    'nodisplay'  => array(),

    # XHTML stuff
    'acronym'    => $common
    );
  return $whitelist;
}

/**
 * Given a value escape it so that it can be used in an id attribute and
 * return it, this does not validate the value however (see first link)
 *
 * Spaces become underscores, character references are decoded, and the
 * result is URL-encoded with '%' rewritten to '.' (and '%3A' back to ':').
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id )
{
  // Order matters: '%3A' must be restored before the generic '%' => '.' rule.
  static $replace = array(
    '%3A' => ':',
    '%' => '.'
  );

  $id = urlencode( decodeCharReferences( strtr( $id, ' ', '_' ) ) );

  return str_replace( array_keys( $replace ), array_values( $replace ), $id );
}

/**
 * More or less "markup-safe" explode()
 * Ignores any instances of the separator inside <...>
 * @param string $separator
 * @param string $text
 * @return array
 */
function wfExplodeMarkup( $separator, $text )
{
  $placeholder = "\x00";

  // Just in case...
  $text = str_replace( $placeholder, '', $text );

  // Hide separators that occur inside tags behind the placeholder, split,
  // then put the hidden separators back into each piece.
  $replacer = new ReplacerCallback( $separator, $placeholder );
  $cleaned = preg_replace_callback( '/(<.*?>)/', array( $replacer, 'go' ), $text );

  $items = explode( $separator, $cleaned );
  foreach( $items as $i => $str )
  {
    $items[$i] = str_replace( $placeholder, $separator, $str );
  }

  return $items;
}

/**
 * Small callback object for preg_replace_callback(): replaces one fixed
 * string with another inside the first captured group of each match.
 */
class ReplacerCallback
{
  var $from;
  var $to;

  /**
   * @param string $from text to search for
   * @param string $to   replacement text
   */
  function __construct( $from, $to )
  {
    $this->from = $from;
    $this->to = $to;
  }

  /**
   * Old-style (class-named) constructor, kept so that engines which only
   * recognise this form still initialise the object.
   */
  function ReplacerCallback( $from, $to )
  {
    $this->__construct( $from, $to );
  }

  /**
   * @param array $matches match data; only $matches[1] is used
   * @return string
   */
  function go( $matches )
  {
    return str_replace( $this->from, $this->to, $matches[1] );
  }
}

/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text.
 *
 * @param string
 * @return array
 */
function decodeTagAttributes( $text )
{
  $attribs = array();

  if( trim( $text ) == '' )
  {
    return $attribs;
  }

  $pairs = array();
  if( !preg_match_all(
    MW_ATTRIBS_REGEX,
    $text,
    $pairs,
    PREG_SET_ORDER ) )
  {
    return $attribs;
  }

  foreach( $pairs as $set )
  {
    $attribute = strtolower( $set[1] );
    $value = getTagAttributeCallback( $set );

    // Normalize whitespace
    $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
    $value = trim( $value );

    // Decode character references
    $attribs[$attribute] = decodeCharReferences( $value );
  }
  return $attribs;
}

/**
 * Pick the appropriate attribute value from a match set from the
 * MW_ATTRIBS_REGEX matches.
 *
 * The groups are tested from the highest index down, so the check relies on
 * groups after the one that matched being absent from $set.
 *
 * @param array $set
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set )
{
  if( isset( $set[6] ) )
  {
    # Illegal #XXXXXX color with no quotes.
    return $set[6];
  }
  elseif( isset( $set[5] ) )
  {
    # No quotes.
    return $set[5];
  }
  elseif( isset( $set[4] ) )
  {
    # Single-quoted
    return $set[4];
  }
  elseif( isset( $set[3] ) )
  {
    # Double-quoted
    return $set[3];
  }
  elseif( !isset( $set[2] ) )
  {
    # In XHTML, attributes must have a value.
    # For 'reduced' form, return explicitly the attribute name here.
    return $set[1];
  }
  else
  {
    die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
  }
}

/**
 * Strips and renders nowiki, pre, math, hiero
 * If $render is set, performs necessary rendering operations on plugins
 * Returns the text, and fills an array with data needed in unstrip()
 * If the $state is already a valid strip state, it adds to the state
 *
 * @param bool $stripcomments when set, HTML comments <!-- like this -->
 *  will be stripped in addition to other tags. This is important
 *  for section editing, where these comments cause confusion when
 *  counting the sections in the wikisource
 *
 * @param array dontstrip contains tags which should not be stripped;
 *  used to prevent stipping of <gallery> when saving (fixes bug 2700)
 *
 * @access private
 */
function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () )
{
  global $wgRandomKey;
  $render = true;

  // A fresh random marker prefix is generated on every call
  $wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff));
  $uniq_prefix =& $wgRandomKey;
  $commentState = array();

  $elements = array( 'nowiki', 'gallery' );

  # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
  foreach ( $elements AS $k => $v )
  {
    if ( !in_array ( $v , $dontstrip ) ) continue;
    unset ( $elements[$k] );
  }

  $matches = array();
  $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

  foreach( $matches as $marker => $data )
  {
    list( $element, $content, $params, $tag ) = $data;
    if( $render )
    {
      $tagName = strtolower( $element );
      switch( $tagName )
      {
        case '!--':
          // Comment
          if( substr( $tag, -3 ) == '-->' )
          {
            $output = $tag;
          }
          else
          {
            // Unclosed comment in input.
            // Close it so later stripping can remove it
            $output = "$tag-->";
          }
          break;
        case 'html':
          // $wgRawHtml is never brought into this function's scope, so this
          // test is always false and <html> content is escaped like <nowiki>.
          if( !empty( $wgRawHtml ) )
          {
            $output = $content;
            break;
          }
          // Shouldn't happen otherwise. :)
          // (deliberate fall-through)
        case 'nowiki':
          $output = wfEscapeHTMLTagsOnly( $content );
          break;
        default:
          // No renderer for this element (e.g. gallery): its marker is
          // replaced by nothing.
          $output = '';
      }
    }
    else
    {
      // Just stripping tags; keep the source
      $output = $tag;
    }

    // Unstrip the output, because unstrip() is no longer recursive so
    // it won't do it itself
    $output = unstrip( $output, $state );

    if( !$stripcomments && $element == '!--' )
    {
      $commentState[$marker] = $output;
    }
    elseif ( $element == 'html' || $element == 'nowiki' )
    {
      $state['nowiki'][$marker] = $output;
    }
    else
    {
      $state['general'][$marker] = $output;
    }
  }

  # Unstrip comments unless explicitly told otherwise.
  # (The comments are always stripped prior to this point, so as to
  # not invoke any extension tags / parser hooks contained within
  # a comment.)
  if ( !$stripcomments )
  {
    // Put them all back and forget them
    $text = strtr( $text, $commentState );
  }

  return $text;
}

/**
 * Replaces all occurrences of HTML-style comments and the given tags
 * in the text with a random marker and returns the new text. The output
 * parameter $matches will be an associative array filled with data in
 * the form:
 *   'UNIQ-xxxxx' => array(
 *     'element',
 *     'tag content',
 *     array( 'param' => 'x' ),
 *     '<element param="x">tag content</element>' ) )
 *
 * @param $elements list of element names. Comments are always extracted.
 * @param $text Source text string.
 * @param $uniq_prefix
 *
 * @access private
 * @static
 */
function extractTagsAndParams( $elements, $text, &$matches, $uniq_prefix = '' )
{
  static $n = 1;   // marker counter, shared across calls
  $stripped = '';
  $matches = array();

  $taglist = implode( '|', $elements );
  $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

  while ( '' != $text )
  {
    $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
    $stripped .= $p[0];
    if( count( $p ) < 5 )
    {
      // No further opening tag or comment
      break;
    }
    if( count( $p ) > 5 )
    {
      // comment
      $element    = $p[4];
      $attributes = '';
      $close      = '';
      $inside     = $p[5];
    }
    else
    {
      // tag
      $element    = $p[1];
      $attributes = $p[2];
      $close      = $p[3];
      $inside     = $p[4];
    }

    $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
    $stripped .= $marker;

    if ( $close === '/>' )
    {
      // Empty element tag, <tag />
      $content = null;
      $text = $inside;
      $tail = null;
    }
    else
    {
      if( $element == '!--' )
      {
        $end = '/(-->)/';
      }
      else
      {
        $end = "/(<\\/$element\\s*>)/i";
      }
      $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
      $content = $q[0];
      if( count( $q ) < 3 )
      {
        # No end tag -- let it run out to the end of the text.
        $tail = '';
        $text = '';
      }
      else
      {
        $tail = $q[1];
        $text = $q[2];
      }
    }

    $matches[$marker] = array( $element,
      $content,
      decodeTagAttributes( $attributes ),
      "<$element$attributes$close$content$tail" );
  }
  return $stripped;
}

/**
 * Escape html tags
 * Basically replacing " > and < with HTML entities ( &quot;, &gt;, &lt;)
 *
 * @param $in String: text that might contain HTML tags.
 * @return string Escaped string
 */
function wfEscapeHTMLTagsOnly( $in )
{
  return str_replace(
    array( '"', '>', '<' ),
    array( '&quot;', '&gt;', '&lt;' ),
    $in );
}

/**
 * Restores pre, math, and other extensions removed by strip()
 *
 * always call unstripNoWiki() after this one
 * @private
 */
function unstrip( $text, &$state )
{
  if ( !isset( $state['general'] ) )
  {
    return $text;
  }

  # TODO: good candidate for FSS
  $text = strtr( $text, $state['general'] );

  return $text;
}

/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint )
{
  if( validateCodepoint( $codepoint ) )
  {
    return codepointToUtf8( $codepoint );
  }
  else
  {
    // UTF8_REPLACEMENT is not defined in this file; fall back to the UTF-8
    // encoding of U+FFFD when nothing else has defined it.
    return defined( 'UTF8_REPLACEMENT' ) ? UTF8_REPLACEMENT : "\xef\xbf\xbd";
  }
}

/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg &foo;)
 *
 * The lookup table is the global $wgHtmlEntities, which is not set up in
 * this file.
 *
 * @param string $name
 * @return string
 */
function decodeEntity( $name )
{
  global $wgHtmlEntities;
  if( isset( $wgHtmlEntities[$name] ) )
  {
    return codepointToUtf8( $wgHtmlEntities[$name] );
  }
  else
  {
    return "&$name;";
  }
}

/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint )
{
  return ($codepoint ==    0x09)
    || ($codepoint ==    0x0a)
    || ($codepoint ==    0x0d)
    || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
    || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
    || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
}

/**
 * Return UTF-8 sequence for a given Unicode code point.
 * May die if fed out of range data.
 *
 * @param $codepoint Integer:
 * @return String
 * @public
 */
function codepointToUtf8( $codepoint )
{
  if($codepoint <     0x80) return chr($codepoint);
  if($codepoint <    0x800) return chr($codepoint >>  6 & 0x3f | 0xc0) .
                                   chr($codepoint       & 0x3f | 0x80);
  if($codepoint <  0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
                                   chr($codepoint >>  6 & 0x3f | 0x80) .
                                   chr($codepoint       & 0x3f | 0x80);
  if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
                                   chr($codepoint >> 12 & 0x3f | 0x80) .
                                   chr($codepoint >>  6 & 0x3f | 0x80) .
                                   chr($codepoint       & 0x3f | 0x80);

  echo "Asked for code outside of range ($codepoint)\n";
  die( -1 );
}

/**
 * preg_replace_callback() handler for MW_CHAR_REFS_REGEX: groups 1-4 carry a
 * named, decimal, lower-case-x hex or upper-case-X hex reference respectively.
 *
 * @param array $matches
 * @return string
 */
function decodeCharReferencesCallback( $matches )
{
  if( $matches[1] != '' )
  {
    return decodeEntity( $matches[1] );
  }
  elseif( $matches[2] != '' )
  {
    return decodeChar( intval( $matches[2] ) );
  }
  elseif( $matches[3] != '' )
  {
    return decodeChar( hexdec( $matches[3] ) );
  }
  elseif( $matches[4] != '' )
  {
    return decodeChar( hexdec( $matches[4] ) );
  }
  # Last case should be an ampersand by itself
  return $matches[0];
}