diff -r de56132c008d -r bdac73ed481e includes/wikiengine/TagSanitizer.php --- a/includes/wikiengine/TagSanitizer.php Sun Mar 28 21:49:26 2010 -0400 +++ b/includes/wikiengine/TagSanitizer.php Sun Mar 28 23:10:46 2010 -0400 @@ -25,18 +25,18 @@ |(&)/x' ); define( 'MW_ATTRIBS_REGEX', - "/(?:^|$space)($attrib+) - ($space*=$space* - (?: - # The attribute value: quoted or alone - \"([^<\"]*)\" - | '([^<']*)' - | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) - | (\#[0-9a-fA-F]+) # Technically wrong, but lots of - # colors are specified like this. - # We'll be normalizing it. - ) - )?(?=$space|\$)/sx" ); + "/(?:^|$space)($attrib+) + ($space*=$space* + (?: + # The attribute value: quoted or alone + \"([^<\"]*)\" + | '([^<']*)' + | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) + | (\#[0-9a-fA-F]+) # Technically wrong, but lots of + # colors are specified like this. + # We'll be normalizing it. + ) + )?(?=$space|\$)/sx" ); /** * Take a tag soup fragment listing an HTML element's attributes @@ -58,21 +58,21 @@ * @return string */ function fixTagAttributes( $text, $element ) { - if( trim( $text ) == '' ) { - return ''; - } - - $stripped = validateTagAttributes( - decodeTagAttributes( $text ), $element ); - - $attribs = array(); - foreach( $stripped as $attribute => $value ) { - $encAttribute = htmlspecialchars( $attribute ); - $encValue = safeEncodeAttribute( $value ); - - $attribs[] = "$encAttribute=".'"'."$encValue".'"'.""; // " - } - return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; + if( trim( $text ) == '' ) { + return ''; + } + + $stripped = validateTagAttributes( + decodeTagAttributes( $text ), $element ); + + $attribs = array(); + foreach( $stripped as $attribute => $value ) { + $encAttribute = htmlspecialchars( $attribute ); + $encValue = safeEncodeAttribute( $value ); + + $attribs[] = "$encAttribute=".'"'."$encValue".'"'.""; // " + } + return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; } /** @@ -82,25 +82,25 @@ * @return HTML-encoded text fragment */ function safeEncodeAttribute( $text ) { - $encValue= encodeAttribute( $text ); - - # Templates and links may be expanded in later parsing, - # creating invalid or dangerous output. Suppress this. - $encValue = strtr( $encValue, array( - '<' => '<', // This should never happen, - '>' => '>', // we've received invalid input - '"' => '"', // which should have been escaped. - '{' => '{', - '[' => '[', - "''" => '''', - 'ISBN' => 'ISBN', - 'RFC' => 'RFC', - 'PMID' => 'PMID', - '|' => '|', - '__' => '__', - ) ); + $encValue= encodeAttribute( $text ); + + # Templates and links may be expanded in later parsing, + # creating invalid or dangerous output. Suppress this. + $encValue = strtr( $encValue, array( + '<' => '<', // This should never happen, + '>' => '>', // we've received invalid input + '"' => '"', // which should have been escaped. + '{' => '{', + '[' => '[', + "''" => '''', + 'ISBN' => 'ISBN', + 'RFC' => 'RFC', + 'PMID' => 'PMID', + '|' => '|', + '__' => '__', + ) ); - return $encValue; + return $encValue; } /** @@ -109,45 +109,45 @@ * @return HTML-encoded text fragment */ function encodeAttribute( $text ) { - - // In Enano 1.0.3, added this cheapo hack to keep ampersands - // from being double-sanitized. Thanks to markybob from #deluge. - - // htmlspecialchars() the "manual" way - $encValue = strtr( $text, array( - '&' => '&', - '"' => '"', - '<' => '<', - '>' => '>', - ''' => "'" - ) ); - - $encValue = strtr( $text, array( - '&' => '&', - '"' => '"', - '<' => '<', - '>' => '>', - "'" => ''' - ) ); - - - // Whitespace is normalized during attribute decoding, - // so if we've been passed non-spaces we must encode them - // ahead of time or they won't be preserved. - $encValue = strtr( $encValue, array( - "\n" => ' ', - "\r" => ' ', - "\t" => ' ', - ) ); - - return $encValue; + + // In Enano 1.0.3, added this cheapo hack to keep ampersands + // from being double-sanitized. Thanks to markybob from #deluge. + + // htmlspecialchars() the "manual" way + $encValue = strtr( $text, array( + '&' => '&', + '"' => '"', + '<' => '<', + '>' => '>', + ''' => "'" + ) ); + + $encValue = strtr( $text, array( + '&' => '&', + '"' => '"', + '<' => '<', + '>' => '>', + "'" => ''' + ) ); + + + // Whitespace is normalized during attribute decoding, + // so if we've been passed non-spaces we must encode them + // ahead of time or they won't be preserved. + $encValue = strtr( $encValue, array( + "\n" => ' ', + "\r" => ' ', + "\t" => ' ', + ) ); + + return $encValue; } function unstripForHTML( $text ) { - global $mStripState; - $text = unstrip( $text, $mStripState ); - $text = unstripNoWiki( $text, $mStripState ); - return $text; + global $mStripState; + $text = unstrip( $text, $mStripState ); + $text = unstripNoWiki( $text, $mStripState ); + return $text; } /** @@ -156,14 +156,14 @@ * @private */ function unstripNoWiki( $text, &$state ) { - if ( !isset( $state['nowiki'] ) ) { - return $text; - } + if ( !isset( $state['nowiki'] ) ) { + return $text; + } - # TODO: good candidate for FSS - $text = strtr( $text, $state['nowiki'] ); - - return $text; + # TODO: good candidate for FSS + $text = strtr( $text, $state['nowiki'] ); + + return $text; } /** @@ -181,30 +181,30 @@ * @todo Check for unique id attribute :P */ function validateTagAttributes( $attribs, $element ) { - $whitelist = array_flip( attributeWhitelist( $element ) ); - $out = array(); - foreach( $attribs as $attribute => $value ) { - if( !isset( $whitelist[$attribute] ) ) { - continue; - } - # Strip javascript "expression" from stylesheets. - # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp - if( $attribute == 'style' ) { - $value = checkCss( $value ); - if( $value === false ) { - # haxx0r - continue; - } - } + $whitelist = array_flip( attributeWhitelist( $element ) ); + $out = array(); + foreach( $attribs as $attribute => $value ) { + if( !isset( $whitelist[$attribute] ) ) { + continue; + } + # Strip javascript "expression" from stylesheets. + # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp + if( $attribute == 'style' ) { + $value = checkCss( $value ); + if( $value === false ) { + # haxx0r + continue; + } + } - if ( $attribute === 'id' ) - $value = escapeId( $value ); + if ( $attribute === 'id' ) + $value = escapeId( $value ); - // If this attribute was previously set, override it. - // Output should only have one attribute of each name. - $out[$attribute] = $value; - } - return $out; + // If this attribute was previously set, override it. + // Output should only have one attribute of each name. + $out[$attribute] = $value; + } + return $out; } /** @@ -217,23 +217,23 @@ * @return mixed */ function checkCss( $value ) { - $stripped = decodeCharReferences( $value ); + $stripped = decodeCharReferences( $value ); - // Remove any comments; IE gets token splitting wrong - $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped ); - $value = $stripped; + // Remove any comments; IE gets token splitting wrong + $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped ); + $value = $stripped; - // ... and continue checks - $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e', - 'codepointToUtf8(hexdec("$1"))', $stripped ); - $stripped = str_replace( '\\', '', $stripped ); - if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', - $stripped ) ) { - # haxx0r - return false; - } - - return $value; + // ... and continue checks + $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e', + 'codepointToUtf8(hexdec("$1"))', $stripped ); + $stripped = str_replace( '\\', '', $stripped ); + if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', + $stripped ) ) { + # haxx0r + return false; + } + + return $value; } /** @@ -246,10 +246,10 @@ * @static */ function decodeCharReferences( $text ) { - return preg_replace_callback( - MW_CHAR_REFS_REGEX, - 'decodeCharReferencesCallback', - $text ); + return preg_replace_callback( + MW_CHAR_REFS_REGEX, + 'decodeCharReferencesCallback', + $text ); } /** @@ -260,13 +260,13 @@ * @return array */ function attributeWhitelist( $element ) { - static $list; - if( !isset( $list ) ) { - $list = setupAttributeWhitelist(); - } - return isset( $list[$element] ) - ? $list[$element] - : array(); + static $list; + if( !isset( $list ) ) { + $list = setupAttributeWhitelist(); + } + return isset( $list[$element] ) + ? $list[$element] + : array(); } /** @@ -274,165 +274,165 @@ * @return array */ function setupAttributeWhitelist() { - global $db, $session, $paths, $template, $plugins; - $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' ); - $block = array_merge( $common, array( 'align' ) ); - $tablealign = array( 'align', 'char', 'charoff', 'valign' ); - $tablecell = array( 'abbr', - 'axis', - 'headers', - 'scope', - 'rowspan', - 'colspan', - 'nowrap', # deprecated - 'width', # deprecated - 'height', # deprecated - 'bgcolor' # deprecated - ); + global $db, $session, $paths, $template, $plugins; + $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' ); + $block = array_merge( $common, array( 'align' ) ); + $tablealign = array( 'align', 'char', 'charoff', 'valign' ); + $tablecell = array( 'abbr', + 'axis', + 'headers', + 'scope', + 'rowspan', + 'colspan', + 'nowrap', # deprecated + 'width', # deprecated + 'height', # deprecated + 'bgcolor' # deprecated + ); - # Numbers refer to sections in HTML 4.01 standard describing the element. - # See: http://www.w3.org/TR/html4/ - $whitelist = array ( - # 7.5.4 - 'div' => $block, - 'center' => $common, # deprecated - 'span' => $block, # ?? + # Numbers refer to sections in HTML 4.01 standard describing the element. + # See: http://www.w3.org/TR/html4/ + $whitelist = array ( + # 7.5.4 + 'div' => $block, + 'center' => $common, # deprecated + 'span' => $block, # ?? - # 7.5.5 - 'h1' => $block, - 'h2' => $block, - 'h3' => $block, - 'h4' => $block, - 'h5' => $block, - 'h6' => $block, + # 7.5.5 + 'h1' => $block, + 'h2' => $block, + 'h3' => $block, + 'h4' => $block, + 'h5' => $block, + 'h6' => $block, - # 7.5.6 - # address + # 7.5.6 + # address - # 8.2.4 - # bdo + # 8.2.4 + # bdo - # 9.2.1 - 'em' => $common, - 'strong' => $common, - 'cite' => $common, - # dfn - 'code' => $common, - # samp - # kbd - 'var' => $common, - # abbr - # acronym + # 9.2.1 + 'em' => $common, + 'strong' => $common, + 'cite' => $common, + # dfn + 'code' => $common, + # samp + # kbd + 'var' => $common, + # abbr + # acronym - # 9.2.2 - 'blockquote' => array_merge( $common, array( 'cite' ) ), - # q + # 9.2.2 + 'blockquote' => array_merge( $common, array( 'cite' ) ), + # q - # 9.2.3 - 'sub' => $common, - 'sup' => $common, + # 9.2.3 + 'sub' => $common, + 'sup' => $common, - # 9.3.1 - 'p' => $block, + # 9.3.1 + 'p' => $block, - # 9.3.2 - 'br' => array( 'id', 'class', 'title', 'style', 'clear' ), + # 9.3.2 + 'br' => array( 'id', 'class', 'title', 'style', 'clear' ), - # 9.3.4 - 'pre' => array_merge( $common, array( 'width' ) ), + # 9.3.4 + 'pre' => array_merge( $common, array( 'width' ) ), - # 9.4 - 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ), - 'del' => array_merge( $common, array( 'cite', 'datetime' ) ), + # 9.4 + 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ), + 'del' => array_merge( $common, array( 'cite', 'datetime' ) ), - # 10.2 - 'ul' => array_merge( $common, array( 'type' ) ), - 'ol' => array_merge( $common, array( 'type', 'start' ) ), - 'li' => array_merge( $common, array( 'type', 'value' ) ), + # 10.2 + 'ul' => array_merge( $common, array( 'type' ) ), + 'ol' => array_merge( $common, array( 'type', 'start' ) ), + 'li' => array_merge( $common, array( 'type', 'value' ) ), - # 10.3 - 'dl' => $common, - 'dd' => $common, - 'dt' => $common, + # 10.3 + 'dl' => $common, + 'dd' => $common, + 'dt' => $common, - # 11.2.1 - 'table' => array_merge( $common, - array( 'summary', 'width', 'border', 'frame', - 'rules', 'cellspacing', 'cellpadding', - 'align', 'bgcolor', - ) ), + # 11.2.1 + 'table' => array_merge( $common, + array( 'summary', 'width', 'border', 'frame', + 'rules', 'cellspacing', 'cellpadding', + 'align', 'bgcolor', + ) ), - # 11.2.2 - 'caption' => array_merge( $common, array( 'align' ) ), + # 11.2.2 + 'caption' => array_merge( $common, array( 'align' ) ), - # 11.2.3 - 'thead' => array_merge( $common, $tablealign ), - 'tfoot' => array_merge( $common, $tablealign ), - 'tbody' => array_merge( $common, $tablealign ), + # 11.2.3 + 'thead' => array_merge( $common, $tablealign ), + 'tfoot' => array_merge( $common, $tablealign ), + 'tbody' => array_merge( $common, $tablealign ), - # 11.2.4 - 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ), - 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ), + # 11.2.4 + 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ), + 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ), - # 11.2.5 - 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ), + # 11.2.5 + 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ), - # 11.2.6 - 'td' => array_merge( $common, $tablecell, $tablealign ), - 'th' => array_merge( $common, $tablecell, $tablealign ), - - # 12.2 - # added by dan - 'a' => array_merge( $common, array( 'href', 'name' ) ), - - # 13.2 - # added by dan - 'img' => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ), + # 11.2.6 + 'td' => array_merge( $common, $tablecell, $tablealign ), + 'th' => array_merge( $common, $tablecell, $tablealign ), + + # 12.2 + # added by dan + 'a' => array_merge( $common, array( 'href', 'name' ) ), + + # 13.2 + # added by dan + 'img' => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ), - # 15.2.1 - 'tt' => $common, - 'b' => $common, - 'i' => $common, - 'big' => $common, - 'small' => $common, - 'strike' => $common, - 's' => $common, - 'u' => $common, + # 15.2.1 + 'tt' => $common, + 'b' => $common, + 'i' => $common, + 'big' => $common, + 'small' => $common, + 'strike' => $common, + 's' => $common, + 'u' => $common, - # 15.2.2 - 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ), - # basefont + # 15.2.2 + 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ), + # basefont - # 15.3 - 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ), + # 15.3 + 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ), - # XHTML Ruby annotation text module, simple ruby only. - # http://www.w3c.org/TR/ruby/ - 'ruby' => $common, - # rbc - # rtc - 'rb' => $common, - 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ), - 'rp' => $common, - - # For compatibility with the XHTML parser. - 'nowiki' => array(), - 'noinclude' => array(), - 'nodisplay' => array(), - 'lang' => array('code'), - - # XHTML stuff - 'acronym' => $common - ); - - // custom tags can be added by plugins - $code = $plugins->setHook('html_attribute_whitelist'); - foreach ( $code as $cmd ) - { - eval($cmd); - } - - return $whitelist; + # XHTML Ruby annotation text module, simple ruby only. + # http://www.w3c.org/TR/ruby/ + 'ruby' => $common, + # rbc + # rtc + 'rb' => $common, + 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ), + 'rp' => $common, + + # For compatibility with the XHTML parser. + 'nowiki' => array(), + 'noinclude' => array(), + 'nodisplay' => array(), + 'lang' => array('code'), + + # XHTML stuff + 'acronym' => $common + ); + + // custom tags can be added by plugins + $code = $plugins->setHook('html_attribute_whitelist'); + foreach ( $code as $cmd ) + { + eval($cmd); + } + + return $whitelist; } /** @@ -452,14 +452,14 @@ * @return string */ function escapeId( $id ) { - static $replace = array( - '%3A' => ':', - '%' => '.' - ); + static $replace = array( + '%3A' => ':', + '%' => '.' + ); - $id = urlencode( decodeCharReferences( strtr( $id, ' ', '_' ) ) ); + $id = urlencode( decodeCharReferences( strtr( $id, ' ', '_' ) ) ); - return str_replace( array_keys( $replace ), array_values( $replace ), $id ); + return str_replace( array_keys( $replace ), array_values( $replace ), $id ); } /** @@ -470,32 +470,32 @@ * @return array */ function wfExplodeMarkup( $separator, $text ) { - $placeholder = "\x00"; - - // Just in case... - $text = str_replace( $placeholder, '', $text ); - - // Trim stuff - $replacer = new ReplacerCallback( $separator, $placeholder ); - $cleaned = preg_replace_callback( '/(<.*?>)/', array( $replacer, 'go' ), $text ); - - $items = explode( $separator, $cleaned ); - foreach( $items as $i => $str ) { - $items[$i] = str_replace( $placeholder, $separator, $str ); - } - - return $items; + $placeholder = "\x00"; + + // Just in case... + $text = str_replace( $placeholder, '', $text ); + + // Trim stuff + $replacer = new ReplacerCallback( $separator, $placeholder ); + $cleaned = preg_replace_callback( '/(<.*?>)/', array( $replacer, 'go' ), $text ); + + $items = explode( $separator, $cleaned ); + foreach( $items as $i => $str ) { + $items[$i] = str_replace( $placeholder, $separator, $str ); + } + + return $items; } class ReplacerCallback { - function ReplacerCallback( $from, $to ) { - $this->from = $from; - $this->to = $to; - } - - function go( $matches ) { - return str_replace( $this->from, $this->to, $matches[1] ); - } + function ReplacerCallback( $from, $to ) { + $this->from = $from; + $this->to = $to; + } + + function go( $matches ) { + return str_replace( $this->from, $this->to, $matches[1] ); + } } /** @@ -507,33 +507,33 @@ * @return array */ function decodeTagAttributes( $text ) { - $attribs = array(); + $attribs = array(); - if( trim( $text ) == '' ) { - return $attribs; - } + if( trim( $text ) == '' ) { + return $attribs; + } - $pairs = array(); - if( !preg_match_all( - MW_ATTRIBS_REGEX, - $text, - $pairs, - PREG_SET_ORDER ) ) { - return $attribs; - } + $pairs = array(); + if( !preg_match_all( + MW_ATTRIBS_REGEX, + $text, + $pairs, + PREG_SET_ORDER ) ) { + return $attribs; + } - foreach( $pairs as $set ) { - $attribute = strtolower( $set[1] ); - $value = getTagAttributeCallback( $set ); - - // Normalize whitespace - $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); - $value = trim( $value ); - - // Decode character references - $attribs[$attribute] = decodeCharReferences( $value ); - } - return $attribs; + foreach( $pairs as $set ) { + $attribute = strtolower( $set[1] ); + $value = getTagAttributeCallback( $set ); + + // Normalize whitespace + $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); + $value = trim( $value ); + + // Decode character references + $attribs[$attribute] = decodeCharReferences( $value ); + } + return $attribs; } /** @@ -545,25 +545,25 @@ * @access private */ function getTagAttributeCallback( $set ) { - if( isset( $set[6] ) ) { - # Illegal #XXXXXX color with no quotes. - return $set[6]; - } elseif( isset( $set[5] ) ) { - # No quotes. - return $set[5]; - } elseif( isset( $set[4] ) ) { - # Single-quoted - return $set[4]; - } elseif( isset( $set[3] ) ) { - # Double-quoted - return $set[3]; - } elseif( !isset( $set[2] ) ) { - # In XHTML, attributes must have a value. - # For 'reduced' form, return explicitly the attribute name here. - return $set[1]; - } else { - die_friendly('Parser error', "

Tag conditions not met. This should never happen and is a bug.

" ); - } + if( isset( $set[6] ) ) { + # Illegal #XXXXXX color with no quotes. + return $set[6]; + } elseif( isset( $set[5] ) ) { + # No quotes. + return $set[5]; + } elseif( isset( $set[4] ) ) { + # Single-quoted + return $set[4]; + } elseif( isset( $set[3] ) ) { + # Double-quoted + return $set[3]; + } elseif( !isset( $set[2] ) ) { + # In XHTML, attributes must have a value. + # For 'reduced' form, return explicitly the attribute name here. + return $set[1]; + } else { + die_friendly('Parser error', "

Tag conditions not met. This should never happen and is a bug.

" ); + } } /** @@ -583,78 +583,78 @@ * @access private */ function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) { - global $wgRandomKey; - $render = true; + global $wgRandomKey; + $render = true; - $wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff)); - $uniq_prefix =& $wgRandomKey; - $commentState = array(); - - $elements = array( 'nowiki', 'gallery' ); - - # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700) - foreach ( $elements AS $k => $v ) { - if ( !in_array ( $v , $dontstrip ) ) continue; - unset ( $elements[$k] ); - } - - $matches = array(); - $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix ); + $wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff)); + $uniq_prefix =& $wgRandomKey; + $commentState = array(); + + $elements = array( 'nowiki', 'gallery' ); + + # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700) + foreach ( $elements AS $k => $v ) { + if ( !in_array ( $v , $dontstrip ) ) continue; + unset ( $elements[$k] ); + } + + $matches = array(); + $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix ); - foreach( $matches as $marker => $data ) { - list( $element, $content, $params, $tag ) = $data; - if( $render ) { - $tagName = strtolower( $element ); - switch( $tagName ) { - case '!--': - // Comment - if( substr( $tag, -3 ) == '-->' ) { - $output = $tag; - } else { - // Unclosed comment in input. - // Close it so later stripping can remove it - $output = "$tag-->"; - } - break; - case 'html': - if( $wgRawHtml ) { - $output = $content; - break; - } - // Shouldn't happen otherwise. :) - case 'nowiki': - $output = wfEscapeHTMLTagsOnly( $content ); - break; - default: - } - } else { - // Just stripping tags; keep the source - $output = $tag; - } + foreach( $matches as $marker => $data ) { + list( $element, $content, $params, $tag ) = $data; + if( $render ) { + $tagName = strtolower( $element ); + switch( $tagName ) { + case '!--': + // Comment + if( substr( $tag, -3 ) == '-->' ) { + $output = $tag; + } else { + // Unclosed comment in input. + // Close it so later stripping can remove it + $output = "$tag-->"; + } + break; + case 'html': + if( $wgRawHtml ) { + $output = $content; + break; + } + // Shouldn't happen otherwise. :) + case 'nowiki': + $output = wfEscapeHTMLTagsOnly( $content ); + break; + default: + } + } else { + // Just stripping tags; keep the source + $output = $tag; + } - // Unstrip the output, because unstrip() is no longer recursive so - // it won't do it itself - $output = unstrip( $output, $state ); + // Unstrip the output, because unstrip() is no longer recursive so + // it won't do it itself + $output = unstrip( $output, $state ); - if( !$stripcomments && $element == '!--' ) { - $commentState[$marker] = $output; - } elseif ( $element == 'html' || $element == 'nowiki' ) { - $state['nowiki'][$marker] = $output; - } else { - $state['general'][$marker] = $output; - } - } + if( !$stripcomments && $element == '!--' ) { + $commentState[$marker] = $output; + } elseif ( $element == 'html' || $element == 'nowiki' ) { + $state['nowiki'][$marker] = $output; + } else { + $state['general'][$marker] = $output; + } + } - # Unstrip comments unless explicitly told otherwise. - # (The comments are always stripped prior to this point, so as to - # not invoke any extension tags / parser hooks contained within - # a comment.) - if ( !$stripcomments ) { - // Put them all back and forget them - $text = strtr( $text, $commentState ); - } + # Unstrip comments unless explicitly told otherwise. + # (The comments are always stripped prior to this point, so as to + # not invoke any extension tags / parser hooks contained within + # a comment.) + if ( !$stripcomments ) { + // Put them all back and forget them + $text = strtr( $text, $commentState ); + } - return $text; + return $text; } /** @@ -676,65 +676,65 @@ * @static */ function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){ - static $n = 1; - $stripped = ''; - $matches = array(); + static $n = 1; + $stripped = ''; + $matches = array(); - $taglist = implode( '|', $elements ); - $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i"; + $taglist = implode( '|', $elements ); + $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i"; - while ( '' != $text ) { - $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE ); - $stripped .= $p[0]; - if( count( $p ) < 5 ) { - break; - } - if( count( $p ) > 5 ) { - // comment - $element = $p[4]; - $attributes = ''; - $close = ''; - $inside = $p[5]; - } else { - // tag - $element = $p[1]; - $attributes = $p[2]; - $close = $p[3]; - $inside = $p[4]; - } + while ( '' != $text ) { + $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE ); + $stripped .= $p[0]; + if( count( $p ) < 5 ) { + break; + } + if( count( $p ) > 5 ) { + // comment + $element = $p[4]; + $attributes = ''; + $close = ''; + $inside = $p[5]; + } else { + // tag + $element = $p[1]; + $attributes = $p[2]; + $close = $p[3]; + $inside = $p[4]; + } - $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU'; - $stripped .= $marker; + $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU'; + $stripped .= $marker; - if ( $close === '/>' ) { - // Empty element tag, - $content = null; - $text = $inside; - $tail = null; - } else { - if( $element == '!--' ) { - $end = '/(-->)/'; - } else { - $end = "/(<\\/$element\\s*>)/i"; - } - $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE ); - $content = $q[0]; - if( count( $q ) < 3 ) { - # No end tag -- let it run out to the end of the text. - $tail = ''; - $text = ''; - } else { - $tail = $q[1]; - $text = $q[2]; - } - } - - $matches[$marker] = array( $element, - $content, - decodeTagAttributes( $attributes ), - "<$element$attributes$close$content$tail" ); - } - return $stripped; + if ( $close === '/>' ) { + // Empty element tag, + $content = null; + $text = $inside; + $tail = null; + } else { + if( $element == '!--' ) { + $end = '/(-->)/'; + } else { + $end = "/(<\\/$element\\s*>)/i"; + } + $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE ); + $content = $q[0]; + if( count( $q ) < 3 ) { + # No end tag -- let it run out to the end of the text. + $tail = ''; + $text = ''; + } else { + $tail = $q[1]; + $text = $q[2]; + } + } + + $matches[$marker] = array( $element, + $content, + decodeTagAttributes( $attributes ), + "<$element$attributes$close$content$tail" ); + } + return $stripped; } /** @@ -745,10 +745,10 @@ * @return string Escaped string */ function wfEscapeHTMLTagsOnly( $in ) { - return str_replace( - array( '"', '>', '<' ), - array( '"', '>', '<' ), - $in ); + return str_replace( + array( '"', '>', '<' ), + array( '"', '>', '<' ), + $in ); } /** @@ -758,14 +758,14 @@ * @private */ function unstrip( $text, &$state ) { - if ( !isset( $state['general'] ) ) { - return $text; - } + if ( !isset( $state['general'] ) ) { + return $text; + } - # TODO: good candidate for FSS - $text = strtr( $text, $state['general'] ); - - return $text; + # TODO: good candidate for FSS + $text = strtr( $text, $state['general'] ); + + return $text; } /** @@ -776,11 +776,11 @@ * @private */ function decodeChar( $codepoint ) { - if( validateCodepoint( $codepoint ) ) { - return codepointToUtf8( $codepoint ); - } else { - return UTF8_REPLACEMENT; - } + if( validateCodepoint( $codepoint ) ) { + return codepointToUtf8( $codepoint ); + } else { + return UTF8_REPLACEMENT; + } } /** @@ -792,12 +792,12 @@ * @return string */ function decodeEntity( $name ) { - global $wgHtmlEntities; - if( isset( $wgHtmlEntities[$name] ) ) { - return codepointToUtf8( $wgHtmlEntities[$name] ); - } else { - return "&$name;"; - } + global $wgHtmlEntities; + if( isset( $wgHtmlEntities[$name] ) ) { + return codepointToUtf8( $wgHtmlEntities[$name] ); + } else { + return "&$name;"; + } } /** @@ -806,14 +806,14 @@ * @return bool */ function validateCodepoint( $codepoint ) { - return ($codepoint == 0x09) - || ($codepoint == 0x0a) - || ($codepoint == 0x0d) - || ($codepoint >= 0x20 && $codepoint <= 0xd7ff) - || ($codepoint >= 0xe000 && $codepoint <= 0xfffd) - || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff); + return ($codepoint == 0x09) + || ($codepoint == 0x0a) + || ($codepoint == 0x0d) + || ($codepoint >= 0x20 && $codepoint <= 0xd7ff) + || ($codepoint >= 0xe000 && $codepoint <= 0xfffd) + || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff); } - + /** * Return UTF-8 sequence for a given Unicode code point. * May die if fed out of range data. @@ -843,16 +843,16 @@ * @return string */ function decodeCharReferencesCallback( $matches ) { - if( $matches[1] != '' ) { - return decodeEntity( $matches[1] ); - } elseif( $matches[2] != '' ) { - return decodeChar( intval( $matches[2] ) ); - } elseif( $matches[3] != '' ) { - return decodeChar( hexdec( $matches[3] ) ); - } elseif( $matches[4] != '' ) { - return decodeChar( hexdec( $matches[4] ) ); - } - # Last case should be an ampersand by itself - return $matches[0]; + if( $matches[1] != '' ) { + return decodeEntity( $matches[1] ); + } elseif( $matches[2] != '' ) { + return decodeChar( intval( $matches[2] ) ); + } elseif( $matches[3] != '' ) { + return decodeChar( hexdec( $matches[3] ) ); + } elseif( $matches[4] != '' ) { + return decodeChar( hexdec( $matches[4] ) ); + } + # Last case should be an ampersand by itself + return $matches[0]; }