includes/wikiengine/TagSanitizer.php
changeset 1027 98c052fc3337
child 1081 745200a9cc2a
equal deleted inserted replaced
1026:f0431eb8161e 1027:98c052fc3337
       
     1 <?php
       
     2 
       
     3 /*
       
     4  * Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
       
     5  * Version 1.1.6 (Caoineag beta 1)
       
     6  * Copyright (C) 2006-2008 Dan Fuhry
       
     7  *
       
     8  * This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
       
     9  * as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
       
    10  *
       
    11  * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
       
    12  * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
       
    13  *
       
    14  * This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under
       
    15  * the GPLv2 or later; see the file GPL included with this package for details.
       
    16  */
       
    17 
       
    // Pattern fragments interpolated into MW_ATTRIBS_REGEX below:
    // one attribute-name character, and one whitespace character
    // (tab, line feed, carriage return or space).
    $attrib = '[a-zA-Z0-9]';
    $space = '[\x09\x0a\x0d\x20]';

    // Character references, written in extended (/x) mode.
    // Capture groups: 1 = named entity, 2 = decimal reference,
    // 3 = hex reference ("x"), 4 = hex reference ("X"), 5 = a bare ampersand.
    define(
      'MW_CHAR_REFS_REGEX',
      '/&([A-Za-z0-9]+);
 |&\#([0-9]+);
 |&\#x([0-9A-Za-z]+);
 |&\#X([0-9A-Za-z]+);
 |(&)/x'
    );

    // One attribute inside a tag, written in extended (/x) mode.
    // Capture groups: 1 = name, 2 = the whole "= value" part,
    // 3 = double-quoted value, 4 = single-quoted value,
    // 5 = unquoted value, 6 = unquoted #hex colour.
    // The lines inside the literal are part of the pattern and are kept
    // exactly as they were, including their leading whitespace.
    define(
      'MW_ATTRIBS_REGEX',
      "/(?:^|$space)($attrib+)
    ($space*=$space*
    (?:
     # The attribute value: quoted or alone
      \"([^<\"]*)\"
     | '([^<']*)'
     |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
     |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
               # colors are specified like this.
               # We'll be normalizing it.
    )
     )?(?=$space|\$)/sx"
    );
       
    41 
       
    /**
     * Turn the raw attribute portion of an HTML tag into normalized,
     * whitelisted attribute markup.
     *
     * The text is parsed by decodeTagAttributes(), filtered for the given
     * element by validateTagAttributes(), and each surviving value is encoded
     * with safeEncodeAttribute() and wrapped in double quotes. The attribute
     * name itself is passed through htmlspecialchars().
     *
     * @param string $text    Attribute source taken from inside a tag
     * @param string $element Name of the element the attributes belong to
     * @return string Every attribute preceded by one space, or '' when the
     *                input is blank or no attribute survives
     */
    function fixTagAttributes( $text, $element ) {
      // Nothing but whitespace: no attributes at all.
      if( trim( $text ) === '' ) {
        return '';
      }

      $accepted = validateTagAttributes( decodeTagAttributes( $text ), $element );

      // Each attribute carries its own leading space, so the result is either
      // empty or already starts with the separator callers expect.
      $html = '';
      foreach( $accepted as $name => $val ) {
        $html .= ' ' . htmlspecialchars( $name ) . '="' . safeEncodeAttribute( $val ) . '"';
      }
      return $html;
    }
       
    78 
       
    /**
     * Encode an attribute value with encodeAttribute(), then additionally
     * replace character sequences that later wiki processing could act on
     * (braces, brackets, pipes, doubled apostrophes or underscores, and the
     * ISBN / RFC / PMID keywords) with character references.
     *
     * @param string $text
     * @return string HTML-encoded text fragment
     */
    function safeEncodeAttribute( $text ) {
      # Templates and links may be expanded in later parsing,
      # creating invalid or dangerous output. Suppress this.
      $armor = array(
        // The first three should already have been escaped by
        // encodeAttribute(); seeing them here means invalid input.
        '<'    => '&lt;',
        '>'    => '&gt;',
        '"'    => '&quot;',
        '{'    => '&#123;',
        '['    => '&#91;',
        "''"   => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC'  => '&#82;FC',
        'PMID' => '&#80;MID',
        '|'    => '&#124;',
        '__'   => '&#95;_',
      );

      return strtr( encodeAttribute( $text ), $armor );
    }
       
   106 
       
   /**
    * Encode an attribute value for HTML output.
    *
    * The value is first decoded (the five entities htmlspecialchars() would
    * produce are turned back into plain characters) and only then encoded,
    * so input that is already escaped, such as "&amp;", is not escaped a
    * second time. Tabs, line feeds and carriage returns are emitted as
    * numeric character references.
    *
    * @param string $text
    * @return string HTML-encoded text fragment
    */
   function encodeAttribute( $text ) {
     // Undo any existing escaping first. Added in Enano 1.0.3 to keep
     // ampersands from being double-sanitized (thanks to markybob from #deluge).
     $decoded = strtr( $text, array(
       '&amp;'  => '&',
       '&quot;' => '"',
       '&lt;'   => '<',
       '&gt;'   => '>',
       '&#039;' => "'"
     ) );

     // htmlspecialchars() the "manual" way, applied to the DECODED text.
     // Previously this step read the raw input again, discarding the decode
     // pass above and double-escaping pre-escaped entities.
     //
     // Whitespace is normalized during attribute decoding, so tabs and line
     // breaks that reach this point are encoded here or they would not be
     // preserved. strtr() is single-pass, so handling them in the same map
     // gives the same result as a separate pass.
     return strtr( $decoded, array(
       '&'  => '&amp;',
       '"'  => '&quot;',
       '<'  => '&lt;',
       '>'  => '&gt;',
       "'"  => '&#039;',
       "\n" => '&#10;',
       "\r" => '&#13;',
       "\t" => '&#9;',
     ) );
   }
       
   146 
       
   /**
    * Restore stripped content in a piece of text using the global strip
    * state: unstrip() runs first, then unstripNoWiki() on its result.
    *
    * @param string $text
    * @return string
    */
   function unstripForHTML( $text ) {
     global $mStripState;
     // Order matters: unstripNoWiki() must always run after unstrip().
     return unstripNoWiki( unstrip( $text, $mStripState ), $mStripState );
   }
       
   153 
       
   /**
    * Put back the content recorded under the 'nowiki' key of a strip state.
    * Always call this after unstrip() to preserve the order.
    *
    * @param string $text  Text containing markers
    * @param array  $state Strip state; its 'nowiki' entry maps marker => content
    * @return string The text with markers replaced, or unchanged when the
    *                state has no 'nowiki' entry
    * @private
    */
   function unstripNoWiki( $text, &$state ) {
     # TODO: good candidate for FSS
     return isset( $state['nowiki'] )
       ? strtr( $text, $state['nowiki'] )
       : $text;
   }
       
   169 
       
   /**
    * Filter a name => value attribute array for one element type.
    *
    * - Attributes missing from attributeWhitelist( $element ) are dropped
    * - 'style' values go through checkCss(); the attribute is dropped when
    *   checkCss() returns false
    * - 'id' values are passed through escapeId()
    *
    * @param array $attribs  Attribute name => value
    * @param string $element Element name
    * @return array The attributes that were kept
    *
    * @todo Check for legal values where the DTD limits things.
    * @todo Check for unique id attribute :P
    */
   function validateTagAttributes( $attribs, $element ) {
     // Flipped so that membership is a single isset() on the name.
     $allowed = array_flip( attributeWhitelist( $element ) );
     $kept = array();

     foreach( $attribs as $name => $val ) {
       if( isset( $allowed[$name] ) ) {
         # Strip javascript "expression" from stylesheets.
         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
         if( $name == 'style' ) {
           $val = checkCss( $val );
           if( $val === false ) {
             # haxx0r
             continue;
           }
         }

         if( $name === 'id' ) {
           $val = escapeId( $val );
         }

         // Keyed by name, so a repeated attribute overrides the earlier
         // one and the output holds each name only once.
         $kept[$name] = $val;
       }
     }

     return $kept;
   }
       
   210 
       
   /**
    * Pick apart some CSS and check it for forbidden or unsafe structures.
    * Returns a sanitized string, or false if it was just too evil.
    *
    * Character references are decoded and CSS comments removed; that text is
    * what gets returned. For the check itself, CSS backslash escapes are
    * also decoded and remaining backslashes removed. The value is rejected
    * when the result contains "expression", "tp" followed by any number of
    * "s" and "://" (e.g. http://, https://), or "url(".
    *
    * A PCRE failure during any step is treated as a rejection, so a value
    * that cannot be inspected is never passed through.
    *
    * @param string $value
    * @return mixed Sanitized CSS string, or false when the value is rejected
    */
   function checkCss( $value ) {
     $stripped = decodeCharReferences( $value );

     // Remove any comments; IE gets token splitting wrong.
     // The "s" modifier lets a comment span line breaks: decoding character
     // references above can introduce newlines, and a comment that survives
     // could hide a forbidden keyword by splitting it.
     $stripped = preg_replace( '!/\\*.*?\\*/!sS', '', $stripped );
     if( $stripped === null ) {
       return false;
     }
     $value = $stripped;

     // ... and continue checks.
     // Decode CSS hex escapes through a callback. The old /e (eval) modifier
     // no longer exists in PHP 7+, where preg_replace() would return NULL
     // and the check below would let every value through.
     $stripped = preg_replace_callback( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
       'checkCssEscapeCallback', $stripped );
     if( $stripped === null ) {
       return false;
     }
     $stripped = str_replace( '\\', '', $stripped );

     // preg_match() gives 0 for "no match"; 1 (match) and false (error)
     // both lead to rejection.
     if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
         $stripped ) !== 0 ) {
       # haxx0r
       return false;
     }

     return $value;
   }

   /**
    * preg_replace_callback() handler for checkCss(): turn one CSS hex
    * escape (group 1 holds the hex digits) into its UTF-8 character.
    *
    * @param array $matches
    * @return string
    * @private
    */
   function checkCssEscapeCallback( $matches ) {
     return codepointToUtf8( hexdec( $matches[1] ) );
   }
       
   239 
       
   /**
    * Replace every match of MW_CHAR_REFS_REGEX in the text with whatever
    * decodeCharReferencesCallback() returns for it, decoding numeric and
    * named character references.
    *
    * @param string $text
    * @return string
    * @access public
    * @static
    */
   function decodeCharReferences( $text ) {
     $handler = 'decodeCharReferencesCallback';
     return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
   }
       
   255 
       
   /**
    * Look up the attribute names allowed on one element.
    *
    * The full table comes from setupAttributeWhitelist() and is kept in a
    * static variable, so it is built on the first call only.
    *
    * @param string $element
    * @return array Allowed attribute names; empty for an unknown element
    */
   function attributeWhitelist( $element ) {
     static $table = null;

     if( $table === null ) {
       $table = setupAttributeWhitelist();
     }

     if( !isset( $table[$element] ) ) {
       return array();
     }
     return $table[$element];
   }
       
   272 
       
   /**
    * Build the table of element name => allowed attribute names.
    *
    * The table is assembled from a few shared attribute groups, then every
    * code string returned by the 'html_attribute_whitelist' plugin hook is
    * run with eval() inside this function, so hook code can read and modify
    * the local variables (notably $whitelist) before the table is returned.
    *
    * @return array
    */
   function setupAttributeWhitelist() {
     // NOTE(review): only $plugins is referenced directly; the other globals
     // are presumably declared so eval()'d hook code can reach them - confirm.
     global $db, $session, $paths, $template, $plugins;

     // The names $common, $block, $tablealign, $tablecell and $whitelist are
     // visible to the eval()'d hook code below, so they must not be renamed.
     $common     = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
     $block      = array_merge( $common, array( 'align' ) );
     $tablealign = array( 'align', 'char', 'charoff', 'valign' );
     // nowrap, width, height and bgcolor are deprecated
     $tablecell  = array( 'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
                          'nowrap', 'width', 'height', 'bgcolor' );

     # Numbers refer to sections in HTML 4.01 standard describing the element.
     # See: http://www.w3.org/TR/html4/
     $whitelist = array();

     # 7.5.4
     $whitelist['div']    = $block;
     $whitelist['center'] = $common; # deprecated
     $whitelist['span']   = $block;  # ??

     # 7.5.5
     foreach( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $tagName ) {
       $whitelist[$tagName] = $block;
     }

     # 7.5.6 (address) and 8.2.4 (bdo) are not whitelisted

     # 9.2.1 -- dfn, samp, kbd and abbr are not whitelisted
     foreach( array( 'em', 'strong', 'cite', 'code', 'var' ) as $tagName ) {
       $whitelist[$tagName] = $common;
     }

     # 9.2.2 -- q is not whitelisted
     $whitelist['blockquote'] = array_merge( $common, array( 'cite' ) );

     # 9.2.3
     $whitelist['sub'] = $common;
     $whitelist['sup'] = $common;

     # 9.3.1
     $whitelist['p'] = $block;

     # 9.3.2
     $whitelist['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

     # 9.3.4
     $whitelist['pre'] = array_merge( $common, array( 'width' ) );

     # 9.4
     $whitelist['ins'] = array_merge( $common, array( 'cite', 'datetime' ) );
     $whitelist['del'] = array_merge( $common, array( 'cite', 'datetime' ) );

     # 10.2
     $whitelist['ul'] = array_merge( $common, array( 'type' ) );
     $whitelist['ol'] = array_merge( $common, array( 'type', 'start' ) );
     $whitelist['li'] = array_merge( $common, array( 'type', 'value' ) );

     # 10.3
     foreach( array( 'dl', 'dd', 'dt' ) as $tagName ) {
       $whitelist[$tagName] = $common;
     }

     # 11.2.1
     $whitelist['table'] = array_merge( $common, array(
       'summary', 'width', 'border', 'frame',
       'rules', 'cellspacing', 'cellpadding',
       'align', 'bgcolor',
     ) );

     # 11.2.2
     $whitelist['caption'] = array_merge( $common, array( 'align' ) );

     # 11.2.3
     foreach( array( 'thead', 'tfoot', 'tbody' ) as $tagName ) {
       $whitelist[$tagName] = array_merge( $common, $tablealign );
     }

     # 11.2.4
     foreach( array( 'colgroup', 'col' ) as $tagName ) {
       $whitelist[$tagName] = array_merge( $common, array( 'span', 'width' ), $tablealign );
     }

     # 11.2.5
     $whitelist['tr'] = array_merge( $common, array( 'bgcolor' ), $tablealign );

     # 11.2.6
     foreach( array( 'td', 'th' ) as $tagName ) {
       $whitelist[$tagName] = array_merge( $common, $tablecell, $tablealign );
     }

     # 12.2
     # added by dan
     $whitelist['a'] = array_merge( $common, array( 'href', 'name' ) );

     # 13.2
     # added by dan
     $whitelist['img'] = array_merge( $common, array( 'src', 'width', 'height', 'alt' ) );

     # 15.2.1
     foreach( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ) as $tagName ) {
       $whitelist[$tagName] = $common;
     }

     # 15.2.2 -- basefont is not whitelisted
     $whitelist['font'] = array_merge( $common, array( 'size', 'color', 'face' ) );

     # 15.3
     $whitelist['hr'] = array_merge( $common, array( 'noshade', 'size', 'width' ) );

     # XHTML Ruby annotation text module, simple ruby only.
     # http://www.w3c.org/TR/ruby/
     # rbc and rtc are not whitelisted; rt does not get 'rbspan'.
     foreach( array( 'ruby', 'rb', 'rt', 'rp' ) as $tagName ) {
       $whitelist[$tagName] = $common;
     }

     # For compatibility with the XHTML parser.
     $whitelist['nowiki']    = array();
     $whitelist['noinclude'] = array();
     $whitelist['nodisplay'] = array();
     $whitelist['lang']      = array('code');

     # XHTML stuff
     $whitelist['acronym'] = $common;

     // custom tags can be added by plugins
     // NOTE(review): eval() runs whatever code the hook returns with full
     // access to this scope; that code has to be trusted.
     $code = $plugins->setHook('html_attribute_whitelist');
     foreach ( $code as $cmd )
     {
       eval($cmd);
     }

     return $whitelist;
   }
       
   438 
       
   /**
    * Escape a value so that it can be used in an id attribute. This does
    * not validate the value (see the first link).
    *
    * Spaces become underscores, character references are decoded, and the
    * result is URL-encoded; then "%3A" is turned back into ":" and every
    * remaining "%" becomes ".".
    *
    * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
    *                                                          in the id and
    *                                                          name attributes
    * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
    *
    * @bug 4461
    *
    * @static
    *
    * @param string $id
    * @return string
    */
   function escapeId( $id ) {
     $underscored = strtr( $id, ' ', '_' );
     $encoded = urlencode( decodeCharReferences( $underscored ) );

     // Order matters: "%3A" has to be restored before the bare "%" rule runs.
     return str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );
   }
       
   465 
       
   /**
    * More or less "markup-safe" explode().
    *
    * Separators that occur inside <...> are swapped for a NUL placeholder
    * before the split and swapped back in each resulting piece, so they do
    * not cause a split.
    *
    * @param string $separator
    * @param string $text
    * @return array
    */
   function wfExplodeMarkup( $separator, $text ) {
     $placeholder = "\x00";

     // Just in case the text already contains the placeholder byte...
     $text = str_replace( $placeholder, '', $text );

     // Hide separators that sit inside tags
     $masker = new ReplacerCallback( $separator, $placeholder );
     $masked = preg_replace_callback( '/(<.*?>)/', array( $masker, 'go' ), $text );

     // Split, then reveal the hidden separators again
     $pieces = explode( $separator, $masked );
     foreach( $pieces as $idx => $piece ) {
       $pieces[$idx] = str_replace( $placeholder, $separator, $piece );
     }

     return $pieces;
   }
       
   490 
       
   /**
    * Small callback object for preg_replace_callback(): replaces one fixed
    * string with another inside the first capture group of each match.
    */
   class ReplacerCallback {
     /** @var string Text to search for */
     public $from;

     /** @var string Text to put in its place */
     public $to;

     /**
      * Uses __construct() rather than a method named after the class: the
      * latter is not treated as a constructor by PHP 8, which would leave
      * both properties unset.
      *
      * @param string $from
      * @param string $to
      */
     function __construct( $from, $to ) {
       $this->from = $from;
       $this->to = $to;
     }

     /**
      * @param array $matches Match array; index 1 is the text to process
      * @return string
      */
     function go( $matches ) {
       return str_replace( $this->from, $this->to, $matches[1] );
     }
   }
       
   501 
       
   /**
    * Return an associative array of attribute names and values from
    * a partial tag string. Attribute names are forced to lowercase,
    * whitespace runs in values are collapsed to single spaces and trimmed,
    * and character references in values are decoded.
    *
    * @param string $text
    * @return array Attribute name => value; empty when the text is blank
    *               or MW_ATTRIBS_REGEX finds nothing
    */
   function decodeTagAttributes( $text ) {
     if( trim( $text ) == '' ) {
       return array();
     }

     $found = array();
     $count = preg_match_all( MW_ATTRIBS_REGEX, $text, $found, PREG_SET_ORDER );
     if( !$count ) {
       return array();
     }

     $result = array();
     foreach( $found as $match ) {
       $name = strtolower( $match[1] );

       // Normalize whitespace
       $raw = getTagAttributeCallback( $match );
       $clean = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

       // Decode character references; a repeated name overwrites the
       // earlier value.
       $result[$name] = decodeCharReferences( $clean );
     }
     return $result;
   }
       
   539 
       
   /**
    * Pick the appropriate attribute value from a match set from the
    * MW_ATTRIBS_REGEX matches.
    *
    * The value groups are tried from the highest index down:
    * 6 = unquoted #XXXXXX colour, 5 = unquoted, 4 = single-quoted,
    * 3 = double-quoted. If none is present and there is no "= value" part
    * (group 2), the attribute name (group 1) is used as the value.
    *
    * @param array $set
    * @return string
    * @access private
    */
   function getTagAttributeCallback( $set ) {
     for( $group = 6; $group >= 3; $group-- ) {
       if( isset( $set[$group] ) ) {
         return $set[$group];
       }
     }

     if( !isset( $set[2] ) ) {
       # In XHTML, attributes must have a value.
       # For 'reduced' form, return explicitly the attribute name here.
       return $set[1];
     }

     die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
   }
       
   569 
       
   /**
    * Strips <nowiki>, <gallery> and HTML comments out of the text, replacing
    * each with a unique marker, and fills $state with the data needed by
    * unstrip() / unstripNoWiki() to put rendered content back.
    *
    * - nowiki content is escaped with wfEscapeHTMLTagsOnly() and stored
    *   under $state['nowiki']
    * - other elements are stored under $state['general']; an element with no
    *   renderer in the switch below (currently gallery) is stored as an
    *   empty string
    * - comments are put straight back into the text unless $stripcomments
    *   is set
    *
    * If $state already holds entries, new ones are added to it. A fresh
    * random marker prefix is generated and stored in the global
    * $wgRandomKey on every call.
    *
    * @param string $text
    * @param array $state strip state, modified in place
    * @param bool $stripcomments when set, HTML comments <!-- like this -->
    *  will be stripped in addition to other tags. This is important
    *  for section editing, where these comments cause confusion when
    *  counting the sections in the wikisource
    *
    * @param array $dontstrip contains tags which should not be stripped;
    *  used to prevent stripping of <gallery> when saving (fixes bug 2700)
    *
    * @return string the text with markers in place of the stripped parts
    * @access private
    */
   function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) {
     global $wgRandomKey;
     $render = true;

     // Raw HTML passthrough was never wired up here: the flag used to be
     // read as an undefined local. Keep it explicitly off.
     $wgRawHtml = false;

     $wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff));
     $uniq_prefix =& $wgRandomKey;
     $commentState = array();

     $elements = array( 'nowiki', 'gallery' );

     # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
     foreach ( $elements AS $k => $v ) {
       if ( !in_array ( $v , $dontstrip ) ) continue;
       unset ( $elements[$k] );
     }

     $matches = array();
     $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

     foreach( $matches as $marker => $data ) {
       list( $element, $content, $params, $tag ) = $data;

       // Start every iteration from a known value. Without this, an element
       // that hits the default case below would reuse the output of the
       // previous match (or an undefined variable on the first one).
       $output = '';

       if( $render ) {
         $tagName = strtolower( $element );
         switch( $tagName ) {
         case '!--':
           // Comment
           if( substr( $tag, -3 ) == '-->' ) {
             $output = $tag;
           } else {
             // Unclosed comment in input.
             // Close it so later stripping can remove it
             $output = "$tag-->";
           }
           break;
         case 'html':
           if( $wgRawHtml ) {
             $output = $content;
             break;
           }
           // Shouldn't happen otherwise. :)
           // Deliberate fall-through: treat like nowiki.
         case 'nowiki':
           $output = wfEscapeHTMLTagsOnly( $content );
           break;
         default:
           // No renderer for this element; $output stays empty.
         }
       } else {
         // Just stripping tags; keep the source
         $output = $tag;
       }

       // Unstrip the output, because unstrip() is no longer recursive so
       // it won't do it itself
       $output = unstrip( $output, $state );

       if( !$stripcomments && $element == '!--' ) {
         $commentState[$marker] = $output;
       } elseif ( $element == 'html' || $element == 'nowiki' ) {
         $state['nowiki'][$marker] = $output;
       } else {
         $state['general'][$marker] = $output;
       }
     }

     # Unstrip comments unless explicitly told otherwise.
     # (The comments are always stripped prior to this point, so as to
     # not invoke any extension tags / parser hooks contained within
     # a comment.)
     if ( !$stripcomments ) {
       // Put them all back and forget them
       $text = strtr( $text, $commentState );
     }

     return $text;
   }
       
   660 
       
   /**
    * Replaces all occurrences of HTML-style comments and the given tags
    * in the text with a unique marker and returns the new text. The output
    * parameter $matches will be an associative array filled with data in
    * the form:
    *   'UNIQ-xxxxx' => array(
    *     'element',
    *     'tag content',
    *     array( 'param' => 'x' ),
    *     '<element param="x">tag content</element>' ) )
    *
    * Markers have the form "<prefix>-<element>-<8 hex digits>-QINU"; the
    * number comes from a static counter that keeps increasing across calls.
    * An element with no closing tag runs to the end of the text.
    *
    * @param $elements list of element names. Comments are always extracted.
    * @param $text Source text string.
    * @param $matches Output array, reset on every call.
    * @param $uniq_prefix
    * @return string The text with markers substituted
    *
    * @access private
    * @static
    */
   function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
     static $n = 1;
     $matches = array();
     $stripped = '';

     $taglist = implode( '|', $elements );
     $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

     while ( '' != $text ) {
       // Split at the first opening tag or comment start. The number of
       // pieces tells the cases apart: fewer than 5 = nothing found,
       // exactly 5 = element tag, more than 5 = comment.
       $pieces = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
       $stripped .= $pieces[0];
       $pieceCount = count( $pieces );
       if( $pieceCount < 5 ) {
         break;
       }

       if( $pieceCount > 5 ) {
         // comment
         $element    = $pieces[4];
         $inside     = $pieces[5];
         $attributes = '';
         $close      = '';
       } else {
         // tag
         list( , $element, $attributes, $close, $inside ) = $pieces;
       }

       $marker = sprintf( '%s-%s-%08X-QINU', $uniq_prefix, $element, $n++ );
       $stripped .= $marker;

       // Defaults describe an empty element tag, <tag />
       $content = null;
       $tail = null;
       if ( $close === '/>' ) {
         $text = $inside;
       } else {
         $end = ( $element == '!--' ) ? '/(-->)/' : "/(<\\/$element\\s*>)/i";
         $rest = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
         $content = $rest[0];

         # No end tag -- let it run out to the end of the text.
         $hasEnd = count( $rest ) >= 3;
         $tail = $hasEnd ? $rest[1] : '';
         $text = $hasEnd ? $rest[2] : '';
       }

       $matches[$marker] = array(
         $element,
         $content,
         decodeTagAttributes( $attributes ),
         "<$element$attributes$close$content$tail"
       );
     }

     return $stripped;
   }
       
   740 
       
   /**
    * Escape HTML tags by replacing " > and < with the entities
    * &quot; &gt; and &lt;. Ampersands are left untouched.
    *
    * @param $in String: text that might contain HTML tags.
    * @return string Escaped string
    */
   function wfEscapeHTMLTagsOnly( $in ) {
     $entities = array(
       '"' => '&quot;',
       '>' => '&gt;',
       '<' => '&lt;',
     );

     return str_replace( array_keys( $entities ), array_values( $entities ), $in );
   }
       
   754 
       
   755 /**
       
   756  * Restores pre, math, and other extensions removed by strip()
       
   757  *
       
   758  * always call unstripNoWiki() after this one
       
   759  * @private
       
   760  */
       
   761 function unstrip( $text, &$state ) {
       
   762   if ( !isset( $state['general'] ) ) {
       
   763     return $text;
       
   764   }
       
   765 
       
   766   # TODO: good candidate for FSS
       
   767   $text = strtr( $text, $state['general'] );
       
   768   
       
   769   return $text;
       
   770 }
       
   771 
       
   772 /**
       
   773  * Return UTF-8 string for a codepoint if that is a valid
       
   774  * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
       
   775  * @param int $codepoint
       
   776  * @return string
       
   777  * @private
       
   778  */
       
   779 function decodeChar( $codepoint ) {
       
   780   if( validateCodepoint( $codepoint ) ) {
       
   781     return codepointToUtf8( $codepoint );
       
   782   } else {
       
   783     return UTF8_REPLACEMENT;
       
   784   }
       
   785 }
       
   786 
       
   787 /**
       
   788  * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
       
   789  * return the UTF-8 encoding of that character. Otherwise, returns
       
   790  * pseudo-entity source (eg &foo;)
       
   791  *
       
   792  * @param string $name
       
   793  * @return string
       
   794  */
       
   795 function decodeEntity( $name ) {
       
   796   global $wgHtmlEntities;
       
   797   if( isset( $wgHtmlEntities[$name] ) ) {
       
   798     return codepointToUtf8( $wgHtmlEntities[$name] );
       
   799   } else {
       
   800     return "&$name;";
       
   801   }
       
   802 }
       
   803 
       
   804 /**
       
   805  * Returns true if a given Unicode codepoint is a valid character in XML.
       
   806  * @param int $codepoint
       
   807  * @return bool
       
   808  */
       
   809 function validateCodepoint( $codepoint ) {
       
   810   return ($codepoint ==    0x09)
       
   811     || ($codepoint ==    0x0a)
       
   812     || ($codepoint ==    0x0d)
       
   813     || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
       
   814     || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
       
   815     || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
       
   816 }
       
   817   
       
   818 /**
       
   819  * Return UTF-8 sequence for a given Unicode code point.
       
   820  * May die if fed out of range data.
       
   821  *
       
   822  * @param $codepoint Integer:
       
   823  * @return String
       
   824  * @public
       
   825  */
       
   826 function codepointToUtf8( $codepoint ) {
       
   827 	if($codepoint <		0x80) return chr($codepoint);
       
   828 	if($codepoint <    0x800) return chr($codepoint >>	6 & 0x3f | 0xc0) .
       
   829 									 chr($codepoint		  & 0x3f | 0x80);
       
   830 	if($codepoint <  0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
       
   831 									 chr($codepoint >>	6 & 0x3f | 0x80) .
       
   832 									 chr($codepoint		  & 0x3f | 0x80);
       
   833 	if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
       
   834 									 chr($codepoint >> 12 & 0x3f | 0x80) .
       
   835 									 chr($codepoint >>	6 & 0x3f | 0x80) .
       
   836 									 chr($codepoint		  & 0x3f | 0x80);
       
   837 
       
   838 	echo "Asked for code outside of range ($codepoint)\n";
       
   839 	die( -1 );
       
   840 }
       
   841 
       
   842 /**
       
   843  * @param string $matches
       
   844  * @return string
       
   845  */
       
   846 function decodeCharReferencesCallback( $matches ) {
       
   847   if( $matches[1] != '' ) {
       
   848     return decodeEntity( $matches[1] );
       
   849   } elseif( $matches[2] != '' ) {
       
   850     return  decodeChar( intval( $matches[2] ) );
       
   851   } elseif( $matches[3] != ''  ) {
       
   852     return  decodeChar( hexdec( $matches[3] ) );
       
   853   } elseif( $matches[4] != '' ) {
       
   854     return  decodeChar( hexdec( $matches[4] ) );
       
   855   }
       
   856   # Last case should be an ampersand by itself
       
   857   return $matches[0];
       
   858 }
       
   859