<?php

/**
 * Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
 * Version 1.0.1 (Loch Ness)
 * Copyright (C) 2006-2007 Dan Fuhry
 *
 * This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
 *
 * This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under
 * the GPLv2; see the file GPL included with this package for details.
 *
 * We're using the MW parser because the Text_Wiki version simply refused to work under PHP 5.2.0. Porting this was
 * _not_ easy. <leaves to get cup of coffee>
 */

// Shared strip state consumed by unstripForHTML(); $wgRandomKey receives the
// marker prefix generated by mwStrip().
global $mStripState, $wgRandomKey;
$mStripState = array();

// Character classes interpolated into MW_ATTRIBS_REGEX below.
$attrib = '[a-zA-Z0-9]';
$space = '[\x09\x0a\x0d\x20]';

// Matches one character reference. Capture groups:
//   1 = named entity, 2 = decimal, 3 = hex after "x", 4 = hex after "X",
//   5 = a bare ampersand.
define( 'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9]+);
     |&\#([0-9]+);
     |&\#x([0-9A-Za-z]+);
     |&\#X([0-9A-Za-z]+);
     |(&)/x' );

// Matches one attribute of a tag-soup attribute list. Capture groups:
//   1 = name, 2 = the whole "=value" part, 3 = double-quoted value,
//   4 = single-quoted value, 5 = unquoted value, 6 = unquoted "#hex" value.
define( 'MW_ATTRIBS_REGEX',
    "/(?:^|$space)($attrib+)
      ($space*=$space*
        (?:
         # The attribute value: quoted or alone
          \"([^<\"]*)\"
         | '([^<']*)'
         |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
         |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                             # colors are specified like this.
                             # We'll be normalizing it.
        )
       )?(?=$space|\$)/sx" );

/**
 * Emulate the MediaWiki parser's table pass.
 *
 * Only doTableStuff() is run. The strip (mwStrip) and unstrip
 * (unstrip / unstripNoWiki) passes that the parent MediaWiki parser would
 * perform around it are not executed by this function.
 *
 * @param string $text the text to parse
 * @return string
 * @access public
 */
function process_tables( $text )
{
    // parse the text
    return doTableStuff( $text );
}

/**
 * Parse the wiki syntax used to render tables.
 *
 * The text is processed line by line. State for each open table is kept on
 * parallel stacks ($td, $ltd, $tr, $ltr, $has_opened_tr), one entry per
 * nesting level. Generated table tags are wrapped in <nowiki>...</nowiki>.
 * Lines outside of any table are left untouched.
 *
 * @param string $t the text to parse
 * @return string
 * @access private
 */
function doTableStuff( $t ) {
    $t = explode( "\n", $t );
    $td = array();            # Is currently a td tag open?
    $ltd = array();           # Was it TD or TH?
    $tr = array();            # Is currently a tr tag open?
    $ltr = array();           # tr attributes
    $has_opened_tr = array(); # Did this table open a <tr> element?
    $indent_level = 0;        # indent level of the table

    foreach ( $t as $k => $x ) {
        $x = trim( $x );
        $fc = substr( $x, 0, 1 );
        $first_two = substr( $x, 0, 2 );

        if ( preg_match( '/^(:*)\{\|(.*)$/', $x, $matches ) ) {
            # Table start: "{|", optionally indented with leading colons
            $indent_level = strlen( $matches[1] );

            $attributes = unstripForHTML( $matches[2] );

            $t[$k] = str_repeat( '<dl><dd>', $indent_level ) .
                '<nowiki><table' . fixTagAttributes( $attributes, 'table' ) . '></nowiki>';
            array_push( $td, false );
            array_push( $ltd, '' );
            array_push( $tr, false );
            array_push( $ltr, '' );
            array_push( $has_opened_tr, false );
        }
        else if ( count( $td ) == 0 ) {
            # Not inside a table: don't do any of the following
        }
        else if ( '|}' == $first_two ) {
            # Table end: close whatever is still open, innermost first
            $z = "<nowiki></table></nowiki>" . substr( $x, 2 );
            $l = array_pop( $ltd );
            if ( !array_pop( $has_opened_tr ) ) {
                # A table without any row gets one empty row
                $z = "<nowiki><tr><td></td></tr></nowiki>" . $z;
            }
            if ( array_pop( $tr ) ) {
                $z = '<nowiki></tr></nowiki>' . $z;
            }
            if ( array_pop( $td ) ) {
                $z = '<nowiki></' . $l . '></nowiki>' . $z;
            }
            array_pop( $ltr );
            $t[$k] = $z . str_repeat( '<nowiki></dd></dl></nowiki>', $indent_level );
        }
        else if ( '|-' == $first_two ) {
            # Row separator. Allows for |---------------
            # Drop the leading "|" and every following "-"; the rest is the
            # attribute text for the next <tr>.
            $x = ltrim( substr( $x, 1 ), '-' );
            $z = '';
            $l = array_pop( $ltd );
            array_pop( $has_opened_tr );
            array_push( $has_opened_tr, true );
            if ( array_pop( $tr ) ) {
                $z = '<nowiki></tr></nowiki>' . $z;
            }
            if ( array_pop( $td ) ) {
                $z = '<nowiki></' . $l . '></nowiki>' . $z;
            }
            array_pop( $ltr );
            $t[$k] = $z;
            array_push( $tr, false );
            array_push( $td, false );
            array_push( $ltd, '' );
            $attributes = unstripForHTML( $x );
            array_push( $ltr, fixTagAttributes( $attributes, 'tr' ) );
        }
        else if ( '|' == $fc || '!' == $fc || '|+' == $first_two ) {
            # $x is a table row: data cells ("|"), header cells ("!") or a
            # caption ("|+")
            if ( '|+' == $first_two ) {
                $fc = '+';
                $x = substr( $x, 1 );
            }
            $after = substr( $x, 1 );
            if ( $fc == '!' ) {
                $after = str_replace( '!!', '||', $after );
            }

            // Split up multiple cells on the same line.
            // FIXME: This can result in improper nesting of tags processed
            // by earlier parser steps, but should avoid splitting up eg
            // attribute values containing literal "||".
            $after = wfExplodeMarkup( '||', $after );

            $t[$k] = '';

            # Loop through each table cell
            foreach ( $after as $theline ) {
                $z = '';
                if ( $fc != '+' ) {
                    # Cells need an enclosing row; open one if none is open
                    $tra = array_pop( $ltr );
                    if ( !array_pop( $tr ) ) {
                        $z = '<nowiki><tr' . $tra . "></nowiki>\n";
                    }
                    array_push( $tr, true );
                    array_push( $ltr, '' );
                    array_pop( $has_opened_tr );
                    array_push( $has_opened_tr, true );
                }

                # Close the previous cell, if any, before opening this one
                $l = array_pop( $ltd );
                if ( array_pop( $td ) ) {
                    $z = '<nowiki></' . $l . '></nowiki>' . $z;
                }
                if ( $fc == '|' ) {
                    $l = 'td';
                } else if ( $fc == '!' ) {
                    $l = 'th';
                } else if ( $fc == '+' ) {
                    $l = 'caption';
                } else {
                    $l = '';
                }
                array_push( $ltd, $l );

                # Cell parameters
                $y = explode( '|', $theline, 2 );
                # Note that a '|' inside an invalid link should not
                # be mistaken as delimiting cell parameters
                if ( strpos( $y[0], '[[' ) !== false ) {
                    $y = array( $theline );
                }
                if ( count( $y ) == 1 ) {
                    $cell = "{$z}<nowiki><{$l}></nowiki>{$y[0]}";
                } else {
                    $attributes = unstripForHTML( $y[0] );
                    $cell = "{$z}<nowiki><{$l}" . fixTagAttributes( $attributes, $l ) . "></nowiki>{$y[1]}";
                }
                $t[$k] .= $cell;
                array_push( $td, true );
            }
        }
    }

    # Closing open td, tr && table
    while ( count( $td ) > 0 ) {
        $l = array_pop( $ltd );
        if ( array_pop( $td ) ) {
            # Close the element that is actually open (td, th or caption)
            $t[] = '<nowiki></' . $l . '></nowiki>';
        }
        if ( array_pop( $tr ) ) {
            $t[] = '<nowiki></tr></nowiki>';
        }
        if ( !array_pop( $has_opened_tr ) ) {
            $t[] = "<nowiki><tr><td></td></tr></nowiki>";
        }
        $t[] = '<nowiki></table></nowiki>';
    }

    $t = implode( "\n", $t );

    # special case: don't return empty table
    if ( $t == "<nowiki><table></nowiki>\n<nowiki><tr><td></td></tr></nowiki>\n<nowiki></table></nowiki>" ) {
        $t = '';
    }
    return $t;
}

/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 * Output is safe for further wikitext processing, with escaping of
 * values that could trigger problems.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text    raw attribute text
 * @param string $element element name used to select the whitelist
 * @return string '' when nothing survives, otherwise ' name="value" ...'
 */
function fixTagAttributes( $text, $element ) {
    if ( trim( $text ) == '' ) {
        return '';
    }

    $stripped = validateTagAttributes( decodeTagAttributes( $text ), $element );

    $attribs = array();
    foreach ( $stripped as $attribute => $value ) {
        // Array keys that look numeric come back as integers, hence the cast.
        $encAttribute = htmlspecialchars( (string) $attribute );
        $encValue = safeEncodeAttribute( $value );

        $attribs[] = $encAttribute . '="' . $encValue . '"';
    }
    return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
}

/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * The replacement targets are numeric character references, so the
 * browser shows the same text while later wikitext passes no longer see
 * the trigger sequences.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
function safeEncodeAttribute( $text ) {
    $encValue = encodeAttribute( $text );

    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $encValue = strtr( $encValue, array(
        '<'    => '&lt;',   // This should never happen,
        '>'    => '&gt;',   // we've received invalid input
        '"'    => '&quot;', // which should have been escaped.
        '{'    => '&#123;',
        '['    => '&#91;',
        "''"   => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC'  => '&#82;FC',
        'PMID' => '&#80;MID',
        '|'    => '&#124;',
        '__'   => '&#95;_',
    ) );

    return $encValue;
}

/**
 * Encode an attribute value for HTML output.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
function encodeAttribute( $text ) {
    $encValue = htmlspecialchars( $text );

    // Whitespace is normalized during attribute decoding,
    // so if we've been passed non-spaces we must encode them
    // ahead of time or they won't be preserved.
    $encValue = strtr( $encValue, array(
        "\n" => '&#10;',
        "\r" => '&#13;',
        "\t" => '&#9;',
    ) );

    return $encValue;
}

/**
 * Restore stripped content in $text using the global strip state.
 *
 * Runs unstrip() first and unstripNoWiki() second, both against the
 * global $mStripState.
 *
 * @param string $text
 * @return string
 */
function unstripForHTML( $text ) {
    global $mStripState;
    $text = unstrip( $text, $mStripState );
    $text = unstripNoWiki( $text, $mStripState );
    return $text;
}

/**
 * Replace markers with the content stored in $state['nowiki'].
 *
 * Always call this after unstrip() to preserve the order
 *
 * @param string $text
 * @param array  $state strip state; only the 'nowiki' map is read
 * @return string
 * @private
 */
function unstripNoWiki( $text, &$state ) {
    // Nothing to substitute when the map is missing or empty.
    if ( empty( $state['nowiki'] ) ) {
        return $text;
    }

    # TODO: good candidate for FSS
    return strtr( $text, $state['nowiki'] );
}

/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given element type.
 *
 * - Discards attributes not on a whitelist for the given element
 * - Unsafe style attributes are discarded
 * - id values are passed through escapeId()
 *
 * @param array $attribs  attribute name => value
 * @param string $element element name used to select the whitelist
 * @return array the attributes that survived, name => value
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element ) {
    // Flip so membership is an isset() lookup on the attribute name.
    $whitelist = array_flip( attributeWhitelist( $element ) );
    $out = array();
    foreach ( $attribs as $attribute => $value ) {
        if ( !isset( $whitelist[$attribute] ) ) {
            continue;
        }
        # Strip javascript "expression" from stylesheets.
        # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
        if ( $attribute === 'style' ) {
            $value = checkCss( $value );
            if ( $value === false ) {
                # haxx0r
                continue;
            }
        }

        if ( $attribute === 'id' ) {
            $value = escapeId( $value );
        }

        // If this attribute was previously set, override it.
        // Output should only have one attribute of each name.
        $out[$attribute] = $value;
    }
    return $out;
}

/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * Currently URL references, 'expression', 'tps' are forbidden.
 *
 * The returned string is the input with character references decoded and
 * CSS comments removed. Backslash escapes are only decoded on a scratch
 * copy that is used for the forbidden-pattern check.
 *
 * @param string $value
 * @return mixed sanitized string, or false when a forbidden construct is found
 */
function checkCss( $value ) {
    $stripped = decodeCharReferences( $value );

    // Remove any comments; IE gets token splitting wrong
    $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped );
    $value = $stripped;

    // ... and continue checks
    // Decode CSS hex escapes with a callback: the "e" (eval) pattern
    // modifier this code used to rely on no longer exists in PHP 7+.
    $stripped = preg_replace_callback(
        '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
        'checkCssDecodeEscape',
        $stripped );
    $stripped = str_replace( '\\', '', $stripped );
    if ( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', $stripped ) ) {
        # haxx0r
        return false;
    }

    return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): turns one CSS hex escape
 * into the UTF-8 character it denotes.
 *
 * @param array $matches $matches[1] holds 1-6 hex digits
 * @return string
 * @access private
 */
function checkCssDecodeEscape( $matches ) {
    $codepoint = hexdec( $matches[1] );
    if ( $codepoint > 0x10ffff ) {
        // codepointToUtf8() terminates the script for values past U+10FFFF,
        // which six hex digits can express. Drop such escapes instead so
        // untrusted input cannot abort the request.
        return '';
    }
    return codepointToUtf8( $codepoint );
}

/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * Each match of MW_CHAR_REFS_REGEX is handed to
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @access public
 * @static
 */
function decodeCharReferences( $text ) {
    return preg_replace_callback(
        MW_CHAR_REFS_REGEX,
        'decodeCharReferencesCallback',
        $text );
}

/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name.
 *
 * The full table is built once by setupAttributeWhitelist() and cached in
 * a static variable.
 *
 * @param string $element
 * @return array list of attribute names; empty for unknown elements
 */
function attributeWhitelist( $element ) {
    static $list;
    if ( !isset( $list ) ) {
        $list = setupAttributeWhitelist();
    }
    if ( isset( $list[$element] ) ) {
        return $list[$element];
    }
    return array();
}

/**
 * Build the table of allowed attributes per element.
 *
 * NOTE(review): 'a' allows href and 'img' allows src; nothing in this
 * function restricts the URL scheme of those values - confirm the callers
 * validate them.
 *
 * @return array element name => list of allowed attribute names
 */
function setupAttributeWhitelist() {
    $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
    $block = array_merge( $common, array( 'align' ) );
    $tablealign = array( 'align', 'char', 'charoff', 'valign' );
    $tablecell = array(
        'abbr',
        'axis',
        'headers',
        'scope',
        'rowspan',
        'colspan',
        'nowrap',  # deprecated
        'width',   # deprecated
        'height',  # deprecated
        'bgcolor', # deprecated
    );

    # Numbers refer to sections in HTML 4.01 standard describing the element.
    # See: http://www.w3.org/TR/html4/
    $whitelist = array(
        # 7.5.4
        'div'        => $block,
        'center'     => $common, # deprecated
        'span'       => $block,  # ??

        # 7.5.5
        'h1'         => $block,
        'h2'         => $block,
        'h3'         => $block,
        'h4'         => $block,
        'h5'         => $block,
        'h6'         => $block,

        # 7.5.6
        # address

        # 8.2.4
        # bdo

        # 9.2.1
        'em'         => $common,
        'strong'     => $common,
        'cite'       => $common,
        # dfn
        'code'       => $common,
        # samp
        # kbd
        'var'        => $common,
        # abbr
        # acronym

        # 9.2.2
        'blockquote' => array_merge( $common, array( 'cite' ) ),
        # q

        # 9.2.3
        'sub'        => $common,
        'sup'        => $common,

        # 9.3.1
        'p'          => $block,

        # 9.3.2
        'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

        # 9.3.4
        'pre'        => array_merge( $common, array( 'width' ) ),

        # 9.4
        'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
        'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

        # 10.2
        'ul'         => array_merge( $common, array( 'type' ) ),
        'ol'         => array_merge( $common, array( 'type', 'start' ) ),
        'li'         => array_merge( $common, array( 'type', 'value' ) ),

        # 10.3
        'dl'         => $common,
        'dd'         => $common,
        'dt'         => $common,

        # 11.2.1
        'table'      => array_merge( $common,
                            array( 'summary', 'width', 'border', 'frame',
                                   'rules', 'cellspacing', 'cellpadding',
                                   'align', 'bgcolor',
                            ) ),

        # 11.2.2
        'caption'    => array_merge( $common, array( 'align' ) ),

        # 11.2.3
        'thead'      => array_merge( $common, $tablealign ),
        'tfoot'      => array_merge( $common, $tablealign ),
        'tbody'      => array_merge( $common, $tablealign ),

        # 11.2.4
        'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
        'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

        # 11.2.5
        'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

        # 11.2.6
        'td'         => array_merge( $common, $tablecell, $tablealign ),
        'th'         => array_merge( $common, $tablecell, $tablealign ),

        # 12.2
        # added by dan
        'a'          => array_merge( $common, array( 'href', 'name' ) ),

        # 13.2
        # added by dan
        'img'        => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),

        # 15.2.1
        'tt'         => $common,
        'b'          => $common,
        'i'          => $common,
        'big'        => $common,
        'small'      => $common,
        'strike'     => $common,
        's'          => $common,
        'u'          => $common,

        # 15.2.2
        'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
        # basefont

        # 15.3
        'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

        # XHTML Ruby annotation text module, simple ruby only.
        # http://www.w3c.org/TR/ruby/
        'ruby'       => $common,
        # rbc
        # rtc
        'rb'         => $common,
        'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
        'rp'         => $common,

        # For compatibility with the XHTML parser.
        'nowiki'     => array(),
        'noinclude'  => array(),
        'nodisplay'  => array(),

        # XHTML stuff
        'acronym'    => $common
    );
    return $whitelist;
}

/**
 * Given a value escape it so that it can be used in an id attribute and
 * return it, this does not validate the value however (see first link)
 *
 * Spaces become underscores, character references are decoded, the result
 * is URL-encoded, and finally "%3A" is turned back into ":" and every
 * remaining "%" into ".".
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id ) {
    // Order matters: "%3A" has to be restored before the bare "%" rule runs.
    static $search = array( '%3A', '%' );
    static $replacement = array( ':', '.' );

    $id = strtr( $id, ' ', '_' );
    $id = urlencode( decodeCharReferences( $id ) );

    return str_replace( $search, $replacement, $id );
}

/**
 * More or less "markup-safe" explode()
 * Ignores any instances of the separator inside <...>
 *
 * Separators inside angle brackets are temporarily swapped for a NUL
 * placeholder, the text is split, and the placeholder is swapped back in
 * every piece.
 *
 * @param string $separator
 * @param string $text
 * @return array
 */
function wfExplodeMarkup( $separator, $text ) {
    $placeholder = "\x00";

    // Just in case...
    $text = str_replace( $placeholder, '', $text );

    // Trim stuff
    $replacer = new ReplacerCallback( $separator, $placeholder );
    $cleaned = preg_replace_callback( '/(<.*?>)/', array( $replacer, 'go' ), $text );

    $items = explode( $separator, $cleaned );
    foreach ( $items as $i => $str ) {
        $items[$i] = str_replace( $placeholder, $separator, $str );
    }

    return $items;
}

/**
 * Callback object for preg_replace_callback(): replaces one fixed string
 * with another inside the first capture group of each match.
 */
class ReplacerCallback {
    /** @var string text to search for */
    public $from;

    /** @var string text to substitute */
    public $to;

    /**
     * A method named after the class is not a constructor in PHP 8, so the
     * constructor is declared as __construct().
     *
     * @param string $from
     * @param string $to
     */
    function __construct( $from, $to ) {
        $this->from = $from;
        $this->to = $to;
    }

    /**
     * @param array $matches match array; only $matches[1] is used
     * @return string
     */
    function go( $matches ) {
        return str_replace( $this->from, $this->to, $matches[1] );
    }
}

/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text.
 *
 * When a name occurs more than once, the last occurrence wins.
 *
 * @param string $text
 * @return array attribute name => decoded value
 */
function decodeTagAttributes( $text ) {
    $attribs = array();

    if ( trim( $text ) == '' ) {
        return $attribs;
    }

    $pairs = array();
    if ( !preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER ) ) {
        return $attribs;
    }

    foreach ( $pairs as $set ) {
        $attribute = strtolower( $set[1] );
        $value = getTagAttributeCallback( $set );

        // Normalize whitespace
        $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
        $value = trim( $value );

        // Decode character references
        $attribs[$attribute] = decodeCharReferences( $value );
    }
    return $attribs;
}

/**
 * Pick the appropriate attribute value from a match set from the
 * MW_ATTRIBS_REGEX matches.
 *
 * The value groups are probed from the highest index down. The highest
 * group present in the set is the one that matched, because groups after
 * the matched one are absent from the set.
 *
 * @param array $set one PREG_SET_ORDER match
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
    if ( isset( $set[6] ) ) {
        # Illegal #XXXXXX color with no quotes.
        return $set[6];
    }
    if ( isset( $set[5] ) ) {
        # No quotes.
        return $set[5];
    }
    if ( isset( $set[4] ) ) {
        # Single-quoted
        return $set[4];
    }
    if ( isset( $set[3] ) ) {
        # Double-quoted
        return $set[3];
    }
    if ( !isset( $set[2] ) ) {
        # In XHTML, attributes must have a value.
        # For 'reduced' form, return explicitly the attribute name here.
        return $set[1];
    }
    die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
}

/**
 * Strips and renders <nowiki> and <gallery> sections and HTML comments.
 * Returns the text with each section replaced by a unique marker, and
 * fills $state with the data needed by unstrip() / unstripNoWiki().
 * If the $state is already a valid strip state, it adds to the state
 *
 * Rendering per element:
 *  - comments are kept as-is (an unclosed comment gets "-->" appended)
 *  - nowiki content has its HTML special characters escaped
 *  - any other element keeps its original source text
 *
 * A fresh marker prefix is generated on every call and stored in the
 * global $wgRandomKey.
 *
 * @param string $text
 * @param array  $state strip state, extended in place
 * @param bool $stripcomments when set, HTML comments <!-- like this -->
 *  will be stripped in addition to other tags. This is important
 *  for section editing, where these comments cause confusion when
 *  counting the sections in the wikisource
 *
 * @param array $dontstrip contains tags which should not be stripped;
 *  used to prevent stripping of <gallery> when saving (fixes bug 2700)
 *
 * @return string
 * @access private
 */
function mwStrip( $text, &$state, $stripcomments = false, $dontstrip = array() ) {
    global $wgRandomKey, $wgRawHtml;

    $wgRandomKey = "\x07UNIQ" . dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
    $uniq_prefix = $wgRandomKey;
    $commentState = array();

    $elements = array( 'nowiki', 'gallery' );

    # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
    foreach ( $elements as $k => $v ) {
        if ( in_array( $v, $dontstrip ) ) {
            unset( $elements[$k] );
        }
    }

    $matches = array();
    $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

    foreach ( $matches as $marker => $data ) {
        list( $element, $content, $params, $tag ) = $data;
        // Tags are matched case-insensitively, so classify on the
        // lowercased name throughout.
        $tagName = strtolower( $element );

        switch ( $tagName ) {
            case '!--':
                // Comment
                if ( substr( $tag, -3 ) == '-->' ) {
                    $output = $tag;
                } else {
                    // Unclosed comment in input.
                    // Close it so later stripping can remove it
                    $output = "$tag-->";
                }
                break;
            case 'html':
                if ( !empty( $wgRawHtml ) ) {
                    $output = $content;
                    break;
                }
                // Shouldn't happen otherwise. :)
                // Intentional fall-through: treat like nowiki.
            case 'nowiki':
                // $content is null for an empty element such as <nowiki />.
                $output = wfEscapeHTMLTagsOnly( (string) $content );
                break;
            default:
                // No renderer for this element: keep the source so that
                // unstripping restores the original markup instead of a
                // stale value left over from a previous iteration.
                $output = $tag;
        }

        // Unstrip the output, because unstrip() is no longer recursive so
        // it won't do it itself
        $output = unstrip( $output, $state );

        if ( !$stripcomments && $tagName == '!--' ) {
            $commentState[$marker] = $output;
        } elseif ( $tagName == 'html' || $tagName == 'nowiki' ) {
            $state['nowiki'][$marker] = $output;
        } else {
            $state['general'][$marker] = $output;
        }
    }

    # Unstrip comments unless explicitly told otherwise.
    # (The comments are always stripped prior to this point, so as to
    # not invoke any extension tags / parser hooks contained within
    # a comment.)
    if ( !$stripcomments ) {
        // Put them all back and forget them
        $text = strtr( $text, $commentState );
    }

    return $text;
}

/**
 * Replaces all occurrences of HTML-style comments and the given tags
 * in the text with a random marker and returns the new text. The output
 * parameter $matches will be an associative array filled with data in
 * the form:
 *   'UNIQ-xxxxx' => array(
 *     'element',
 *     'tag content',
 *     array( 'param' => 'x' ),
 *     '<element param="x">tag content</element>' ) )
 *
 * Markers are numbered with a counter that keeps increasing across calls.
 *
 * @param array  $elements list of element names. Comments are always extracted.
 * @param string $text Source text string.
 * @param array  $matches output, see above; reset on every call
 * @param string $uniq_prefix prefix for the generated markers
 * @return string the text with markers in place of the extracted sections
 *
 * @access private
 * @static
 */
function extractTagsAndParams( $elements, $text, &$matches, $uniq_prefix = '' ) {
    static $n = 1;
    $stripped = '';
    $matches = array();

    $taglist = implode( '|', $elements );
    $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

    while ( '' != $text ) {
        $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
        $stripped .= $p[0];
        if ( count( $p ) < 5 ) {
            // No further opening tag or comment: done.
            break;
        }
        if ( count( $p ) > 5 ) {
            // comment: the fourth capture group matched, so the split
            // yields one extra element
            $element = $p[4];
            $attributes = '';
            $close = '';
            $inside = $p[5];
        } else {
            // tag
            $element = $p[1];
            $attributes = $p[2];
            $close = $p[3];
            $inside = $p[4];
        }

        $marker = "$uniq_prefix-$element-" . sprintf( '%08X', $n++ ) . '-QINU';
        $stripped .= $marker;

        if ( $close === '/>' ) {
            // Empty element tag, <tag />
            $content = null;
            $text = $inside;
            $tail = null;
        } else {
            if ( $element == '!--' ) {
                $end = '/(-->)/';
            } else {
                $end = "/(<\\/$element\\s*>)/i";
            }
            $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
            $content = $q[0];
            if ( count( $q ) < 3 ) {
                # No end tag -- let it run out to the end of the text.
                $tail = '';
                $text = '';
            } else {
                $tail = $q[1];
                $text = $q[2];
            }
        }

        $matches[$marker] = array(
            $element,
            $content,
            decodeTagAttributes( $attributes ),
            "<$element$attributes$close$content$tail",
        );
    }
    return $stripped;
}

/**
 * Escape html tags
 * Basically replacing " > and < with HTML entities ( &quot;, &gt;, &lt;)
 *
 * Ampersands are left alone.
 *
 * @param string $in text that might contain HTML tags.
 * @return string Escaped string
 */
function wfEscapeHTMLTagsOnly( $in ) {
    return str_replace(
        array( '"', '>', '<' ),
        array( '&quot;', '&gt;', '&lt;' ),
        $in );
}

/**
 * Replace markers with the content stored in $state['general'].
 *
 * always call unstripNoWiki() after this one
 *
 * @param string $text
 * @param array  $state strip state; only the 'general' map is read
 * @return string
 * @private
 */
function unstrip( $text, &$state ) {
    // Nothing to substitute when the map is missing or empty.
    if ( empty( $state['general'] ) ) {
        return $text;
    }

    # TODO: good candidate for FSS
    return strtr( $text, $state['general'] );
}

/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint ) {
    if ( validateCodepoint( $codepoint ) ) {
        return codepointToUtf8( $codepoint );
    }
    // UTF8_REPLACEMENT is not defined in this file. Use it when it is
    // available and fall back to the literal UTF-8 bytes of U+FFFD,
    // because an undefined constant is a fatal error in PHP 8.
    return defined( 'UTF8_REPLACEMENT' ) ? constant( 'UTF8_REPLACEMENT' ) : "\xef\xbf\xbd";
}

/**
 * If the named entity is listed in the global $wgHtmlEntities table,
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg &foo;)
 *
 * @param string $name entity name without "&" and ";"
 * @return string
 */
function decodeEntity( $name ) {
    global $wgHtmlEntities;
    if ( !isset( $wgHtmlEntities[$name] ) ) {
        return "&$name;";
    }
    return codepointToUtf8( $wgHtmlEntities[$name] );
}

/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 *
 * Accepted: tab, line feed, carriage return, U+0020..U+D7FF,
 * U+E000..U+FFFD and U+10000..U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
    if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
        return true;
    }
    return ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
        || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
        || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}

/**
 * Return UTF-8 sequence for a given Unicode code point.
 * May die if fed out of range data: for values of 0x110000 and above a
 * message is printed and the script is terminated.
 *
 * @param int $codepoint
 * @return string
 * @public
 */
function codepointToUtf8( $codepoint ) {
    // One byte: plain ASCII.
    if ( $codepoint < 0x80 ) {
        return chr( $codepoint );
    }
    // Two bytes: 110xxxxx 10xxxxxx
    if ( $codepoint < 0x800 ) {
        return chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0 ) .
               chr( ( $codepoint & 0x3f ) | 0x80 );
    }
    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ( $codepoint < 0x10000 ) {
        return chr( ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0 ) .
               chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
               chr( ( $codepoint & 0x3f ) | 0x80 );
    }
    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ( $codepoint < 0x110000 ) {
        return chr( ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0 ) .
               chr( ( ( $codepoint >> 12 ) & 0x3f ) | 0x80 ) .
               chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
               chr( ( $codepoint & 0x3f ) | 0x80 );
    }

    echo "Asked for code outside of range ($codepoint)\n";
    die( -1 );
}

/**
 * preg_replace_callback() handler for MW_CHAR_REFS_REGEX matches.
 *
 * Dispatches on the capture group that matched: 1 = named entity,
 * 2 = decimal reference, 3 and 4 = hexadecimal reference.
 *
 * @param array $matches
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return decodeEntity( $matches[1] );
    }
    if ( $matches[2] != '' ) {
        return decodeChar( intval( $matches[2] ) );
    }
    if ( $matches[3] != '' ) {
        return decodeChar( hexdec( $matches[3] ) );
    }
    if ( $matches[4] != '' ) {
        return decodeChar( hexdec( $matches[4] ) );
    }
    # Last case should be an ampersand by itself
    return $matches[0];
}

?>