<?php

/*
 * Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
 * Version 1.1.1 (Caoineag alpha 1)
 * Copyright (C) 2006-2007 Dan Fuhry
 *
 * This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
 *
 * This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under
 * the GPLv2; see the file GPL included with this package for details.
 *
 * We're using the MW parser because the Text_Wiki version simply refused to work under PHP 5.2.0. Porting this was
 * _not_ easy. <leaves to get cup of coffee>
 */
+ − 20
+ − 21
global $mStripState, $wgRandomKey;

// Strip state shared by mwStrip(), unstrip() and unstripNoWiki().
$mStripState = array();

// Building blocks interpolated into MW_ATTRIBS_REGEX below.
$attrib = '[a-zA-Z0-9]';
$space = '[\x09\x0a\x0d\x20]';

// Matches one character reference (extended syntax, whitespace ignored).
// Capture groups: 1 = named entity, 2 = decimal, 3 = hex with "x",
// 4 = hex with "X", 5 = a bare ampersand.
define( 'MW_CHAR_REFS_REGEX',
  '/&([A-Za-z0-9]+);
   |&\#([0-9]+);
   |&\#x([0-9A-Za-z]+);
   |&\#X([0-9A-Za-z]+);
   |(&)/x' );

// Matches one attribute inside a tag's attribute soup.
// Capture groups: 1 = name, 2 = the whole "= value" part (optional),
// 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
// 6 = unquoted #RRGGBB-style value.
define( 'MW_ATTRIBS_REGEX',
  "/(?:^|$space)($attrib+)
    ($space*=$space*
      (?:
        # The attribute value: quoted or alone
          \"([^<\"]*)\"
        | '([^<']*)'
        |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
        |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                            # colors are specified like this.
                            # We'll be normalizing it.
      )
    )?(?=$space|\$)/sx" );
+ − 47
+ − 48
/**
 * Public entry point: converts wiki table markup in $text to HTML by
 * running the table pass of the ported MediaWiki parser.
 *
 * The strip/unstrip steps that the parent MediaWiki parser would normally
 * perform around this pass are intentionally not run here.
 *
 * @param string $text the text to parse
 * @return string
 * @access public
 */

function process_tables( $text )
{
  return doTableStuff( $text );
}
+ − 72
+ − 73
/**
 * Parse the wiki syntax used to render tables.
 *
 * The text is processed line by line. Recognized line prefixes:
 *   {|   opens a table (optionally preceded by ":" characters for indent)
 *   |}   closes the innermost table
 *   |-   starts a new row (any number of dashes), rest of line = row attributes
 *   |+   caption
 *   |    data cell(s), "||" separates several cells on one line
 *   !    header cell(s), "!!" or "||" separates several cells on one line
 *
 * All emitted HTML tags except the opening <dl><dd> indent wrappers are
 * wrapped in <nowiki> tags. Tables left open at the end of the text are
 * closed automatically.
 *
 * @param string $t the text to parse
 * @return string
 * @access private
 */
function doTableStuff( $t )
{
  $t = explode( "\n", $t );

  // One entry per currently open (nested) table in each of these stacks.
  $td = array();             // Is currently a cell tag open?
  $ltd = array();            // Name of the last cell tag (td, th or caption)
  $tr = array();             // Is currently a tr tag open?
  $ltr = array();            // Pending tr attributes
  $has_opened_tr = array();  // Did this table open a <tr> element?
  $indent_level = 0;         // Indent level of the table

  foreach ( $t as $k => $x )
  {
    $x = trim( $x );
    $fc = substr( $x, 0, 1 );
    $matches = array();

    if ( preg_match( '/^(:*)\{\|(.*)$/', $x, $matches ) )
    {
      // Table start
      $indent_level = strlen( $matches[1] );
      $attributes = unstripForHTML( $matches[2] );

      $t[$k] = str_repeat( '<dl><dd>', $indent_level ) .
        '<nowiki><table' . fixTagAttributes( $attributes, 'table' ) . '></nowiki>';

      array_push( $td, false );
      array_push( $ltd, '' );
      array_push( $tr, false );
      array_push( $ltr, '' );
      array_push( $has_opened_tr, false );
    }
    else if ( count( $td ) == 0 )
    {
      // Not inside a table: leave the line untouched.
    }
    else if ( '|}' == substr( $x, 0, 2 ) )
    {
      // Table end. The closing tags are prepended in reverse, so the final
      // order is: cell close, row close, (placeholder row), table close.
      $z = "<nowiki></table></nowiki>" . substr( $x, 2 );
      $l = array_pop( $ltd );
      if ( !array_pop( $has_opened_tr ) )
        $z = "<nowiki><tr><td></td></tr></nowiki>" . $z;
      if ( array_pop( $tr ) )
        $z = '<nowiki></tr></nowiki>' . $z;
      if ( array_pop( $td ) )
        $z = '<nowiki></' . $l . '></nowiki>' . $z;
      array_pop( $ltr );
      $t[$k] = $z . str_repeat( '<nowiki></dd></dl></nowiki>', $indent_level );
    }
    else if ( '|-' == substr( $x, 0, 2 ) )
    {
      // Row separator; allows for |---------------
      // Whatever follows the dashes is the attribute text for the next row.
      $x = ltrim( substr( $x, 1 ), '-' );

      $z = '';
      $l = array_pop( $ltd );
      array_pop( $has_opened_tr );
      array_push( $has_opened_tr, true );
      if ( array_pop( $tr ) )
        $z = '<nowiki></tr></nowiki>' . $z;
      if ( array_pop( $td ) )
        $z = '<nowiki></' . $l . '></nowiki>' . $z;
      array_pop( $ltr );
      $t[$k] = $z;

      array_push( $tr, false );
      array_push( $td, false );
      array_push( $ltd, '' );
      $attributes = unstripForHTML( $x );
      array_push( $ltr, fixTagAttributes( $attributes, 'tr' ) );
    }
    else if ( '|' == $fc || '!' == $fc )
    {
      // $x is a table row: data cells, header cells or a caption
      if ( '|+' == substr( $x, 0, 2 ) )
      {
        $fc = '+';
        $x = substr( $x, 1 );
      }
      $after = substr( $x, 1 );
      if ( $fc == '!' )
        $after = str_replace( '!!', '||', $after );

      // Split up multiple cells on the same line.
      // FIXME: This can result in improper nesting of tags processed
      // by earlier parser steps, but should avoid splitting up eg
      // attribute values containing literal "||".
      $cells = wfExplodeMarkup( '||', $after );

      $t[$k] = '';

      // Loop through each table cell
      foreach ( $cells as $theline )
      {
        $z = '';
        if ( $fc != '+' )
        {
          // Open a row first if none is open; captions live outside rows.
          $tra = array_pop( $ltr );
          if ( !array_pop( $tr ) )
            $z = '<nowiki><tr' . $tra . "></nowiki>\n";
          array_push( $tr, true );
          array_push( $ltr, '' );
          array_pop( $has_opened_tr );
          array_push( $has_opened_tr, true );
        }

        // Close the previous cell, if any, before opening the new one.
        $l = array_pop( $ltd );
        if ( array_pop( $td ) )
          $z = '<nowiki></' . $l . '></nowiki>' . $z;

        if ( $fc == '|' )
          $l = 'td';
        else if ( $fc == '!' )
          $l = 'th';
        else if ( $fc == '+' )
          $l = 'caption';
        else
          $l = '';
        array_push( $ltd, $l );

        // Cell parameters
        $y = explode( '|', $theline, 2 );
        // Note that a '|' inside an invalid link should not
        // be mistaken as delimiting cell parameters
        if ( strpos( $y[0], '[[' ) !== false )
        {
          $y = array( $theline );
        }

        if ( count( $y ) == 1 )
        {
          $y = "{$z}<nowiki><{$l}></nowiki>{$y[0]}";
        }
        else
        {
          $attributes = unstripForHTML( $y[0] );
          $y = "{$z}<nowiki><{$l}" . fixTagAttributes( $attributes, $l ) . "></nowiki>{$y[1]}";
        }

        $t[$k] .= $y;
        array_push( $td, true );
      }
    }
  }

  // Closing open td, tr && table
  while ( count( $td ) > 0 )
  {
    $l = array_pop( $ltd );
    // Close with the tag that was actually opened (td, th or caption).
    if ( array_pop( $td ) )
      $t[] = '<nowiki></' . $l . '></nowiki>';
    if ( array_pop( $tr ) )
      $t[] = '<nowiki></tr></nowiki>';
    if ( !array_pop( $has_opened_tr ) )
      $t[] = "<nowiki><tr><td></td></tr></nowiki>";
    $t[] = '<nowiki></table></nowiki>';
  }

  $t = implode( "\n", $t );

  // special case: don't return empty table
  if ( $t == "<nowiki><table></nowiki>\n<nowiki><tr><td></td></tr></nowiki>\n<nowiki></table></nowiki>" )
    $t = '';

  return $t;
}
+ − 208
+ − 209
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 * Output is safe for further wikitext processing, with escaping of
 * values that could trigger problems.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text
 * @param string $element
 * @return string
 */
function fixTagAttributes( $text, $element )
{
  if ( trim( $text ) == '' )
  {
    return '';
  }

  $stripped = validateTagAttributes( decodeTagAttributes( $text ), $element );

  $attribs = array();
  foreach ( $stripped as $attribute => $value )
  {
    $attribs[] = htmlspecialchars( $attribute ) . '="' . safeEncodeAttribute( $value ) . '"';
  }

  return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
}
+ − 245
+ − 246
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * After the regular HTML encoding done by encodeAttribute(), characters and
 * keywords that later wikitext passes could interpret are replaced with
 * numeric character references.
 *
 * @param $text
 * @return HTML-encoded text fragment
 */
function safeEncodeAttribute( $text )
{
  $encValue = encodeAttribute( $text );

  // Templates and links may be expanded in later parsing,
  // creating invalid or dangerous output. Suppress this.
  $encValue = strtr( $encValue, array(
    '<'    => '&lt;',   // This should never happen,
    '>'    => '&gt;',   // we've received invalid input
    '"'    => '&quot;', // which should have been escaped.
    '{'    => '&#123;',
    '['    => '&#91;',
    "''"   => '&#39;&#39;',
    'ISBN' => '&#73;SBN',
    'RFC'  => '&#82;FC',
    'PMID' => '&#80;MID',
    '|'    => '&#124;',
    '__'   => '&#95;_',
  ) );

  return $encValue;
}
+ − 273
+ − 274
/**
 * Encode an attribute value for HTML output.
 *
 * @param $text
 * @return HTML-encoded text fragment
 */
function encodeAttribute( $text )
{
  // In Enano 1.0.3, added this cheapo hack to keep ampersands
  // from being double-sanitized. Thanks to markybob from #deluge.
  // Already-encoded ampersands are decoded first so the encoding step
  // below turns them back into exactly one "&amp;".
  $encValue = strtr( $text, array(
    '&amp;' => '&'
  ) );

  // Encode the de-duplicated value (not the raw input, which would throw
  // the result of the step above away). ENT_COMPAT is spelled out because
  // it was the implicit default when this code was written.
  $encValue = htmlspecialchars( $encValue, ENT_COMPAT );

  // Whitespace is normalized during attribute decoding,
  // so if we've been passed non-spaces we must encode them
  // ahead of time or they won't be preserved.
  $encValue = strtr( $encValue, array(
    "\n" => '&#10;',
    "\r" => '&#13;',
    "\t" => '&#9;',
  ) );

  return $encValue;
}
+ − 300
+ − 301
/**
 * Restore everything recorded in the global strip state into $text:
 * first the general markers, then the nowiki markers (the order matters,
 * see unstripNoWiki()).
 *
 * @param string $text
 * @return string
 */
function unstripForHTML( $text )
{
  global $mStripState;

  return unstripNoWiki( unstrip( $text, $mStripState ), $mStripState );
}
+ − 307
+ − 308
/**
 * Replace the nowiki markers recorded in $state['nowiki'] with their
 * stored output. Always call this after unstrip() to preserve the order.
 *
 * @param string $text
 * @param array $state strip state; taken by reference but not modified here
 * @return string
 * @private
 */
function unstripNoWiki( $text, &$state )
{
  // Nothing recorded (missing or empty map): nothing to replace.
  if ( empty( $state['nowiki'] ) )
  {
    return $text;
  }

  # TODO: good candidate for FSS
  return strtr( $text, $state['nowiki'] );
}
+ − 323
+ − 324
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given element type.
 *
 * - Discards attributes not on a whitelist for the given element
 * - Unsafe style attributes are discarded
 * - id values are passed through escapeId()
 *
 * @param array $attribs attribute name => value
 * @param string $element
 * @return array attribute name => value, whitelisted entries only
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element )
{
  // Flipped so membership can be tested with isset().
  $whitelist = array_flip( attributeWhitelist( $element ) );
  $out = array();

  foreach ( $attribs as $attribute => $value )
  {
    if ( !isset( $whitelist[$attribute] ) )
    {
      continue;
    }

    # Strip javascript "expression" from stylesheets.
    # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
    if ( $attribute === 'style' )
    {
      $value = checkCss( $value );
      if ( $value === false )
      {
        # haxx0r
        continue;
      }
    }

    if ( $attribute === 'id' )
    {
      $value = escapeId( $value );
    }

    // If this attribute was previously set, override it.
    // Output should only have one attribute of each name.
    $out[$attribute] = $value;
  }

  return $out;
}
+ − 364
+ − 365
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * Currently URL references, 'expression', 'tps' are forbidden.
 *
 * The returned string has character references decoded and CSS comments
 * removed; CSS backslash escapes are only decoded for the purpose of the
 * check and are left as they were in the returned value.
 *
 * @param string $value
 * @return mixed sanitized string, or false when the value is rejected
 */
function checkCss( $value )
{
  $stripped = decodeCharReferences( $value );

  // Remove any comments; IE gets token splitting wrong.
  // The "s" modifier lets a comment span a newline (which a decoded
  // character reference can introduce), so it cannot hide a keyword.
  $stripped = preg_replace( '!/\\*.*?\\*/!sS', '', $stripped );
  $value = $stripped;

  // ... and continue checks: decode backslash escapes, then drop any
  // remaining backslashes before looking for forbidden constructs.
  // (The /e modifier formerly used here no longer exists in PHP 7+.)
  $stripped = preg_replace_callback( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
    'checkCssDecodeEscape', $stripped );
  $stripped = str_replace( '\\', '', $stripped );
  if ( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', $stripped ) )
  {
    # haxx0r
    return false;
  }

  return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): turns one CSS hex escape
 * into the UTF-8 character it denotes.
 *
 * @param array $matches $matches[1] holds 1-6 hex digits
 * @return string
 * @access private
 */
function checkCssDecodeEscape( $matches )
{
  $codepoint = hexdec( $matches[1] );
  if ( $codepoint > 0x10ffff )
  {
    // codepointToUtf8() terminates the script on out-of-range input, which
    // user-supplied CSS must not be able to trigger; substitute U+FFFD.
    return "\xef\xbf\xbd";
  }
  return codepointToUtf8( $codepoint );
}
+ − 393
+ − 394
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * Each match of MW_CHAR_REFS_REGEX is handed to
 * decodeCharReferencesCallback() for replacement.
 *
 * @param string $text
 * @return string
 * @access public
 * @static
 */
function decodeCharReferences( $text )
{
  return preg_replace_callback( MW_CHAR_REFS_REGEX, 'decodeCharReferencesCallback', $text );
}
+ − 409
+ − 410
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name.
 *
 * The full whitelist is built once per request by
 * setupAttributeWhitelist() and cached in a static variable.
 *
 * @param string $element
 * @return array attribute names; empty for unknown elements
 */
function attributeWhitelist( $element )
{
  static $list;

  if ( !isset( $list ) )
  {
    $list = setupAttributeWhitelist();
  }

  if ( isset( $list[$element] ) )
  {
    return $list[$element];
  }
  return array();
}
+ − 426
+ − 427
/**
 * Build the map of element name => list of attribute names that are
 * allowed on that element.
 *
 * After the built-in list is assembled, the code strings returned by the
 * 'html_attribute_whitelist' plugin hook are evaluated in this function's
 * scope, so plugins can modify $whitelist before it is returned.
 *
 * @return array
 */
function setupAttributeWhitelist()
{
  // These globals are not all used directly here; they are brought into
  // scope so that the hook code evaluated below can reach them.
  global $db, $session, $paths, $template, $plugins;

  $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
  $block = array_merge( $common, array( 'align' ) );
  $tablealign = array( 'align', 'char', 'charoff', 'valign' );
  $tablecell = array(
    'abbr',
    'axis',
    'headers',
    'scope',
    'rowspan',
    'colspan',
    'nowrap', # deprecated
    'width',  # deprecated
    'height', # deprecated
    'bgcolor' # deprecated
  );

  # Numbers refer to sections in HTML 4.01 standard describing the element.
  # See: http://www.w3.org/TR/html4/
  $whitelist = array(
    # 7.5.4
    'div'        => $block,
    'center'     => $common, # deprecated
    'span'       => $block, # ??

    # 7.5.5
    'h1'         => $block,
    'h2'         => $block,
    'h3'         => $block,
    'h4'         => $block,
    'h5'         => $block,
    'h6'         => $block,

    # 7.5.6
    # address

    # 8.2.4
    # bdo

    # 9.2.1
    'em'         => $common,
    'strong'     => $common,
    'cite'       => $common,
    # dfn
    'code'       => $common,
    # samp
    # kbd
    'var'        => $common,
    # abbr
    # acronym

    # 9.2.2
    'blockquote' => array_merge( $common, array( 'cite' ) ),
    # q

    # 9.2.3
    'sub'        => $common,
    'sup'        => $common,

    # 9.3.1
    'p'          => $block,

    # 9.3.2
    'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

    # 9.3.4
    'pre'        => array_merge( $common, array( 'width' ) ),

    # 9.4
    'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
    'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

    # 10.2
    'ul'         => array_merge( $common, array( 'type' ) ),
    'ol'         => array_merge( $common, array( 'type', 'start' ) ),
    'li'         => array_merge( $common, array( 'type', 'value' ) ),

    # 10.3
    'dl'         => $common,
    'dd'         => $common,
    'dt'         => $common,

    # 11.2.1
    'table'      => array_merge( $common,
                      array( 'summary', 'width', 'border', 'frame',
                             'rules', 'cellspacing', 'cellpadding',
                             'align', 'bgcolor',
                      ) ),

    # 11.2.2
    'caption'    => array_merge( $common, array( 'align' ) ),

    # 11.2.3
    'thead'      => array_merge( $common, $tablealign ),
    'tfoot'      => array_merge( $common, $tablealign ),
    'tbody'      => array_merge( $common, $tablealign ),

    # 11.2.4
    'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
    'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

    # 11.2.5
    'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

    # 11.2.6
    'td'         => array_merge( $common, $tablecell, $tablealign ),
    'th'         => array_merge( $common, $tablecell, $tablealign ),

    # 12.2
    # added by dan
    'a'          => array_merge( $common, array( 'href', 'name' ) ),

    # 13.2
    # added by dan
    'img'        => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),

    # 15.2.1
    'tt'         => $common,
    'b'          => $common,
    'i'          => $common,
    'big'        => $common,
    'small'      => $common,
    'strike'     => $common,
    's'          => $common,
    'u'          => $common,

    # 15.2.2
    'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
    # basefont

    # 15.3
    'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

    # XHTML Ruby annotation text module, simple ruby only.
    # http://www.w3c.org/TR/ruby/
    'ruby'       => $common,
    # rbc
    # rtc
    'rb'         => $common,
    'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
    'rp'         => $common,

    # For compatibility with the XHTML parser.
    'nowiki'     => array(),
    'noinclude'  => array(),
    'nodisplay'  => array(),
    'lang'       => array( 'code' ),

    # XHTML stuff
    'acronym'    => $common
  );

  // custom tags can be added by plugins
  // NOTE(review): this eval()s code strings supplied by the plugin hook
  // system; it is only as safe as the source of those hooks.
  // The hook is skipped when the plugin manager has not been set up.
  if ( is_object( $plugins ) )
  {
    $code = $plugins->setHook( 'html_attribute_whitelist' );
    foreach ( $code as $cmd )
    {
      eval( $cmd );
    }
  }

  return $whitelist;
}
+ − 592
+ − 593
/**
 * Given a value escape it so that it can be used in an id attribute and
 * return it, this does not validate the value however (see first link)
 *
 * Spaces become underscores, character references are decoded, and the
 * result is URL-encoded with "%3A" turned back into ":" and every other
 * "%" turned into ".".
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id )
{
  // Applied in order: the colon rule must run before the generic "%" rule.
  static $search = array( '%3A', '%' );
  static $replacement = array( ':', '.' );

  $id = strtr( $id, ' ', '_' );
  $id = urlencode( decodeCharReferences( $id ) );

  return str_replace( $search, $replacement, $id );
}
+ − 619
+ − 620
/**
 * More or less "markup-safe" explode()
 * Ignores any instances of the separator inside <...>
 *
 * Separators inside tags are temporarily swapped for a NUL placeholder,
 * the text is split, and the placeholders are swapped back afterwards.
 *
 * @param string $separator
 * @param string $text
 * @return array
 */
function wfExplodeMarkup( $separator, $text )
{
  $placeholder = "\x00";

  // Just in case the text already contains the placeholder byte...
  $text = str_replace( $placeholder, '', $text );

  // Hide separators that occur inside <...>
  $replacer = new ReplacerCallback( $separator, $placeholder );
  $cleaned = preg_replace_callback( '/(<.*?>)/', array( $replacer, 'go' ), $text );

  $items = explode( $separator, $cleaned );
  foreach ( $items as $i => $str )
  {
    $items[$i] = str_replace( $placeholder, $separator, $str );
  }

  return $items;
}
+ − 644
+ − 645
/**
 * Small callable object for preg_replace_callback(): replaces every
 * occurrence of one string with another inside the first capture group.
 */
class ReplacerCallback
{
  /** @var string text to search for */
  public $from;

  /** @var string text to substitute */
  public $to;

  /**
   * A PHP 4 style constructor (a method named after the class) is no
   * longer run by "new" on PHP 8, so __construct() is used instead.
   *
   * @param string $from
   * @param string $to
   */
  function __construct( $from, $to )
  {
    $this->from = $from;
    $this->to = $to;
  }

  /**
   * @param array $matches match array; $matches[1] is the text to process
   * @return string
   */
  function go( $matches )
  {
    return str_replace( $this->from, $this->to, $matches[1] );
  }
}
+ − 655
+ − 656
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text.
 *
 * When an attribute name occurs more than once, the last occurrence wins.
 *
 * @param string $text
 * @return array
 */
function decodeTagAttributes( $text )
{
  $attribs = array();

  if ( trim( $text ) == '' )
  {
    return $attribs;
  }

  $pairs = array();
  if ( !preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER ) )
  {
    return $attribs;
  }

  foreach ( $pairs as $set )
  {
    $attribute = strtolower( $set[1] );
    $value = getTagAttributeCallback( $set );

    // Normalize whitespace
    $value = trim( preg_replace( '/[\t\r\n ]+/', ' ', $value ) );

    // Decode character references
    $attribs[$attribute] = decodeCharReferences( $value );
  }

  return $attribs;
}
+ − 693
+ − 694
/**
 * Pick the appropriate attribute value from a match set from the
 * MW_ATTRIBS_REGEX matches.
 *
 * The groups are tested from the highest index down: the highest group
 * present in the set is the alternative that matched.
 *
 * @param array $set
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set )
{
  if ( isset( $set[6] ) )
  {
    # Illegal #XXXXXX color with no quotes.
    return $set[6];
  }
  if ( isset( $set[5] ) )
  {
    # No quotes.
    return $set[5];
  }
  if ( isset( $set[4] ) )
  {
    # Single-quoted
    return $set[4];
  }
  if ( isset( $set[3] ) )
  {
    # Double-quoted
    return $set[3];
  }
  if ( !isset( $set[2] ) )
  {
    # In XHTML, attributes must have a value.
    # For 'reduced' form, return explicitly the attribute name here.
    return $set[1];
  }

  die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
}
+ − 723
+ − 724
/**
 * Strips nowiki and gallery tags and HTML comments out of the text,
 * replacing each with a unique marker.
 * Returns the text, and fills an array with data needed in unstrip()
 * If the $state is already a valid strip state, it adds to the state
 *
 * A new random marker prefix is generated on every call and stored in the
 * global $wgRandomKey.
 *
 * @param string $text
 * @param array $state strip state to add to; receives 'nowiki' and
 *  'general' marker => output maps
 * @param bool $stripcomments when set, HTML comments <!-- like this -->
 *  will be stripped in addition to other tags. This is important
 *  for section editing, where these comments cause confusion when
 *  counting the sections in the wikisource
 *
 * @param array $dontstrip contains tags which should not be stripped;
 *  used to prevent stripping of <gallery> when saving (fixes bug 2700)
 *
 * @return string
 * @access private
 */
function mwStrip( $text, &$state, $stripcomments = false, $dontstrip = array() )
{
  global $wgRandomKey;

  $wgRandomKey = "\x07UNIQ" . dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
  $uniq_prefix = $wgRandomKey;
  $commentState = array();

  # Removing $dontstrip tags from the element list (currently only 'gallery', fixing bug 2700)
  $elements = array();
  foreach ( array( 'nowiki', 'gallery' ) as $candidate )
  {
    if ( !in_array( $candidate, $dontstrip ) )
    {
      $elements[] = $candidate;
    }
  }

  $matches = array();
  $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

  foreach ( $matches as $marker => $data )
  {
    $element = $data[0];
    $content = $data[1];
    $tag = $data[3];

    switch ( strtolower( $element ) )
    {
      case '!--':
        // Comment. If it is unclosed in the input, close it so later
        // stripping can remove it.
        $output = ( substr( $tag, -3 ) == '-->' ) ? $tag : "$tag-->";
        break;
      case 'html':
        // Raw HTML is not enabled in this port (and 'html' is never in the
        // element list), so it is escaped exactly like nowiki.
      case 'nowiki':
        // $content is null for an empty element such as <nowiki />.
        $output = wfEscapeHTMLTagsOnly( (string) $content );
        break;
      default:
        // No renderer exists here for other elements (e.g. gallery). Use an
        // explicit empty string so the marker cannot pick up the output
        // left over from a previous loop iteration.
        $output = '';
    }

    // Unstrip the output, because unstrip() is no longer recursive so
    // it won't do it itself
    $output = unstrip( $output, $state );

    if ( !$stripcomments && $element == '!--' )
    {
      $commentState[$marker] = $output;
    }
    elseif ( $element == 'html' || $element == 'nowiki' )
    {
      $state['nowiki'][$marker] = $output;
    }
    else
    {
      $state['general'][$marker] = $output;
    }
  }

  # Unstrip comments unless explicitly told otherwise.
  # (The comments are always stripped prior to this point, so as to
  # not invoke any extension tags / parser hooks contained within
  # a comment.)
  if ( !$stripcomments )
  {
    // Put them all back and forget them
    $text = strtr( $text, $commentState );
  }

  return $text;
}
+ − 814
+ − 815
/**
 * Replaces all occurrences of HTML-style comments and the given tags
 * in the text with a random marker and returns the new text. The output
 * parameter $matches will be an associative array filled with data in
 * the form:
 *   'UNIQ-xxxxx' => array(
 *     'element',
 *     'tag content',
 *     array( 'param' => 'x' ),
 *     '<element param="x">tag content</element>' ) )
 *
 * Marker numbers come from a static counter, so they keep increasing
 * across calls within one request.
 *
 * @param $elements list of element names. Comments are always extracted.
 * @param $text Source text string.
 * @param $matches Output: marker => data, see above. Reset on every call.
 * @param $uniq_prefix
 *
 * @return string the text with markers in place of the extracted spans
 * @access private
 * @static
 */
function extractTagsAndParams( $elements, $text, &$matches, $uniq_prefix = '' )
{
  static $n = 1;
  $stripped = '';
  $matches = array();

  $taglist = implode( '|', $elements );
  $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

  while ( '' != $text )
  {
    // $p[0] is the text before the next opening tag. For a tag match the
    // split has 5 parts (element, attributes, closer, rest); for a comment
    // it has 6, because the three tag groups are present but empty.
    $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
    $stripped .= $p[0];
    if ( count( $p ) < 5 )
    {
      // No further opening tag.
      break;
    }

    if ( count( $p ) > 5 )
    {
      // comment
      $element = $p[4];
      $attributes = '';
      $close = '';
      $inside = $p[5];
    }
    else
    {
      // tag
      $element = $p[1];
      $attributes = $p[2];
      $close = $p[3];
      $inside = $p[4];
    }

    $marker = "$uniq_prefix-$element-" . sprintf( '%08X', $n++ ) . '-QINU';
    $stripped .= $marker;

    if ( $close === '/>' )
    {
      // Empty element tag, <tag />
      $content = null;
      $text = $inside;
      $tail = null;
    }
    else
    {
      if ( $element == '!--' )
      {
        $end = '/(-->)/';
      }
      else
      {
        $end = "/(<\\/$element\\s*>)/i";
      }

      $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
      $content = $q[0];
      if ( count( $q ) < 3 )
      {
        # No end tag -- let it run out to the end of the text.
        $tail = '';
        $text = '';
      }
      else
      {
        $tail = $q[1];
        $text = $q[2];
      }
    }

    $matches[$marker] = array(
      $element,
      $content,
      decodeTagAttributes( $attributes ),
      "<$element$attributes$close$content$tail"
    );
  }

  return $stripped;
}
+ − 894
+ − 895
/**
 * Escape html tags
 * Basically replacing " > and < with HTML entities ( &quot;, &gt;, &lt;)
 *
 * Ampersands are deliberately left alone.
 *
 * @param $in String: text that might contain HTML tags.
 * @return string Escaped string
 */
function wfEscapeHTMLTagsOnly( $in )
{
  return str_replace(
    array( '"', '>', '<' ),
    array( '&quot;', '&gt;', '&lt;' ),
    $in );
}
+ − 908
+ − 909
/**
 * Replace the markers recorded in $state['general'] with their stored
 * output.
 *
 * always call unstripNoWiki() after this one
 *
 * @param string $text
 * @param array $state strip state; taken by reference but not modified here
 * @return string
 * @private
 */
function unstrip( $text, &$state )
{
  // Nothing recorded (missing or empty map): nothing to replace.
  if ( empty( $state['general'] ) )
  {
    return $text;
  }

  # TODO: good candidate for FSS
  return strtr( $text, $state['general'] );
}
+ − 925
+ − 926
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint )
{
  if ( validateCodepoint( $codepoint ) )
  {
    return codepointToUtf8( $codepoint );
  }

  // UTF8_REPLACEMENT is not defined in this file; fall back to the UTF-8
  // encoding of U+FFFD when nothing else has defined it.
  return defined( 'UTF8_REPLACEMENT' ) ? constant( 'UTF8_REPLACEMENT' ) : "\xef\xbf\xbd";
}
+ − 940
+ − 941
/**
 * If the named entity is present in the global $wgHtmlEntities table
 * (entity name => codepoint), return the UTF-8 encoding of that
 * character. Otherwise, returns pseudo-entity source (eg &foo;)
 *
 * @param string $name entity name without the leading "&" and trailing ";"
 * @return string
 */
function decodeEntity( $name )
{
  global $wgHtmlEntities;

  if ( !isset( $wgHtmlEntities[$name] ) )
  {
    return "&$name;";
  }

  return codepointToUtf8( $wgHtmlEntities[$name] );
}
+ − 957
+ − 958
/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 *
 * Accepted: tab, line feed, carriage return, 0x20-0xD7FF, 0xE000-0xFFFD
 * and 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint )
{
  if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d )
  {
    return true;
  }

  return ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
      || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
      || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
+ − 971
+ − 972
/**
 * Return UTF-8 sequence for a given Unicode code point.
 * Prints a message and terminates the script if fed a value of
 * 0x110000 or above.
 *
 * @param $codepoint Integer:
 * @return String
 * @public
 */
function codepointToUtf8( $codepoint )
{
  // One byte: plain ASCII
  if ( $codepoint < 0x80 )
  {
    return chr( $codepoint );
  }

  // Two bytes
  if ( $codepoint < 0x800 )
  {
    return chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0 ) .
           chr( ( $codepoint & 0x3f ) | 0x80 );
  }

  // Three bytes
  if ( $codepoint < 0x10000 )
  {
    return chr( ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0 ) .
           chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
           chr( ( $codepoint & 0x3f ) | 0x80 );
  }

  // Four bytes
  if ( $codepoint < 0x110000 )
  {
    return chr( ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0 ) .
           chr( ( ( $codepoint >> 12 ) & 0x3f ) | 0x80 ) .
           chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
           chr( ( $codepoint & 0x3f ) | 0x80 );
  }

  echo "Asked for code outside of range ($codepoint)\n";
  die( -1 );
}
+ − 995
+ − 996
/**
 * preg_replace_callback() handler for MW_CHAR_REFS_REGEX: returns the
 * replacement for a single matched character reference.
 *
 * @param array $matches 1 = named entity, 2 = decimal reference,
 *  3 and 4 = hexadecimal reference; a lone ampersand fills none of these
 * @return string
 */
function decodeCharReferencesCallback( $matches )
{
  if ( $matches[1] != '' )
  {
    return decodeEntity( $matches[1] );
  }
  if ( $matches[2] != '' )
  {
    return decodeChar( intval( $matches[2] ) );
  }
  if ( $matches[3] != '' )
  {
    return decodeChar( hexdec( $matches[3] ) );
  }
  if ( $matches[4] != '' )
  {
    return decodeChar( hexdec( $matches[4] ) );
  }

  # Last case should be an ampersand by itself
  return $matches[0];
}
+ − 1013
+ − 1014
?>