includes/wikiengine/Parse/Mediawiki/Url.php
author Dan
Fri, 05 Oct 2007 01:57:00 -0400
changeset 161 e1a22031b5bd
parent 1 fe660c52c48f
permissions -rw-r--r--
Major revamps to the template parser. Fixed a few security holes that could allow PHP to be injected in untimely places in TPL code. Improved Ux for XSS attempt in tplWikiFormat. Documented many functions. Backported much cleaner parser from 2.0 branch. Beautified a lot of code in the depths of the template class. Pretty much a small-scale Extreme Makeover.

<?php

/**
* 
* Parse for URLS in the source text.
* 
* @category Text
* 
* @package Text_Wiki
* 
* @author Paul M. Jones <pmjones@php.net>
* 
* @author Moritz Venn <moritz.venn@freaque.net>
* 
* @license LGPL
* 
* @version $Id: Url.php,v 1.1 2005/12/06 15:54:56 ritzmo Exp $
* 
*/

/**
* 
* Parse for URLS in the source text.
* 
* Various URL markings are supported: inline (the URL by itself),
* undescribed (where the URL is enclosed in square brackets), and
* described (where the URL is enclosed in square brackets and has a
* name included inside the brackets).  E.g.:
*
* inline      -- http://example.com
* undescribed -- [http://example.com]
* described   -- [http://example.com Example Description]
* described   -- [http://www.example.com|Example Description]
*
* When rendering a URL token, this will convert URLs pointing to a .gif,
* .jpg, or .png image into an inline <img /> tag (for the 'xhtml'
* format).
*
* Token options are:
* 
* 'type' => ['inline'|'footnote'|'descr'] the type of URL
* 
* 'href' => the URL link href portion
* 
* 'text' => the displayed text of the URL link
* 
* @category Text
* 
* @package Text_Wiki
* 
* @author Paul M. Jones <pmjones@php.net>
* 
* @author Moritz Venn <moritz.venn@freaque.net>
* 
*/

class Text_Wiki_Parse_Url extends Text_Wiki_Parse {
    
    
    /**
    * 
    * Keeps a running count of numbered-reference URLs.
    * 
    * Incremented by processFootnote() and, for historical reasons, also
    * by processOrdinary().
    * 
    * @access public
    * 
    * @var int
    * 
    */
    
    var $footnoteCount = 0;
    
    
    /**
    * 
    * Default configuration for this rule.
    * 
    * 'schemes' lists the URL scheme prefixes recognized by this rule;
    * each entry is matched literally at the start of a URL.
    * 
    * @access public
    * 
    * @var array
    * 
    */
    
    var $conf = array(
        'schemes' => array(
            'http://',
            'https://',
            'ftp://',
            'gopher://',
            'news://',
            'mailto:',
            'irc://'
        )
    );
    
    
    /**
    * 
    * Constructor.
    * 
    * Runs the parent constructor and then assembles the URL-matching
    * regex fragment in $this->regex from the configured schemes.  The
    * fragment carries no delimiters; parse() wraps it as needed.
    * 
    * Declared as __construct() so that it still runs on PHP 8, where a
    * method named after its class is no longer treated as a constructor.
    * 
    * @access public
    * 
    * @param object &$obj The object handed on to the parent constructor
    * (presumably the calling Text_Wiki instance).
    * 
    */
    
    function __construct(&$obj)
    {
        parent::Text_Wiki_Parse($obj);
        
        // convert the list of recognized schemes to a regex-safe string,
        // where the pattern delim is a slash
        $tmp = array();
        $list = $this->getConf('schemes', array());
        foreach ($list as $val) {
            $tmp[] = preg_quote($val, '/');
        }
        $schemes = implode('|', $tmp);
        
        // build the regex; it contributes three capture groups when
        // wrapped in one outer group: the whole URL, the scheme, and
        // the (repeated) path-segment group.
        $this->regex =
            "($schemes)" . // allowed schemes
            "(" . // start pattern
            "[^ \\/\"\'{$this->wiki->delim}]*\\/" . // a path segment: no spaces, slashes, double-quotes, single quotes, or delimiters; then a slash
            ")*" . // end pattern
            "[^ \\t\\n\\/\"\'{$this->wiki->delim}]*" . // trailing part: additionally no tabs or newlines
            "[A-Za-z0-9\\/?=&~_]"; // the URL must end on one of these characters
            // fix for jEdit syntax highlighting bug: \"
    }
    
    
    /**
    * 
    * Legacy (PHP 4 style) constructor.
    * 
    * Kept so that code calling parent::Text_Wiki_Parse_Url() keeps
    * working; it simply delegates to __construct().
    * 
    * @access public
    * 
    * @param object &$obj The object handed on to __construct().
    * 
    */
    
    function Text_Wiki_Parse_Url(&$obj)
    {
        $this->__construct($obj);
    }
    
    
    /**
    * 
    * Find bracketed URLs in the source text and replace them with tokens.
    * 
    * Two passes run over $this->wiki->source, in this order:
    * described URLs ("[url text]" or "[url|text]") and then undescribed
    * URLs ("[url]").  Bare inline URLs, protocol-less hosts and inline
    * e-mail addresses are intentionally not processed (see the disabled
    * section at the end).
    *
    * @access public
    * 
    */
    
    function parse()
    {
        // -------------------------------------------------------------
        // 
        // Described-reference (named) URLs.
        // 
        
        // "[" URL, then one space or pipe, then the description up to "]"
        $tmp_regex = '/\[(' . $this->regex . ')[ |]([^\]]+)\]/';
        
        // use a custom callback processing method to generate
        // the replacement text for matches.
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            array(&$this, 'processDescr'),
            $this->wiki->source
        );
        
        
        // -------------------------------------------------------------
        // 
        // Unnamed-reference ('Ordinary'-style) URLs.
        // 
        
        // "[" URL "]", matched ungreedily
        $tmp_regex = '/\[(' . $this->regex . ')\]/U';
        
        // these are rendered with the URL itself as link text rather
        // than as numbered footnotes
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            //array(&$this, 'processFootnote'),
            array(&$this, 'processOrdinary'),
            $this->wiki->source
        );
        
        
        // -------------------------------------------------------------
        // 
        // Normal inline URLs.
        // 
        
        /*
        
        ## DISABLED FOR ENANO
        ## This messes up HTML links.
        
        // the regular expression for this kind of URL
        
        $tmp_regex = '/(^|[^A-Za-z])(' . $this->regex . ')(.*?)/';
        
        // use the standard callback for inline URLs
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            array(&$this, 'process'),
            $this->wiki->source
        );

        //$tmp_regex = '/(^|[^A-Za-z])([a-zA-Z])(.*?)/';
        $tmp_regex = '/(^|\s)([a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+(\.[a-zA-Z0-9\-]+)+)($|\s)/';
        
        // use the standard callback for inline URLs
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            array(&$this, 'processWithoutProtocol'),
            $this->wiki->source
        );

        $tmp_regex = '/(^|\s|'.$this->wiki->delim.')<([a-zA-Z0-9\-\.%_\+\!\*\'\(\)\,]+@[a-zA-Z0-9\-]+(\.[a-zA-Z0-9\-]+)+)>(\s|'.$this->wiki->delim.'|$)/';
        
        // use the standard callback for inline URLs
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            array(&$this, 'processInlineEmail'),
            $this->wiki->source
        );
        */
    }
    
    
    /**
    * 
    * Process inline URLs.
    * 
    * Not reached at present: the pass that uses it is disabled in parse().
    * 
    * @param array &$matches An array of matches as generated by
    * preg_replace_callback.  With the inline pattern, $matches[1] is the
    * text before the URL, $matches[2] the URL itself and $matches[5] the
    * text captured after it.
    * 
    * @return string The processed text replacement.
    * 
    */ 
    
    function process(&$matches)
    {
        // set options
        $options = array(
            'type' => 'inline',
            'href' => $matches[2],
            'text' => $matches[2]
        );
        
        // tokenize, keeping the surrounding text in place
        return $matches[1] . $this->wiki->addToken($this->rule, $options) . $matches[5];
    }
    
    
    /**
    * 
    * Process inline host names written without a scheme.
    * 
    * Not reached at present: the pass that uses it is disabled in parse().
    * 
    * @param array &$matches An array of matches as generated by
    * preg_replace_callback.  $matches[1] is the leading boundary,
    * $matches[2] the host name and $matches[4] the trailing boundary.
    * 
    * @return string The processed text replacement.
    * 
    */ 
    
    function processWithoutProtocol(&$matches)
    {
        // set options; the link target gets an http:// prefix while the
        // displayed text stays as written
        $options = array(
            'type' => 'inline',
            'href' => 'http://'.$matches[2],
            'text' => $matches[2]
        );
        
        // tokenize
        return $matches[1] . $this->wiki->addToken($this->rule, $options) . $matches[4];
    }
    
    
    /**
    * 
    * Process inline e-mail addresses written as <user@host>.
    * 
    * Not reached at present: the pass that uses it is disabled in parse().
    * 
    * @param array &$matches An array of matches as generated by
    * preg_replace_callback.  $matches[1] is the leading boundary,
    * $matches[2] the address and $matches[4] the trailing boundary.
    * 
    * @return string The processed text replacement.
    * 
    */ 
    
    function processInlineEmail(&$matches)
    {
        // set options; a mailto URI is "mailto:" followed directly by
        // the address, with no "//" after the scheme
        $options = array(
            'type' => 'inline',
            'href' => 'mailto:'.$matches[2],
            'text' => $matches[2]
        );
        
        // tokenize
        return $matches[1] . $this->wiki->addToken($this->rule, $options) . $matches[4];
    }
    
    
    /**
    * 
    * Process numbered (footnote) URLs.
    * 
    * Not used by parse() at present; processOrdinary() is used instead.
    * 
    * @param array &$matches An array of matches as generated by
    * preg_replace_callback.  $matches[1] is the URL.
    * 
    * @return string The processed text replacement.
    * 
    */ 
    
    function processFootnote(&$matches)
    {
        // keep a running count for footnotes 
        $this->footnoteCount++;
        
        // set options; the link text is the footnote number
        $options = array(
            'type' => 'footnote',
            'href' => $matches[1],
            'text' => $this->footnoteCount
        );
        
        // tokenize
        return $this->wiki->addToken($this->rule, $options);
    }
    
    
    /**
    * 
    * Process undescribed bracketed URLs ("[url]").
    * 
    * The URL is used both as the link target and as the link text.
    * 
    * @param array &$matches An array of matches as generated by
    * preg_replace_callback.  $matches[1] is the URL.
    * 
    * @return string The processed text replacement.
    * 
    */ 
    
    function processOrdinary(&$matches)
    {
        // the counter is still advanced here although no footnote number
        // is shown; kept because $footnoteCount is public state
        $this->footnoteCount++;
        
        // set options
        $options = array(
            'type' => 'descr',
            'href' => $matches[1],
            'text' => $matches[1]
        );
        
        // tokenize
        return $this->wiki->addToken($this->rule, $options);
    }
    
    
    /**
    * 
    * Process described-reference (named-reference) URLs.
    * 
    * Token options are:
    *     'type' => ['inline'|'footnote'|'descr'] the type of URL
    *     'href' => the URL link href portion
    *     'text' => the displayed text of the URL link
    * 
    * @param array &$matches An array of matches as generated by
    * preg_replace_callback.  $matches[0] is the full bracketed string,
    * $matches[1] the URL and $matches[4] the description ($matches[2]
    * and $matches[3] are the groups inside the URL pattern itself).
    * 
    * @return string The processed text replacement.
    * 
    */ 
    
    function processDescr(&$matches)
    {
        $href = $matches[1];
        $text = $matches[4];
        
        // The URL pattern accepts "|" and matches greedily, so for
        // "[http://example.com|Two Words]" it swallows "|Two" and leaves
        // only "Words" as the description.  Treat the first pipe as the
        // separator and hand everything after it back to the text.
        $pipe = strpos($href, '|');
        if ($pipe !== false) {
            // $matches[0] is "[" . URL . separator . description . "]",
            // so the separator the pattern consumed follows the URL.
            $separator = substr($matches[0], strlen($href) + 1, 1);
            $text = substr($href, $pipe + 1) . $separator . $text;
            $href = substr($href, 0, $pipe);
        }
        
        // set options
        $options = array(
            'type' => 'descr',
            'href' => $href,
            'text' => $text
        );
        
        // tokenize
        return $this->wiki->addToken($this->rule, $options);
    }
}
?>