includes/search.php
changeset 279 8acd77a6c19d
parent 272 e0ec986c0af3
child 292 b3cfaf0a505c
equal deleted inserted replaced
278:53ba55d33abb 279:8acd77a6c19d
    29   }
    29   }
    30   return $arr3;
    30   return $arr3;
    31 }
    31 }
    32 
    32 
    33 /**
    33 /**
    34  * Algorithm to actually do the searching. This system usually works pretty fast (tested and developed on a site with 22 pages) but one
    34  * In Enano versions prior to 1.0.2, this class provided a search function that was keyword-based and allowed boolean searches. It was
    35  * caveat of this algorithm is that it has to load the entire index into memory. It also requires manual parsing of the search query
    35  * cut from Coblynau and replaced with perform_search(), later in this file, because of speed issues. Now mostly deprecated. The only
    36  * which can be quite CPU-intensive. On the flip side this algorithm is extremely flexible and can be adapted for other uses very easily.
    36  * thing remaining is the buildIndex function, which is still used by the path manager and the new search framework.
    37  * 
    37  * 
    38  * Most of the time, this system is disabled. It is only used when MySQL can't or won't allow FULLTEXT indices.
       
    39  *
       
    40  * @package Enano
    38  * @package Enano
    41  * @subpackage Page management frontend
    39  * @subpackage Page management frontend
    42  * @license GNU General Public License http://enanocms.org/Special:GNU_General_Public_License
    40  * @license GNU General Public License <http://enanocms.org/Special:GNU_General_Public_License>
    43  */
    41  */
    44 
    42 
    45 class Searcher
    43 class Searcher
    46 {
    44 {
    47   
    45   
    48   var $results;
    46   var $results;
    49   var $index;
    47   var $index;
    50   var $warnings;
    48   var $warnings;
    51   var $match_case = false;
    49   var $match_case = false;
    52   
    50   
    53   function __construct()
       
    54   {
       
    55     $this->warnings = Array();
       
    56   }
       
    57   
       
    58   function Searcher()
       
    59   {
       
    60     $this->__construct();
       
    61   }
       
    62   
       
    63   function warn($t)
       
    64   {
       
    65     if(!in_array($t, $this->warnings)) $this->warnings[] = $t;
       
    66   }
       
    67   
       
    68   function convertCase($text)
       
    69   {
       
    70     return ( $this->match_case ) ? $text : strtolower($text);
       
    71   }
       
    72   
       
    73   function buildIndex($texts)
    51   function buildIndex($texts)
    74   {
    52   {
    75     $this->index = Array();
    53     $this->index = Array();
    76 
    54     $stopwords = get_stopwords();
       
    55     
    77     foreach($texts as $i => $l)
    56     foreach($texts as $i => $l)
    78     {
    57     {
    79       $seed = md5(microtime(true) . mt_rand());
    58       $seed = md5(microtime(true) . mt_rand());
    80       $texts[$i] = str_replace("'", 'xxxApoS'.$seed.'xxx', $texts[$i]);
    59       $texts[$i] = str_replace("'", 'xxxApoS'.$seed.'xxx', $texts[$i]);
    81       $texts[$i] = preg_replace('#([\W_]+)#i', ' ', $texts[$i]);
    60       $texts[$i] = preg_replace('#([\W_]+)#i', ' ', $texts[$i]);
    94       }
    73       }
    95       $letters = implode('', $letters);
    74       $letters = implode('', $letters);
    96       $words = explode(' ', $letters);
    75       $words = explode(' ', $letters);
    97       foreach($words as $c => $w)
    76       foreach($words as $c => $w)
    98       {
    77       {
    99         if(strlen($w) < 4)
    78         if(strlen($w) < 2 || in_array($w, $stopwords))
   100           unset($words[$c]);
    79           unset($words[$c]);
   101         else
    80         else
   102           $words[$c] = $w;
    81           $words[$c] = $w;
   103       }
    82       }
   104       $words = array_values($words);
    83       $words = array_values($words);
   119     foreach($this->index as $k => $v)
    98     foreach($this->index as $k => $v)
   120     {
    99     {
   121       $this->index[$k] = implode(',', $this->index[$k]);
   100       $this->index[$k] = implode(',', $this->index[$k]);
   122     }
   101     }
   123   }
   102   }
   124   
   103 }
   125   function search($query, $texts)
   104 
   126   {
   105 /**
   127     
   106  * Searches the site for the specified string and returns an array with each value being an array filled with the following:
   128     // OK, let's establish some basics here. Here is the procedure for performing the search:
   107  *   page_id: string, self-explanatory
   129     //   * search for items that matches all the terms in the correct order.
   108  *   namespace: string, self-explanatory
   130     //   * search for items that match in any order
   109  *   page_length: integer, the length of the full page in bytes
   131     //   * eliminate one term and do the loop all over
   110  *   page_text: string, the contents of the page (trimmed to ~150 bytes if necessary)
   132     
   111  *   score: numerical relevance score, 1-100, rounded to 2 digits and calculated based on which terms were present and which were not
   133     $this->results = Array();
   112  * @param string Search query
   134     $query = $this->parseQuery($query);
   113  * @param array Will be filled with any warnings encountered whilst parsing the query
   135     $querybak = $query;
   114  * @param bool Case sensitivity - defaults to false
   136     for($i = sizeof($query['any'])-1; $i >= 0; $i--)
   115  * @return array
   137     {
   116  */
   138       $res = $this->performCoreSearch($query, $texts, true);
   117 
   139       $this->results = enano_safe_array_merge($this->results, $res);
   118 function perform_search($query, &$warnings, $case_sensitive = false)
   140       $res = $this->performCoreSearch($query, $texts, false);
   119 {
   141       $this->results = enano_safe_array_merge($this->results, $res);
   120   global $db, $session, $paths, $template, $plugins; // Common objects
   142       unset($query['any'][$i]);
   121   $warnings = array();
   143     }
   122   
   144     
   123   $query = parse_search_query($query, $warnings);
   145     // Last resort - search for any of the terms instead of all of 'em
   124   
   146     $res = $this->performCoreSearch($querybak, $texts, false, true);
   125   // Segregate search terms containing spaces
   147     $this->results = enano_safe_array_merge($this->results, $res);
   126   $query_phrase = array(
   148     
   127     'any' => array(),
   149     $this->highlightResults($querybak);
   128     'req' => array()
   150   }
   129     );
   151   
   130   
   152   // $texts should be a textual MySQL query!
   131   foreach ( $query['any'] as $i => $_ )
   153   // @todo document
   132   {
   154   function searchMySQL($query, $texts)
   133     $term =& $query['any'][$i];
   155   {
   134     $term = trim($term);
   156     global $db;
   135     // the indexer only indexes words a-z with apostrophes
   157     // OK, let's establish some basics here. Here is the procedure for performing the search:
   136     if ( preg_match('/[^A-Za-z\']/', $term) )
   158     //   * search for items that matches all the terms in the correct order.
   137     {
   159     //   * search for items that match in any order
   138       $query_phrase['any'][] = $term;
   160     //   * eliminate one term and do the loop all over
   139       unset($term, $query['any'][$i]);
   161     
   140     }
   162     $this->results = Array();
   141   }
   163     $query = $this->parseQuery($query);
   142   unset($term);
   164     $querytmp = $query;
   143   $query['any'] = array_values($query['any']);
   165     $querybak = $query;
   144   
   166     for($i = sizeof($querytmp['any'])-1; $i >= 0; $i--)
   145   foreach ( $query['req'] as $i => $_ )
   167     {
   146   {
   168       $res = $this->performCoreSearchMySQL($querytmp, $texts, true);
   147     $term =& $query['req'][$i];
   169       $this->results = enano_safe_array_merge($this->results, $res);
   148     $term = trim($term);
   170       $res = $this->performCoreSearchMySQL($querytmp, $texts, false);
   149     if ( preg_match('/[^A-Za-z\']/', $term) )
   171       $this->results = enano_safe_array_merge($this->results, $res);
   150     {
   172       unset($querytmp['any'][$i]);
   151       $query_phrase['req'][] = $term;
   173     }
   152       unset($term, $query['req'][$i]);
   174     
   153     }
   175     // Last resort - search for any of the terms instead of all of 'em
   154   }
   176     $res = $this->performCoreSearchMySQL($querybak, $texts, false, true);
   155   unset($term);
   177     $this->results = enano_safe_array_merge($this->results, $res);
   156   $query['req'] = array_values($query['req']);
   178     
   157   
   179     $this->highlightResults($querybak);
   158   $results = array();
   180   }
   159   $scores = array();
   181   
   160   
   182   /**
   161   // FIXME: Update to use FULLTEXT algo when available.
   183    * This method assumes that $query is already parsed and $texts is an (associative) array of possible results
   162   
   184    * @param array $query A search query parsed with Searcher::parseQuery()
   163   // Build an SQL query to load from the index table
   185    * @param array $texts The list of possible results
   164   if ( count($query['any']) < 1 && count($query['req']) < 1 && count($query_phrase['any']) < 1 && count($query_phrase['req']) < 1 )
   186    * @param bool $exact_order If true, only matches results with the terms in the same order as the terms in the query
   165   {
   187    * @return array An associative array of results
   166     // This is both because of technical restrictions and devastation that would occur on shared servers/large sites.
   188    * @access private
   167     $warnings[] = 'You need to have at least one keyword in your search query. Searching only for pages not containing a term is not allowed.';
   189    */
   168     return array();
   190   function performCoreSearch($query, $texts, $exact_order = false, $any = false)
   169   }
   191   {
   170   
   192     $textkeys = array_keys($texts);
   171   //
   193     $results = Array();
   172   // STAGE 1
   194     if($exact_order)
   173   // Get all possible result pages from the search index. Tally which pages have the most words, and later sort them by boolean relevance
   195     {
   174   //
   196       $query = $this->concatQueryTerms($query);
   175   
   197     }
   176   // Skip this if no indexable words are included
   198     $query['trm'] = array_merge($query['any'], $query['req']);
   177   
   199     # Find all remotely possible results first
   178   if ( count($query['any']) > 0 || count($query['req']) > 0 )
   200     // Single-word terms
   179   {
   201     foreach($this->index as $term => $keys)
   180     $where_any = array();
   202     {
   181     foreach ( $query['any'] as $term )
   203       foreach($query['trm'] as $userterm)
   182     {
   204       {
   183       $term = escape_string_like($term);
   205         if($this->convertCase($userterm) == $this->convertCase($term))
   184       if ( !$case_sensitive )
       
   185         $term = strtolower($term);
       
   186       $where_any[] = $term;
       
   187     }
       
   188     foreach ( $query['req'] as $term )
       
   189     {
       
   190       $term = escape_string_like($term);
       
   191       if ( !$case_sensitive )
       
   192         $term = strtolower($term);
       
   193       $where_any[] = $term;
       
   194     }
       
   195     
       
   196     $col_word = ( $case_sensitive ) ? 'word' : 'lcase(word)';
       
   197     $where_any = ( count($where_any) > 0 ) ? '( ' . $col_word . ' = \'' . implode('\' OR ' . $col_word . ' = \'', $where_any) . '\' )' : '';
       
   198     
       
   199     // generate query
       
   200     // using a GROUP BY here ensures that the same word with a different case isn't counted as 2 words - it's all melted back
       
   201     // into one later in the processing stages
       
   202     $group_by = ( $case_sensitive ) ? '' : ' GROUP BY lcase(word);';
       
   203     $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any}{$group_by}";
       
   204     if ( !($q = $db->sql_unbuffered_query($sql)) )
       
   205       $db->_die('Error is in perform_search(), includes/search.php, query 1');
       
   206     
       
   207     $word_tracking = array();
       
   208     if ( $row = $db->fetchrow() )
       
   209     {
       
   210       do
       
   211       {
       
   212         // get page list
       
   213         $pages =& $row['page_names'];
       
   214         $ns_list = '(' . implode('|', array_keys($paths->nslist)) . ')';
       
   215         if ( strpos($pages, ',') )
   206         {
   216         {
   207           $k = explode(',', $keys);
   217           // the term occurs in more than one page
   208           foreach($k as $idxkey)
   218           
       
   219           // Find page IDs that contain commas
       
   220           // This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older
       
   221           // databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for
       
   222           // IDs that don't match the pattern for stringified page ID + namespace. If it doesn't match, that means it's a continuation
       
   223           // of the previous ID and should be concatenated to the previous entry.
       
   224           $matches = explode(',', $pages);
       
   225           $prev = false;
       
   226           foreach ( $matches as $i => $_ )
   209           {
   227           {
   210             if(isset($texts[$idxkey])) 
   228             $match =& $matches[$i];
       
   229             if ( !preg_match("/^ns=$ns_list;pid=(.+)$/", $match) && $prev )
   211             {
   230             {
   212               $results[$idxkey] = $texts[$idxkey];
   231               $matches[$prev] .= ',' . $match;
       
   232               unset($match, $matches[$i]);
       
   233               continue;
       
   234             }
       
   235             $prev = $i;
       
   236           }
       
   237           unset($match);
       
   238           
       
   239           // Iterate through each of the results, assigning scores based on how many times the page has shown up.
       
   240           // This works because this phase of the search is strongly word-based not page-based. If a page shows up
       
   241           // multiple times while fetching the result rows from the search_index table, it simply means that page
       
   242           // contains more than one of the terms the user searched for.
       
   243           
       
   244           foreach ( $matches as $match )
       
   245           {
       
   246             if ( isset($scores[$match]) )
       
   247             {
       
   248               $scores[$match]++;
   213             }
   249             }
   214             else
   250             else
   215             {
   251             {
   216               if(preg_match('#^([0-9]+)$#', $idxkey))
   252               $scores[$match] = 1;
   217               {
   253             }
   218                 $idxkey = intval($idxkey);
   254             if ( isset($word_tracking[$match]) )
   219                 if(isset($texts[$idxkey])) $results[$idxkey] = $texts[$idxkey];
   255             {
   220               }
   256               $word_tracking[$match][] = $row['word'];
       
   257             }
       
   258             else
       
   259             {
       
   260               $word_tracking[$match] = array($row['word']);
   221             }
   261             }
   222           }
   262           }
   223         }
   263         }
   224       }
   264         else
   225     }
       
   226     // Quoted terms
       
   227     foreach($query['trm'] as $userterm)
       
   228     {
       
   229       if(!preg_match('/[\s"\'~`!@#\$%\^&\*\(\)\{\}:;<>,.\/\?_-]/', $userterm)) continue;
       
   230       foreach($texts as $k => $t)
       
   231       {
       
   232         if(strstr($this->convertCase($t), $this->convertCase($userterm)))
       
   233         {
   265         {
   234           // We have a match!
   266           // the term only occurs in one page
   235           if(!isset($results[$k])) $results[$k] = $t;
   267           if ( isset($scores[$pages]) )
       
   268           {
       
   269             $scores[$pages]++;
       
   270           }
       
   271           else
       
   272           {
       
   273             $scores[$pages] = 1;
       
   274           }
       
   275           if ( isset($word_tracking[$pages]) )
       
   276           {
       
   277             $word_tracking[$pages][] = $row['word'];
       
   278           }
       
   279           else
       
   280           {
       
   281             $word_tracking[$pages] = array($row['word']);
       
   282           }
   236         }
   283         }
   237       }
   284       }
   238     }
   285       while ( $row = $db->fetchrow() );
   239     // Remove excluded terms
   286     }
   240     foreach($results as $k => $r)
   287     $db->free_result();
   241     {
   288   
   242       foreach($query['not'] as $not)
   289     //
   243       {
   290     // STAGE 2: FIRST ELIMINATION ROUND
   244         if(strstr($this->convertCase($r), $this->convertCase($not))) unset($results[$k]);
   291     // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
   245       }
   292     //
   246     }
   293     
   247     if(!$any)
   294     foreach ( $query['req'] as $term )
   248     {
   295     {
   249       // Remove results not containing all terms
   296       foreach ( $word_tracking as $i => $page )
   250       foreach($results as $k => $r)
   297       {
   251       {
   298         if ( !in_array($term, $page) )
   252         foreach($query['any'] as $term)
       
   253         {
   299         {
   254           if(!strstr($this->convertCase($r), $this->convertCase($term))) unset($results[$k]);
   300           unset($word_tracking[$i], $scores[$i]);
   255         }
   301         }
   256       }
   302       }
   257     }
   303     }
   258     // Remove results not containing all required terms
   304   }
   259     foreach($results as $k => $r)
   305   
   260     {
   306   //
   261       foreach($query['req'] as $term)
   307   // STAGE 3: PHRASE SEARCHING
   262       {
   308   // Use LIKE to find pages with specified phrases. We can do a super-picky single query without another elimination round because
   263         if(!strstr($this->convertCase($r), $this->convertCase($term))) unset($results[$k]);
   309   // at this stage we can search the full page_text column instead of relying on a word list.
   264       }
   310   //
   265     }
   311   
   266     return $results;
   312   // We can skip this stage if none of these special terms apply
   267   }
   313   
   268   
   314   $text_col = ( $case_sensitive ) ? 'page_text' : 'lcase(page_text)';
   269   /**
   315   
   270    * This is the same as performCoreSearch, but $texts should be a MySQL result resource. This can save tremendous amounts of memory on large sites.
   316   if ( count($query_phrase['any']) > 0 || count($query_phrase['req']) > 0 )
   271    * @param array $query A search query parsed with Searcher::parseQuery()
   317   {
   272    * @param string $texts A text MySQL query that selects the text as the first column and the index key as the second column
   318   
   273    * @param bool $exact_order If true, only matches results with the terms in the same order as the terms in the query
   319     $where_any = array();
   274    * @return array An associative array of results
   320     foreach ( $query_phrase['any'] as $term )
   275    * @access private
   321     {
   276    */
   322       $term = escape_string_like($term);
   277   function performCoreSearchMySQL($query, $texts, $exact_order = false, $any = false)
   323       if ( !$case_sensitive )
   278   {
   324         $term = strtolower($term);
   279     global $db;
   325       $where_any[] = $term;
   280     $results = Array();
   326     }
   281     if($exact_order)
   327     
   282     {
   328     $where_any = ( count($where_any) > 0 ) ? "( $text_col LIKE '%" . implode("%' OR $text_col LIKE '%", $where_any) . "%' )" : '';
   283       $query = $this->concatQueryTerms($query);
   329     
   284     }
   330     // Also do required columns, but use AND to ensure that all required terms are included
   285     $query['trm'] = array_merge($query['any'], $query['req']);
   331     $where_req = array();
   286     # Find all remotely possible results first
   332     foreach ( $query_phrase['req'] as $term )
   287     $texts = $db->sql_query($texts);
   333     {
   288     if ( !$texts )
   334       $term = escape_string_like($term);
   289       $db->_die('The error is in the search engine.');
   335       if ( !$case_sensitive )
   290     if ( $r = $db->fetchrow_num($texts) )
   336         $term = strtolower($term);
       
   337       $where_req[] = $term;
       
   338     }
       
   339     $and_clause = ( $where_any != '' ) ? 'AND ' : '';
       
   340     $where_req = ( count($where_req) > 0 ) ? "{$and_clause}$text_col LIKE '%" . implode("%' AND $text_col LIKE '%", $where_req) . "%'" : '';
       
   341     
       
   342     $sql = 'SELECT CONCAT("ns=",namespace,";pid=",page_id) AS id FROM ' . table_prefix . "page_text WHERE $where_any $where_req;";
       
   343     if ( !($q = $db->sql_unbuffered_query($sql)) )
       
   344       $db->_die('Error is in perform_search(), includes/search.php, query 2. Parsed query dump follows:<pre>(indexable) ' . htmlspecialchars(print_r($query, true)) . '(non-indexable) ' . htmlspecialchars(print_r($query_phrase, true)) . '</pre>');
       
   345     
       
   346     if ( $row = $db->fetchrow() )
   291     {
   347     {
   292       do
   348       do
   293       {
   349       {
   294         foreach($this->index as $term => $keys)
   350         $id =& $row['id'];
       
   351         if ( isset($scores[$id]) )
   295         {
   352         {
   296           foreach($query['trm'] as $userterm)
   353           $scores[$id]++;
       
   354         }
       
   355         else
       
   356         {
       
   357           $scores[$id] = 1;
       
   358         }
       
   359       }
       
   360       while ( $row = $db->fetchrow() );
       
   361     }
       
   362     $db->free_result();
       
   363   }
       
   364   
       
   365   //
       
   366   // STAGE 4 - SELECT PAGE TEXT AND ELIMINATE NOTS
       
   367   // At this point, we have a complete list of all the possible pages. Now we want to obtain the page text, and within the same query
       
   368   // eliminate any pages containing terms that shouldn't be in there.
       
   369   //
       
   370   
       
   371   // Generate master word list for the highlighter
       
   372   $word_list = array_values(array_merge($query['any'], $query['req'], $query_phrase['any'], $query_phrase['req']));
       
   373   
       
   374   $text_where = array();
       
   375   foreach ( $scores as $page_id => $_ )
       
   376   {
       
   377     $text_where[] = $db->escape($page_id);
       
   378   }
       
   379   $text_where = '( CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'' . implode('\' OR CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'', $text_where) . '\' )';
       
   380   
       
   381   if ( count($query['not']) > 0 )
       
   382     $text_where .= ' AND';
       
   383   
       
   384   $where_not = array();
       
   385   foreach ( $query['not'] as $term )
       
   386   {
       
   387     $term = escape_string_like($term);
       
   388     if ( !$case_sensitive )
       
   389       $term = strtolower($term);
       
   390     $where_not[] = $term;
       
   391   }
       
   392   $where_not = ( count($where_not) > 0 ) ? "$text_col NOT LIKE '%" . implode("%' AND $text_col NOT LIKE '%", $where_not) . "%'" : '';
       
   393   
       
   394   $sql = 'SELECT CONCAT("ns=",t.namespace,";pid=",t.page_id) AS id, t.page_id, t.namespace, CHAR_LENGTH(t.page_text) AS page_length, t.page_text, p.name AS page_name FROM ' . table_prefix . "page_text AS t
       
   395             LEFT JOIN " . table_prefix . "pages AS p
       
   396               ON ( p.urlname = t.page_id AND p.namespace = t.namespace )
       
   397             WHERE $text_where $where_not;";
       
   398   if ( !($q = $db->sql_unbuffered_query($sql)) )
       
   399     $db->_die('Error is in perform_search(), includes/search.php, query 3');
       
   400   
       
   401   $page_data = array();
       
   402   if ( $row = $db->fetchrow() )
       
   403   {
       
   404     do
       
   405     {
       
   406       $row['page_text'] = htmlspecialchars($row['page_text']);
       
   407       $row['page_name'] = htmlspecialchars($row['page_name']);
       
   408       
       
   409       // Highlight results (this is wonderfully automated)
       
   410       $row['page_text'] = highlight_and_clip_search_result($row['page_text'], $word_list, $case_sensitive);
       
   411       if ( strlen($row['page_text']) > 250 && !preg_match('/^\.\.\.(.+)\.\.\.$/', $row['page_text']) )
       
   412       {
       
   413         $row['page_text'] = substr($row['page_text'], 0, 150) . '...';
       
   414       }
       
   415       $row['page_name'] = highlight_search_result($row['page_name'], $word_list, $case_sensitive);
       
   416       
       
   417       $page_data[$row['id']] = $row;
       
   418     }
       
   419     while ( $row = $db->fetchrow() );
       
   420   }
       
   421   $db->free_result();
       
   422   
       
   423   //
       
   424   // STAGE 5 - SPECIAL PAGE TITLE SEARCH
       
   425   // Iterate through $paths->pages and check the titles for search terms. Score accordingly.
       
   426   //
       
   427   
       
   428   foreach ( $paths->pages as $page )
       
   429   {
       
   430     if ( $page['namespace'] != 'Special' )
       
   431       continue;
       
   432     $idstring = 'ns=' . $page['namespace'] . ';pid=' . $page['urlname_nons'];
       
   433     $any = array_merge($query['any'], $query_phrase['any']);
       
   434     foreach ( $any as $term )
       
   435     {
       
   436       if ( $case_sensitive )
       
   437       {
       
   438         if ( strstr($page['name'], $term) || strstr($page['urlname_nons'], $term) )
       
   439         {
       
   440           ( isset($scores[$idstring]) ) ? $scores[$idstring]++ : $scores[$idstring] = 1;
       
   441         }
       
   442       }
       
   443       else
       
   444       {
       
   445         if ( strstr(strtolower($page['name']), strtolower($term)) || strstr(strtolower($page['urlname_nons']), strtolower($term)) )
       
   446         {
       
   447           ( isset($scores[$idstring]) ) ? $scores[$idstring]++ : $scores[$idstring] = 1;
       
   448         }
       
   449       }
       
   450     }
       
   451     if ( isset($scores[$idstring]) )
       
   452     {
       
   453       $page_data[$idstring] = array(
       
   454           'page_name' => $page['name'],
       
   455           'page_text' => '',
       
   456           'page_id' => $page['urlname_nons'],
       
   457           'namespace' => $page['namespace'],
       
   458           'score' => $scores[$idstring],
       
   459           'page_length' => 1,
       
   460           'page_note' => '[Special page]'
       
   461         );
       
   462     }
       
   463   }
       
   464   
       
   465   //
       
   466   // STAGE 6 - SECOND ELIMINATION ROUND
       
   467   // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
       
   468   //
       
   469   
       
   470   $required = array_merge($query['req'], $query_phrase['req']);
       
   471   foreach ( $required as $term )
       
   472   {
       
   473     foreach ( $page_data as $id => $page )
       
   474     {
       
   475       if ( ( $page['namespace'] == 'Special' || ( $page['namespace'] != 'Special' && !strstr($page['page_text'], $term) ) ) && !strstr($page['page_id'], $term) && !strstr($page['page_name'], $term) )
       
   476       {
       
   477         unset($page_data[$id]);
       
   478       }
       
   479     }
       
   480   }
       
   481   
       
   482   // At this point, all of our normal results are in. However, we can also allow plugins to hook into the system and score their own
       
   483   // pages and add text, etc. as necessary.
       
   484   // Plugins are COMPLETELY responsible for using the search terms and handling Boolean logic properly
       
   485   
       
   486   $code = $plugins->setHook('search_global_inner');
       
   487   foreach ( $code as $cmd )
       
   488   {
       
   489     eval($cmd);
       
   490   }
       
   491   
       
   492   // a marvelous debugging aid :-)
       
   493   // die('<pre>' . htmlspecialchars(print_r($page_data, true)) . '</pre>');
       
   494   
       
   495   //
       
   496   // STAGE 7 - HIGHLIGHT, TRIM, AND SCORE RESULTS
       
   497   // We now have the complete results of the search. We need to trim text down to show only portions of the page containing search
       
   498   // terms, highlight any search terms within the page, and sort the final results array in descending order of score.
       
   499   //
       
   500   
       
   501   // Sort scores array
       
   502   arsort($scores);
       
   503   
       
   504   // Divisor for calculating relevance scores
       
   505   $divisor = count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query_phrase['not']);
       
   506   
       
   507   foreach ( $scores as $page_id => $score )
       
   508   {
       
   509     if ( !isset($page_data[$page_id]) )
       
   510       // It's possible that $scores contains a score for a page that was later eliminated because it contained a disallowed term
       
   511       continue;
       
   512       
       
   513     // Make a copy of the datum, then delete the original (it frees up a LOT of RAM)
       
   514     $datum = $page_data[$page_id];
       
   515     unset($page_data[$page_id]);
       
   516     
       
   517     // This is an internal value used for sorting - it's no longer needed.
       
   518     unset($datum['id']);
       
   519     
       
   520     // Calculate score
       
   521     if ( $score > $divisor )
       
   522       $score = $divisor;
       
   523     $datum['score'] = round($score / $divisor, 2) * 100;
       
   524     
       
   525     // Store it in our until-now-unused results array
       
   526     $results[] = $datum;
       
   527   }
       
   528   
       
   529   // Our work here is done. :-D
       
   530   return $results;
       
   531 }
       
   532 
       
   533 /**
       
   534  * Parses a search query into an associative array. The resultant array will be filled with the following values, each an array:
       
   535  *   any: Search terms that can optionally be present
       
   536  *   req: Search terms that must be present
       
   537  *   not: Search terms that should not be present
       
   538  * @param string Search query
       
   539  * @param array Will be filled with parser warnings, such as query too short, words too short, etc.
       
   540  * @return array
       
   541  */
       
   542 
       
   543 function parse_search_query($query, &$warnings)
       
   544 {
       
   545   $stopwords = get_stopwords();
       
   546   $ret = array(
       
   547     'any' => array(),
       
   548     'req' => array(),
       
   549     'not' => array()
       
   550     );
       
   551   $warnings = array();
       
   552   $terms = array();
       
   553   $in_quote = false;
       
   554   $start_term = 0;
       
   555   $just_finished = false;
       
   556   for ( $i = 0; $i < strlen($query); $i++ )
       
   557   {
       
   558     $chr = $query{$i};
       
   559     $prev = ( $i > 0 ) ? $query{ $i - 1 } : '';
       
   560     $next = ( ( $i + 1 ) < strlen($query) ) ? $query{ $i + 1 } : '';
       
   561     
       
   562     if ( ( $chr == ' ' && !$in_quote ) || ( $i + 1 == strlen ( $query ) ) )
       
   563     {
       
   564       $len = ( $next == '' ) ? $i + 1 : $i - $start_term;
       
   565       $word = substr ( $query, $start_term, $len );
       
   566       $terms[] = $word;
       
   567       $start_term = $i + 1;
       
   568     }
       
   569     
       
   570     elseif ( $chr == '"' && $in_quote && $prev != '\\' )
       
   571     {
       
   572       $word = substr ( $query, $start_term, $i - $start_term + 1 );
       
   573       $start_pos = ( $next == ' ' ) ? $i + 2 : $i + 1;
       
   574       $in_quote = false;
       
   575     }
       
   576     
       
   577     elseif ( $chr == '"' && !$in_quote )
       
   578     {
       
   579       $in_quote = true;
       
   580       $start_pos = $i;
       
   581     }
       
   582     
       
   583   }
       
   584   
       
   585   $ticker = 0;
       
   586   
       
   587   foreach ( $terms as $element => $__unused )
       
   588   {
       
   589     $atom =& $terms[$element];
       
   590     
       
   591     $ticker++;
       
   592     
       
   593     if ( $ticker == 20 )
       
   594     {
       
   595       $warnings[] = 'Some of your search terms were excluded because searches are limited to 20 terms to prevent excessive server load.';
       
   596       break;
       
   597     }
       
   598     
       
   599     if ( substr ( $atom, 0, 2 ) == '+"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' )
       
   600     {
       
   601       $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) );
       
   602       if ( strlen ( $word ) < 2 || in_array($word, $stopwords) )
       
   603       {
       
   604         $warnings[] = 'One or more of your search terms was excluded because either it was less than 2 characters in length or is a common word (a stopword) that is typically found on a large number of pages. Examples of stopwords include "the", "this", "which", "with", etc.';
       
   605         $ticker--;
       
   606         continue;
       
   607       }
       
   608       if(in_array($word, $ret['req']))
       
   609       {
       
   610         $warnings[] = 'One or more of your search terms was excluded because duplicate terms were encountered.';
       
   611         $ticker--;
       
   612         continue;
       
   613       }
       
   614       $ret['req'][] = $word;
       
   615     }
       
   616     elseif ( substr ( $atom, 0, 2 ) == '-"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' )
       
   617     {
       
   618       $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) );
       
   619       if ( strlen ( $word ) < 4 )
       
   620       {
       
   621         $warnings[] = 'One or more of your search terms was excluded because terms must be at least 4 characters in length.';
       
   622         $ticker--;
       
   623         continue;
       
   624       }
       
   625       if(in_array($word, $ret['not']))
       
   626       {
       
   627         $warnings[] = 'One or more of your search terms was excluded because duplicate terms were encountered.';
       
   628         $ticker--;
       
   629         continue;
       
   630       }
       
   631       $ret['not'][] = $word;
       
   632     }
       
   633     elseif ( substr ( $atom, 0, 1 ) == '+' )
       
   634     {
       
   635       $word = substr ( $atom, 1 );
       
   636       if ( strlen ( $word ) < 2 || in_array($word, $stopwords) )
       
   637       {
       
   638         $warnings[] = 'One or more of your search terms was excluded because either it was less than 2 characters in length or is a common word (a stopword) that is typically found on a large number of pages. Examples of stopwords include "the", "this", "which", "with", etc.';
       
   639         $ticker--;
       
   640         continue;
       
   641       }
       
   642       if(in_array($word, $ret['req']))
       
   643       {
       
   644         $warnings[] = 'One or more of your search terms was excluded because duplicate terms were encountered.';
       
   645         $ticker--;
       
   646         continue;
       
   647       }
       
   648       $ret['req'][] = $word;
       
   649     }
       
   650     elseif ( substr ( $atom, 0, 1 ) == '-' )
       
   651     {
       
   652       $word = substr ( $atom, 1 );
       
   653       if ( strlen ( $word ) < 2 || in_array($word, $stopwords) )
       
   654       {
       
   655         $warnings[] = 'One or more of your search terms was excluded because either it was less than 2 characters in length or is a common word (a stopword) that is typically found on a large number of pages. Examples of stopwords include "the", "this", "which", "with", etc.';
       
   656         $ticker--;
       
   657         continue;
       
   658       }
       
   659       if(in_array($word, $ret['not']))
       
   660       {
       
   661         $warnings[] = 'One or more of your search terms was excluded because duplicate terms were encountered.';
       
   662         $ticker--;
       
   663         continue;
       
   664       }
       
   665       $ret['not'][] = $word;
       
   666     }
       
   667     elseif ( substr ( $atom, 0, 1 ) == '"' && substr ( $atom, ( strlen($atom) - 1 ), 1 ) == '"' )
       
   668     {
       
   669       $word = substr ( $atom, 1, ( strlen ( $atom ) - 2 ) );
       
   670       if ( strlen ( $word ) < 2 || in_array($word, $stopwords) )
       
   671       {
       
   672         $warnings[] = 'One or more of your search terms was excluded because either it was less than 2 characters in length or is a common word (a stopword) that is typically found on a large number of pages. Examples of stopwords include "the", "this", "which", "with", etc.';
       
   673         $ticker--;
       
   674         continue;
       
   675       }
       
   676       if(in_array($word, $ret['any']))
       
   677       {
       
   678         $warnings[] = 'One or more of your search terms was excluded because duplicate terms were encountered.';
       
   679         $ticker--;
       
   680         continue;
       
   681       }
       
   682       $ret['any'][] = $word;
       
   683     }
       
   684     else
       
   685     {
       
   686       $word = $atom;
       
   687       if ( strlen ( $word ) < 2 || in_array($word, $stopwords) )
       
   688       {
       
   689         $warnings[] = 'One or more of your search terms was excluded because either it was less than 2 characters in length or is a common word (a stopword) that is typically found on a large number of pages. Examples of stopwords include "the", "this", "which", "with", etc.';
       
   690         $ticker--;
       
   691         continue;
       
   692       }
       
   693       if(in_array($word, $ret['any']))
       
   694       {
       
   695         $warnings[] = 'One or more of your search terms was excluded because duplicate terms were encountered.';
       
   696         $ticker--;
       
   697         continue;
       
   698       }
       
   699       $ret['any'][] = $word;
       
   700     }
       
   701   }
       
   702   return $ret;
       
   703 }
       
   704 
       
   705 /**
       
   706  * Escapes a string for use in a LIKE clause.
       
   707  * @param string
       
   708  * @return string
       
   709  */
       
   710 
       
   711 function escape_string_like($string)
       
   712 {
       
   713   global $db, $session, $paths, $template, $plugins; // Common objects
       
   714   $string = $db->escape($string);
       
   715   $string = str_replace(array('%', '_'), array('\%', '\_'), $string);
       
   716   return $string;
       
   717 }
       
   718 
       
   719 /**
       
   720  * Wraps <highlight></highlight> tags around all words in both the specified array. Does not perform any clipping.
       
   721  * @param string Text to process
       
   722  * @param array Word list
       
   723  * @param bool If true, searches case-sensitively when highlighting words
       
   724  * @return string
       
   725  */
       
   726 
       
   727 function highlight_search_result($pt, $words, $case_sensitive = false)
       
   728 {
       
   729   $words2 = array();
       
   730   for ( $i = 0; $i < sizeof($words); $i++)
       
   731   {
       
   732     if(!empty($words[$i]))
       
   733       $words2[] = preg_quote($words[$i]);
       
   734   }
       
   735   
       
   736   $flag = ( $case_sensitive ) ? '' : 'i';
       
   737   $regex = '/(' . implode('|', $words2) . ')/' . $flag;
       
   738   $pt = preg_replace($regex, '<highlight>\\1</highlight>', $pt);
       
   739   
       
   740   return $pt;
       
   741 }
       
   742 
       
   743 /**
       
   744  * Wraps <highlight></highlight> tags around every word from the specified array that occurs in the specified text, and clips the text to
       
   745  * an appropriate length.
       
   746  * @param string Text to process
       
   747  * @param array Word list
       
   748  * @param bool If true, searches case-sensitively when highlighting words
       
   749  * @return string
       
   750  */
       
   751 
       
   752 function highlight_and_clip_search_result($pt, $words, $case_sensitive = false)
       
   753 {
       
   754   $cut_off = false;
       
   755   
       
   756   $space_chars = Array("\t", "\n", "\r", " ");
       
   757   
       
   758   $pt = highlight_search_result($pt, $words, $case_sensitive);
       
   759   
       
   760   foreach ( $words as $word )
       
   761   {
       
   762     // Boldface searched words
       
   763     $ptlen = strlen($pt);
       
   764     for ( $i = 0; $i < $ptlen; $i++ )
       
   765     {
       
   766       $len = strlen($word);
       
   767       if ( strtolower(substr($pt, $i, $len)) == strtolower($word) )
       
   768       {
       
   769         $chunk1 = substr($pt, 0, $i);
       
   770         $chunk2 = substr($pt, $i, $len);
       
   771         $chunk3 = substr($pt, ( $i + $len ));
       
   772         $pt = $chunk1 . $chunk2 . $chunk3;
       
   773         $ptlen = strlen($pt);
       
   774         // Cut off text to 150 chars or so
       
   775         if ( !$cut_off )
       
   776         {
       
   777           $cut_off = true;
       
   778           if ( $i - 75 > 0 )
   297           {
   779           {
   298             if($this->convertCase($userterm) == $this->convertCase($term))
   780             // Navigate backwards until a space character is found
       
   781             $chunk = substr($pt, 0, ( $i - 75 ));
       
   782             $final_chunk = $chunk;
       
   783             for ( $j = strlen($chunk); $j > 0; $j = $j - 1 )
   299             {
   784             {
   300               $k = explode(',', $keys);
   785               if ( in_array($chunk{$j}, $space_chars) )
   301               foreach($k as $idxkey)
       
   302               {
   786               {
   303                 $row[0] = $r[0];
   787                 $final_chunk = substr($chunk, $j + 1);
   304                 $row[1] = $r[1];
   788                 break;
   305                 if(!isset($row[1]))
       
   306                 {
       
   307                   echo('PHP PARSER BUG: $row[1] is set but not set... includes/search.php:'.__LINE__);
       
   308                   $GLOBALS['template']->footer();
       
   309                   exit;
       
   310                 }
       
   311                 if($row[1] == $idxkey)
       
   312                   $results[$idxkey] = $row[0];
       
   313                 else
       
   314                 {
       
   315                   if(preg_match('#^([0-9]+)$#', $idxkey))
       
   316                   {
       
   317                     $idxkey = intval($idxkey);
       
   318                     if($row[1] == $idxkey) $results[$idxkey] = $row[0];
       
   319                   }
       
   320                 }
       
   321               }
   789               }
   322             }
   790             }
       
   791             $mid_chunk = substr($pt, ( $i - 75 ), 75);
       
   792             
       
   793             $clipped = '...' . $final_chunk . $mid_chunk . $chunk2;
       
   794             
       
   795             $chunk = substr($pt, ( $i + strlen($chunk2) + 75 ));
       
   796             $final_chunk = $chunk;
       
   797             for ( $j = 0; $j < strlen($chunk); $j++ )
       
   798             {
       
   799               if ( in_array($chunk{$j}, $space_chars) )
       
   800               {
       
   801                 $final_chunk = substr($chunk, 0, $j);
       
   802                 break;
       
   803               }
       
   804             }
       
   805             
       
   806             $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 );
       
   807             
       
   808             $clipped .= $end_chunk . $final_chunk . '...';
       
   809             
       
   810             $pt = $clipped;
   323           }
   811           }
       
   812           else if ( strlen($pt) > 200 )
       
   813           {
       
   814             $mid_chunk = substr($pt, ( $i - 75 ), 75);
       
   815             
       
   816             $clipped = $chunk1 . $chunk2;
       
   817             
       
   818             $chunk = substr($pt, ( $i + strlen($chunk2) + 75 ));
       
   819             $final_chunk = $chunk;
       
   820             for ( $j = 0; $j < strlen($chunk); $j++ )
       
   821             {
       
   822               if ( in_array($chunk{$j}, $space_chars) )
       
   823               {
       
   824                 $final_chunk = substr($chunk, 0, $j);
       
   825                 break;
       
   826               }
       
   827             }
       
   828             
       
   829             $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 );
       
   830             
       
   831             $clipped .= $end_chunk . $final_chunk . '...';
       
   832             
       
   833             $pt = $clipped;
       
   834             
       
   835           }
       
   836           break 2;
   324         }
   837         }
   325         // Quoted terms
   838       }
   326         foreach($query['trm'] as $userterm)
   839     }
   327         {
   840     $cut_off = false;
   328           if(!preg_match('/[\s"\'~`!@#\$%\^&\*\(\)\{\}:;<>,.\/\?_-]/', $userterm)) continue;
   841   }
   329           if(strstr($this->convertCase($r[0]), $this->convertCase($userterm)))
   842   return $pt;
   330           {
       
   331             // We have a match!
       
   332             if(!isset($results[$r[1]])) $results[$r[1]] = $r[0];
       
   333           }
       
   334         }
       
   335       } while( $r = $db->fetchrow_num($texts) );
       
   336     }
       
   337     // Remove excluded terms
       
   338     foreach($results as $k => $r)
       
   339     {
       
   340       foreach($query['not'] as $not)
       
   341       {
       
   342         if(strstr($this->convertCase($r), $this->convertCase($not))) unset($results[$k]);
       
   343       }
       
   344     }
       
   345     if(!$any)
       
   346     {
       
   347       // Remove results not containing all terms
       
   348       foreach($results as $k => $r)
       
   349       {
       
   350         foreach($query['any'] as $term)
       
   351         {
       
   352           if(!strstr($this->convertCase($r), $this->convertCase($term))) unset($results[$k]);
       
   353         }
       
   354       }
       
   355     }
       
   356     // Remove results not containing all terms
       
   357     foreach($results as $k => $r)
       
   358     {
       
   359       foreach($query['req'] as $term)
       
   360       {
       
   361         if(!strstr($this->convertCase($r), $this->convertCase($term))) unset($results[$k]);
       
   362       }
       
   363     }
       
   364     return $results;
       
   365   }
       
   366   
       
   367   function concatQueryTerms($query)
       
   368   {
       
   369     $tmp = implode(' ', $query['any']);
       
   370     unset($query['any']);
       
   371     $query['any'] = Array(0 => $tmp);
       
   372     return $query;
       
   373   }
       
   374   
       
   375   /**
       
   376    * Builds a basic assoc array with a more organized version of the query
       
   377    */
       
   378   
       
   379   function parseQuery($query)
       
   380   {
       
   381     $ret = array(
       
   382       'any' => array(),
       
   383       'req' => array(),
       
   384       'not' => array()
       
   385       );
       
   386     $terms = array();
       
   387     $in_quote = false;
       
   388     $start_term = 0;
       
   389     $just_finished = false;
       
   390     for ( $i = 0; $i < strlen($query); $i++ )
       
   391     {
       
   392       $chr = $query{$i};
       
   393       $prev = ( $i > 0 ) ? $query{ $i - 1 } : '';
       
   394       $next = ( ( $i + 1 ) < strlen($query) ) ? $query{ $i + 1 } : '';
       
   395       
       
   396       if ( ( $chr == ' ' && !$in_quote ) || ( $i + 1 == strlen ( $query ) ) )
       
   397       {
       
   398         $len = ( $next == '' ) ? $i + 1 : $i - $start_term;
       
   399         $word = substr ( $query, $start_term, $len );
       
   400         $terms[] = $word;
       
   401         $start_term = $i + 1;
       
   402       }
       
   403       
       
   404       elseif ( $chr == '"' && $in_quote && $prev != '\\' )
       
   405       {
       
   406         $word = substr ( $query, $start_term, $i - $start_term + 1 );
       
   407         $start_pos = ( $next == ' ' ) ? $i + 2 : $i + 1;
       
   408         $in_quote = false;
       
   409       }
       
   410       
       
   411       elseif ( $chr == '"' && !$in_quote )
       
   412       {
       
   413         $in_quote = true;
       
   414         $start_pos = $i;
       
   415       }
       
   416       
       
   417     }
       
   418     
       
   419     $ticker = 0;
       
   420     
       
   421     foreach ( $terms as $element => $__unused )
       
   422     {
       
   423       $atom =& $terms[$element];
       
   424       
       
   425       $ticker++;
       
   426       
       
   427       if ( $ticker == 20 )
       
   428       {
       
   429         $this->warn('Some of your search terms were excluded because searches are limited to 20 terms to prevent excessive server load.');
       
   430         break;
       
   431       }
       
   432       
       
   433       if ( substr ( $atom, 0, 2 ) == '+"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' )
       
   434       {
       
   435         $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) );
       
   436         if ( strlen ( $word ) < 4 )
       
   437         {
       
   438           $this->warn('One or more of your search terms was excluded because terms must be at least 4 characters in length.');
       
   439           $ticker--;
       
   440           continue;
       
   441         }
       
   442         if(in_array($word, $ret['req']))
       
   443         {
       
   444           $this->warn('One or more of your search terms was excluded because duplicate terms were encountered.');
       
   445           $ticker--;
       
   446           continue;
       
   447         }
       
   448         $ret['req'][] = $word;
       
   449       }
       
   450       elseif ( substr ( $atom, 0, 2 ) == '-"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' )
       
   451       {
       
   452         $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) );
       
   453         if ( strlen ( $word ) < 4 )
       
   454         {
       
   455           $this->warn('One or more of your search terms was excluded because terms must be at least 4 characters in length.');
       
   456           $ticker--;
       
   457           continue;
       
   458         }
       
   459         if(in_array($word, $ret['not']))
       
   460         {
       
   461           $this->warn('One or more of your search terms was excluded because duplicate terms were encountered.');
       
   462           $ticker--;
       
   463           continue;
       
   464         }
       
   465         $ret['not'][] = $word;
       
   466       }
       
   467       elseif ( substr ( $atom, 0, 1 ) == '+' )
       
   468       {
       
   469         $word = substr ( $atom, 1 );
       
   470         if ( strlen ( $word ) < 4 )
       
   471         {
       
   472           $this->warn('One or more of your search terms was excluded because terms must be at least 4 characters in length.');
       
   473           $ticker--;
       
   474           continue;
       
   475         }
       
   476         if(in_array($word, $ret['req']))
       
   477         {
       
   478           $this->warn('One or more of your search terms was excluded because duplicate terms were encountered.');
       
   479           $ticker--;
       
   480           continue;
       
   481         }
       
   482         $ret['req'][] = $word;
       
   483       }
       
   484       elseif ( substr ( $atom, 0, 1 ) == '-' )
       
   485       {
       
   486         $word = substr ( $atom, 1 );
       
   487         if ( strlen ( $word ) < 4 )
       
   488         {
       
   489           $this->warn('One or more of your search terms was excluded because terms must be at least 4 characters in length.');
       
   490           $ticker--;
       
   491           continue;
       
   492         }
       
   493         if(in_array($word, $ret['not']))
       
   494         {
       
   495           $this->warn('One or more of your search terms was excluded because duplicate terms were encountered.');
       
   496           $ticker--;
       
   497           continue;
       
   498         }
       
   499         $ret['not'][] = $word;
       
   500       }
       
   501       elseif ( substr ( $atom, 0, 1 ) == '"' && substr ( $atom, ( strlen($atom) - 1 ), 1 ) == '"' )
       
   502       {
       
   503         $word = substr ( $atom, 1, ( strlen ( $atom ) - 2 ) );
       
   504         if ( strlen ( $word ) < 4 )
       
   505         {
       
   506           $this->warn('One or more of your search terms was excluded because terms must be at least 4 characters in length.');
       
   507           $ticker--;
       
   508           continue;
       
   509         }
       
   510         if(in_array($word, $ret['any']))
       
   511         {
       
   512           $this->warn('One or more of your search terms was excluded because duplicate terms were encountered.');
       
   513           $ticker--;
       
   514           continue;
       
   515         }
       
   516         $ret['any'][] = $word;
       
   517       }
       
   518       else
       
   519       {
       
   520         $word = $atom;
       
   521         if ( strlen ( $word ) < 4 )
       
   522         {
       
   523           $this->warn('One or more of your search terms was excluded because terms must be at least 4 characters in length.');
       
   524           $ticker--;
       
   525           continue;
       
   526         }
       
   527         if(in_array($word, $ret['any']))
       
   528         {
       
   529           $this->warn('One or more of your search terms was excluded because duplicate terms were encountered.');
       
   530           $ticker--;
       
   531           continue;
       
   532         }
       
   533         $ret['any'][] = $word;
       
   534       }
       
   535     }
       
   536     return $ret;
       
   537   }
       
   538   
       
   539   function highlightResults($query, $starttag = '<b>', $endtag = '</b>')
       
   540   {
       
   541     $query['trm'] = array_merge($query['any'], $query['req']);
       
   542     //die('<pre>'.print_r($query, true).'</pre>');
       
   543     foreach($query['trm'] as $q)
       
   544     {
       
   545       foreach($this->results as $k => $r)
       
   546       {
       
   547         $startplace = 0;
       
   548         //$this->results[$k] = htmlspecialchars($this->results[$k]);
       
   549         for($i = 0; $i < strlen($r); $i++)
       
   550         {
       
   551           $word = substr($r, $i, strlen($q));
       
   552           if($this->convertCase($word) == $this->convertCase($q))
       
   553           {
       
   554             $word = $starttag . $word . $endtag;
       
   555             $this->results[$k] = substr($r, 0, $i) . $word . substr($r, $i + strlen($q), strlen($r)+999999);
       
   556             $startplace = $i - 75;
       
   557             if($startplace < 0) $startplace = 0;
       
   558             $this->results[$k] = '...'.trim(substr($this->results[$k], $startplace, strlen($word) + 150)).'...';
       
   559             continue 2;
       
   560           }
       
   561         }
       
   562       }
       
   563     }
       
   564   }
       
   565   
       
   566 }
   843 }
   567 
   844 
   568 /**
   845 /**
   569  * Developer-friendly way to do searches. :-) Uses the MySQL FULLTEXT index type.
   846  * Returns a list of words that shouldn't under most circumstances be indexed for searching. Kudos to MySQL.
   570  * @package Enano
   847  * @return array
   571  * @subpackage Search
   848  * @see http://dev.mysql.com/doc/refman/5.0/en/fulltext-stopwords.html
   572  */
   849  */
   573 
   850 
   574 class MySQL_Fulltext_Search {
   851 function get_stopwords()
   575   
   852 {
   576   /**
   853   static $stopwords;
   577    * Performs a search.
   854   if ( is_array($stopwords) )
   578    * @param string The search query
   855     return $stopwords;
   579    * @return resource MySQL result resource - this is an UNBUFFERED query.
   856   
   580    */
   857   $stopwords = array('a\'s', 'able', 'after', 'afterwards', 'again',
   581   
   858                      'against', 'ain\'t', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always',
   582   function search($query)
   859                      'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway',
   583   {
   860                      'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'aren\'t', 'around', 'as', 'aside',
   584     global $db, $session, $paths, $template, $plugins; // Common objects
   861                      'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes',
   585     
   862                      'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best',
   586     $fulltext_col = 'MATCH(t.page_id,t.namespace,p.name,t.page_text) AGAINST (\'' . $db->escape($query) . '\' IN BOOLEAN MODE)';
   863                      'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'c\'mon', 'c\'s', 'came', 'can', 'can\'t', 'cannot',
   587     $sql = "SELECT t.page_text,CONCAT('ns=',t.namespace,';pid=',t.page_id) AS page_identifier, $fulltext_col AS score, CHAR_LENGTH(t.page_text) AS length FROM ".table_prefix."page_text AS t
   864                      'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning',
   588               LEFT JOIN ".table_prefix."pages AS p
   865                      'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could',
   589                 ON ( p.urlname=t.page_id AND p.namespace=t.namespace)
   866                      'couldn\'t', 'course', 'despite', 'did', 'didn\'t', 'different', 'do',
   590               WHERE $fulltext_col > 0
   867                      'does', 'doesn\'t', 'doing', 'don\'t', 'done', 'down', 'downwards', 'during', 'each', 'edu', 'eg', 'eight',
   591                 AND p.visible=1
   868                      'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every',
   592               ORDER BY score DESC;";
   869                      'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifth',
   593     $q = $db->sql_unbuffered_query($sql);
   870                      'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 'from',
   594     if ( !$q )
   871                      'further', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got',
   595       $db->_die();
   872                      'gotten', 'had', 'hadn\'t', 'happens', 'hardly', 'has', 'hasn\'t', 'have', 'haven\'t', 'having',
   596     
   873                      'he', 'he\'s', 'hello', 'help', 'hence', 'her', 'here', 'here\'s', 'hereafter', 'hereby', 'herein', 'hereupon',
   597     return $q;
   874                      'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'i\'d',
   598   }
   875                      'i\'ll', 'i\'m', 'i\'ve', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate',
   599   
   876                      'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'isn\'t', 'it', 'it\'d', 'it\'ll',
   600   function highlight_result($query, $result)
   877                      'it\'s', 'its', 'itself', 'just', 'keep', 'keeps', 'kept', 'know', 'knows', 'known', 'last', 'lately', 'later',
   601   {
   878                      'latter', 'latterly', 'least', 'less', 'lest', 'let', 'let\'s', 'like', 'liked', 'likely', 'little', 'look',
   602     global $db, $session, $paths, $template, $plugins; // Common objects
   879                      'looking', 'looks', 'ltd', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more',
   603     $search = new Searcher();
   880                      'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary',
   604     $parsed_query = $search->parseQuery($query);
   881                      'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone',
   605     return $this->highlight_result_inner($query, $result);
   882                      'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok',
   606   }
   883                      'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our',
   607   
   884                      'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'particular', 'particularly', 'per', 'perhaps',
   608   function highlight_result_inner($query, $fulltext, $starttag = '<b>', $endtag = '</b>')
   885                      'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'que', 'quite', 'qv', 'rather', 'rd',
   609   {
   886                      're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said',
   610     $result = false;
   887                      'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems',
   611     $query['trm'] = array_merge($query['any'], $query['req']);
   888                      'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should',
   612     //die('<pre>'.print_r($query, true).'</pre>');
   889                      'shouldn\'t', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes',
   613     foreach($query['trm'] as $q)
   890                      'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup',
   614     {
   891                      'sure', 't\'s', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'that\'s',
   615       $startplace = 0;
   892                      'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'there\'s', 'thereafter',
   616       //$this->results[$k] = htmlspecialchars($this->results[$k]);
   893                      'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re',
   617       for($i = 0; $i < strlen($r); $i++)
   894                      'they\'ve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout',
   618       {
   895                      'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying',
   619         $word = substr($r, $i, strlen($q));
   896                      'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use',
   620         if($this->convertCase($word) == $this->convertCase($q))
   897                      'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants',
   621         {
   898                      'was', 'wasn\'t', 'way', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'welcome', 'well', 'went', 'were', 'weren\'t',
   622           $word = $starttag . $word . $endtag;
   899                      'what', 'what\'s', 'whatever', 'when', 'whence', 'whenever', 'where', 'where\'s', 'whereafter', 'whereas',
   623           $result = substr($fulltext, 0, $i) . $word . substr($r, $i + strlen($q), strlen($r)+99999999);
   900                      'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'who\'s', 'whoever',
   624           $startplace = $i - 75;
   901                      'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'won\'t', 'wonder',
   625           if($startplace < 0) $startplace = 0;
   902                      'would', 'would', 'wouldn\'t', 'yes', 'yet', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours',
   626           $result = '...'.trim(substr($result, $startplace, strlen($word) + 150)).'...';
   903                      'yourself', 'yourselves', 'zero');
   627           continue 2;
   904   return $stopwords;
   628         }
       
   629       }
       
   630     }
       
   631     return $result;
       
   632   }
       
   633   
       
   634 }
   905 }
   635 
   906 
   636 ?>
   907 ?>