includes/search.php
changeset 292 b3cfaf0a505c
parent 272 e0ec986c0af3
child 320 112debff64bd
equal deleted inserted replaced
291:a1d0846c4504 292:b3cfaf0a505c
    17  * Implementation of array_merge() that preserves key names. $arr2 takes precedence over $arr1.
    17  * Implementation of array_merge() that preserves key names. $arr2 takes precedence over $arr1.
    18  * @param array $arr1
    18  * @param array $arr1
    19  * @param array $arr2
    19  * @param array $arr2
    20  * @return array
    20  * @return array
    21  */
    21  */
    22  
    22 
    23 function enano_safe_array_merge($arr1, $arr2)
    23 function enano_safe_array_merge($arr1, $arr2)
    24 {
    24 {
    25   $arr3 = $arr1;
    25   $arr3 = $arr1;
    26   foreach($arr2 as $k => $v)
    26   foreach($arr2 as $k => $v)
    27   {
    27   {
    32 
    32 
    33 /**
    33 /**
    34  * In Enano versions prior to 1.0.2, this class provided a search function that was keyword-based and allowed boolean searches. It was
    34  * In Enano versions prior to 1.0.2, this class provided a search function that was keyword-based and allowed boolean searches. It was
    35  * cut from Coblynau and replaced with perform_search(), later in this file, because of speed issues. Now mostly deprecated. The only
    35  * cut from Coblynau and replaced with perform_search(), later in this file, because of speed issues. Now mostly deprecated. The only
    36  * thing remaining is the buildIndex function, which is still used by the path manager and the new search framework.
    36  * thing remaining is the buildIndex function, which is still used by the path manager and the new search framework.
    37  * 
    37  *
    38  * @package Enano
    38  * @package Enano
    39  * @subpackage Page management frontend
    39  * @subpackage Page management frontend
    40  * @license GNU General Public License <http://enanocms.org/Special:GNU_General_Public_License>
    40  * @license GNU General Public License <http://enanocms.org/Special:GNU_General_Public_License>
    41  */
    41  */
    42 
    42 
    43 class Searcher
    43 class Searcher
    44 {
    44 {
    45   
    45 
    46   var $results;
    46   var $results;
    47   var $index;
    47   var $index;
    48   var $warnings;
    48   var $warnings;
    49   var $match_case = false;
    49   var $match_case = false;
    50   
    50 
    51   function buildIndex($texts)
    51   function buildIndex($texts)
    52   {
    52   {
    53     $this->index = Array();
    53     $this->index = Array();
    54     $stopwords = get_stopwords();
    54     $stopwords = get_stopwords();
    55     
    55 
    56     foreach($texts as $i => $l)
    56     foreach($texts as $i => $l)
    57     {
    57     {
    58       $seed = md5(microtime(true) . mt_rand());
    58       $seed = md5(microtime(true) . mt_rand());
    59       $texts[$i] = str_replace("'", 'xxxApoS'.$seed.'xxx', $texts[$i]);
    59       $texts[$i] = str_replace("'", 'xxxApoS'.$seed.'xxx', $texts[$i]);
    60       $texts[$i] = preg_replace('#([\W_]+)#i', ' ', $texts[$i]);
    60       $texts[$i] = preg_replace('#([\W_]+)#i', ' ', $texts[$i]);
   117 
   117 
   118 function perform_search($query, &$warnings, $case_sensitive = false)
   118 function perform_search($query, &$warnings, $case_sensitive = false)
   119 {
   119 {
   120   global $db, $session, $paths, $template, $plugins; // Common objects
   120   global $db, $session, $paths, $template, $plugins; // Common objects
   121   $warnings = array();
   121   $warnings = array();
   122   
   122 
   123   $query = parse_search_query($query, $warnings);
   123   $query = parse_search_query($query, $warnings);
   124   
   124 
   125   // Segregate search terms containing spaces
   125   // Segregate search terms containing spaces
   126   $query_phrase = array(
   126   $query_phrase = array(
   127     'any' => array(),
   127     'any' => array(),
   128     'req' => array()
   128     'req' => array()
   129     );
   129     );
   130   
   130 
   131   foreach ( $query['any'] as $i => $_ )
   131   foreach ( $query['any'] as $i => $_ )
   132   {
   132   {
   133     $term =& $query['any'][$i];
   133     $term =& $query['any'][$i];
   134     $term = trim($term);
   134     $term = trim($term);
   135     // the indexer only indexes words a-z with apostrophes
   135     // the indexer only indexes words a-z with apostrophes
   139       unset($term, $query['any'][$i]);
   139       unset($term, $query['any'][$i]);
   140     }
   140     }
   141   }
   141   }
   142   unset($term);
   142   unset($term);
   143   $query['any'] = array_values($query['any']);
   143   $query['any'] = array_values($query['any']);
   144   
   144 
   145   foreach ( $query['req'] as $i => $_ )
   145   foreach ( $query['req'] as $i => $_ )
   146   {
   146   {
   147     $term =& $query['req'][$i];
   147     $term =& $query['req'][$i];
   148     $term = trim($term);
   148     $term = trim($term);
   149     if ( preg_match('/[^A-Za-z\']/', $term) )
   149     if ( preg_match('/[^A-Za-z\']/', $term) )
   152       unset($term, $query['req'][$i]);
   152       unset($term, $query['req'][$i]);
   153     }
   153     }
   154   }
   154   }
   155   unset($term);
   155   unset($term);
   156   $query['req'] = array_values($query['req']);
   156   $query['req'] = array_values($query['req']);
   157   
   157 
   158   $results = array();
   158   $results = array();
   159   $scores = array();
   159   $scores = array();
   160   
   160   $ns_list = '(' . implode('|', array_keys($paths->nslist)) . ')';
       
   161 
   161   // FIXME: Update to use FULLTEXT algo when available.
   162   // FIXME: Update to use FULLTEXT algo when available.
   162   
   163 
   163   // Build an SQL query to load from the index table
   164   // Build an SQL query to load from the index table
   164   if ( count($query['any']) < 1 && count($query['req']) < 1 && count($query_phrase['any']) < 1 && count($query_phrase['req']) < 1 )
   165   if ( count($query['any']) < 1 && count($query['req']) < 1 && count($query_phrase['any']) < 1 && count($query_phrase['req']) < 1 )
   165   {
   166   {
   166     // This is both because of technical restrictions and devastation that would occur on shared servers/large sites.
   167     // This is both because of technical restrictions and devastation that would occur on shared servers/large sites.
   167     $warnings[] = 'You need to have at least one keyword in your search query. Searching only for pages not containing a term is not allowed.';
   168     $warnings[] = 'You need to have at least one keyword in your search query. Searching only for pages not containing a term is not allowed.';
   168     return array();
   169     return array();
   169   }
   170   }
   170   
   171 
   171   //
   172   //
   172   // STAGE 1
   173   // STAGE 1
   173   // Get all possible result pages from the search index. Tally which pages have the most words, and later sort them by boolean relevance
   174   // Get all possible result pages from the search index. Tally which pages have the most words, and later sort them by boolean relevance
   174   //
   175   //
   175   
   176 
   176   // Skip this if no indexable words are included
   177   // Skip this if no indexable words are included
   177   
   178 
   178   if ( count($query['any']) > 0 || count($query['req']) > 0 )
   179   if ( count($query['any']) > 0 || count($query['req']) > 0 )
   179   {
   180   {
   180     $where_any = array();
   181     $where_any = array();
   181     foreach ( $query['any'] as $term )
   182     foreach ( $query['any'] as $term )
   182     {
   183     {
   190       $term = escape_string_like($term);
   191       $term = escape_string_like($term);
   191       if ( !$case_sensitive )
   192       if ( !$case_sensitive )
   192         $term = strtolower($term);
   193         $term = strtolower($term);
   193       $where_any[] = $term;
   194       $where_any[] = $term;
   194     }
   195     }
   195     
   196 
   196     $col_word = ( $case_sensitive ) ? 'word' : 'lcase(word)';
   197     $col_word = ( $case_sensitive ) ? 'word' : 'lcase(word)';
   197     $where_any = ( count($where_any) > 0 ) ? '( ' . $col_word . ' = \'' . implode('\' OR ' . $col_word . ' = \'', $where_any) . '\' )' : '';
   198     $where_any = ( count($where_any) > 0 ) ? '( ' . $col_word . ' = \'' . implode('\' OR ' . $col_word . ' = \'', $where_any) . '\' )' : '';
   198     
   199 
   199     // generate query
   200     // generate query
   200     // using a GROUP BY here ensures that the same word with a different case isn't counted as 2 words - it's all melted back
   201     // using a GROUP BY here ensures that the same word with a different case isn't counted as 2 words - it's all melted back
   201     // into one later in the processing stages
   202     // into one later in the processing stages
   202     $group_by = ( $case_sensitive ) ? '' : ' GROUP BY lcase(word);';
   203     // $group_by = ( $case_sensitive ) ? '' : ' GROUP BY lcase(word);';
   203     $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any}{$group_by}";
   204     $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any}";
   204     if ( !($q = $db->sql_unbuffered_query($sql)) )
   205     if ( !($q = $db->sql_unbuffered_query($sql)) )
   205       $db->_die('Error is in perform_search(), includes/search.php, query 1');
   206       $db->_die('Error is in perform_search(), includes/search.php, query 1');
   206     
   207 
   207     $word_tracking = array();
   208     $word_tracking = array();
   208     if ( $row = $db->fetchrow() )
   209     if ( $row = $db->fetchrow() )
   209     {
   210     {
   210       do
   211       do
   211       {
   212       {
   212         // get page list
   213         // get page list
   213         $pages =& $row['page_names'];
   214         $pages =& $row['page_names'];
   214         $ns_list = '(' . implode('|', array_keys($paths->nslist)) . ')';
       
   215         if ( strpos($pages, ',') )
   215         if ( strpos($pages, ',') )
   216         {
   216         {
   217           // the term occurs in more than one page
   217           // the term occurs in more than one page
   218           
   218 
   219           // Find page IDs that contain commas
   219           // Find page IDs that contain commas
   220           // This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older
   220           // This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older
   221           // databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for
   221           // databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for
   222           // IDs that don't match the pattern for stringified page ID + namespace. If it doesn't match, that means it's a continuation
   222           // IDs that don't match the pattern for stringified page ID + namespace. If it doesn't match, that means it's a continuation
   223           // of the previous ID and should be concatenated to the previous entry.
   223           // of the previous ID and should be concatenated to the previous entry.
   233               continue;
   233               continue;
   234             }
   234             }
   235             $prev = $i;
   235             $prev = $i;
   236           }
   236           }
   237           unset($match);
   237           unset($match);
   238           
   238 
   239           // Iterate through each of the results, assigning scores based on how many times the page has shown up.
   239           // Iterate through each of the results, assigning scores based on how many times the page has shown up.
   240           // This works because this phase of the search is strongly word-based not page-based. If a page shows up
   240           // This works because this phase of the search is strongly word-based not page-based. If a page shows up
   241           // multiple times while fetching the result rows from the search_index table, it simply means that page
   241           // multiple times while fetching the result rows from the search_index table, it simply means that page
   242           // contains more than one of the terms the user searched for.
   242           // contains more than one of the terms the user searched for.
   243           
   243 
   244           foreach ( $matches as $match )
   244           foreach ( $matches as $match )
   245           {
   245           {
       
   246             $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word']));
       
   247             if ( isset($word_tracking[$match]) && in_array($word_cs, $word_tracking[$match]) )
       
   248             {
       
   249               continue;
       
   250             }
       
   251             if ( isset($word_tracking[$match]) )
       
   252             {
       
   253               if ( isset($word_tracking[$match]) )
       
   254               {
       
   255                 $word_tracking[$match][] = ($word_cs);
       
   256               }
       
   257             }
       
   258             else
       
   259             {
       
   260               $word_tracking[$match] = array($word_cs);
       
   261             }
       
   262             $inc = 1;
       
   263 
       
   264             // Is this search term present in the page's title? If so, give extra points
       
   265             preg_match("/^ns=$ns_list;pid=(.+)$/", $match, $piecesparts);
       
   266             $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]);
       
   267             if ( isset($paths->pages[$pathskey]) )
       
   268             {
       
   269               $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';
       
   270               if ( $test_func($paths->pages[$pathskey]['name'], $row['word']) || $test_func($paths->pages[$pathskey]['urlname_nons'], $row['word']) )
       
   271               {
       
   272                 $inc = 1.5;
       
   273               }
       
   274             }
   246             if ( isset($scores[$match]) )
   275             if ( isset($scores[$match]) )
   247             {
   276             {
   248               $scores[$match]++;
   277               $scores[$match] = $scores[$match] + $inc;
   249             }
   278             }
   250             else
   279             else
   251             {
   280             {
   252               $scores[$match] = 1;
   281               $scores[$match] = $inc;
   253             }
       
   254             if ( isset($word_tracking[$match]) )
       
   255             {
       
   256               $word_tracking[$match][] = $row['word'];
       
   257             }
       
   258             else
       
   259             {
       
   260               $word_tracking[$match] = array($row['word']);
       
   261             }
   282             }
   262           }
   283           }
   263         }
   284         }
   264         else
   285         else
   265         {
   286         {
   266           // the term only occurs in one page
   287           // the term only occurs in one page
       
   288           $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word']));
       
   289           if ( isset($word_tracking[$pages]) && in_array($word_cs, $word_tracking[$pages]) )
       
   290           {
       
   291             continue;
       
   292           }
       
   293           if ( isset($word_tracking[$pages]) )
       
   294           {
       
   295             if ( isset($word_tracking[$pages]) )
       
   296             {
       
   297               $word_tracking[$pages][] = ($word_cs);
       
   298             }
       
   299           }
       
   300           else
       
   301           {
       
   302             $word_tracking[$pages] = array($word_cs);
       
   303           }
       
   304           $inc = 1;
       
   305 
       
   306           // Is this search term present in the page's title? If so, give extra points
       
   307           preg_match("/^ns=$ns_list;pid=(.+)$/", $pages, $piecesparts);
       
   308           $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]);
       
   309           if ( isset($paths->pages[$pathskey]) )
       
   310           {
       
   311             $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';
       
   312             if ( $test_func($paths->pages[$pathskey]['name'], $row['word']) || $test_func($paths->pages[$pathskey]['urlname_nons'], $row['word']) )
       
   313             {
       
   314               $inc = 1.5;
       
   315             }
       
   316           }
   267           if ( isset($scores[$pages]) )
   317           if ( isset($scores[$pages]) )
   268           {
   318           {
   269             $scores[$pages]++;
   319             $scores[$pages] = $scores[$pages] + $inc;
   270           }
   320           }
   271           else
   321           else
   272           {
   322           {
   273             $scores[$pages] = 1;
   323             $scores[$pages] = $inc;
   274           }
       
   275           if ( isset($word_tracking[$pages]) )
       
   276           {
       
   277             $word_tracking[$pages][] = $row['word'];
       
   278           }
       
   279           else
       
   280           {
       
   281             $word_tracking[$pages] = array($row['word']);
       
   282           }
   324           }
   283         }
   325         }
   284       }
   326       }
   285       while ( $row = $db->fetchrow() );
   327       while ( $row = $db->fetchrow() );
   286     }
   328     }
   287     $db->free_result();
   329     $db->free_result();
   288   
   330 
   289     //
   331     //
   290     // STAGE 2: FIRST ELIMINATION ROUND
   332     // STAGE 2: FIRST ELIMINATION ROUND
   291     // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
   333     // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
   292     //
   334     //
   293     
   335 
   294     foreach ( $query['req'] as $term )
   336     foreach ( $query['req'] as $term )
   295     {
   337     {
   296       foreach ( $word_tracking as $i => $page )
   338       foreach ( $word_tracking as $i => $page )
   297       {
   339       {
   298         if ( !in_array($term, $page) )
   340         if ( !in_array($term, $page) )
   300           unset($word_tracking[$i], $scores[$i]);
   342           unset($word_tracking[$i], $scores[$i]);
   301         }
   343         }
   302       }
   344       }
   303     }
   345     }
   304   }
   346   }
   305   
   347 
   306   //
   348   //
   307   // STAGE 3: PHRASE SEARCHING
   349   // STAGE 3: PHRASE SEARCHING
   308   // Use LIKE to find pages with specified phrases. We can do a super-picky single query without another elimination round because
   350   // Use LIKE to find pages with specified phrases. We can do a super-picky single query without another elimination round because
   309   // at this stage we can search the full page_text column instead of relying on a word list.
   351   // at this stage we can search the full page_text column instead of relying on a word list.
   310   //
   352   //
   311   
   353 
   312   // We can skip this stage if none of these special terms apply
   354   // We can skip this stage if none of these special terms apply
   313   
   355 
   314   $text_col = ( $case_sensitive ) ? 'page_text' : 'lcase(page_text)';
   356   $text_col = ( $case_sensitive ) ? 'page_text' : 'lcase(page_text)';
   315   
   357   $name_col = ( $case_sensitive ) ? 'name' : 'lcase(name)';
       
   358   $text_col_join = ( $case_sensitive ) ? 't.page_text' : 'lcase(t.page_text)';
       
   359   $name_col_join = ( $case_sensitive ) ? 'p.name' : 'lcase(p.name)';
       
   360 
   316   if ( count($query_phrase['any']) > 0 || count($query_phrase['req']) > 0 )
   361   if ( count($query_phrase['any']) > 0 || count($query_phrase['req']) > 0 )
   317   {
   362   {
   318   
   363 
   319     $where_any = array();
   364     $where_any = array();
   320     foreach ( $query_phrase['any'] as $term )
   365     foreach ( $query_phrase['any'] as $term )
   321     {
   366     {
   322       $term = escape_string_like($term);
   367       $term = escape_string_like($term);
   323       if ( !$case_sensitive )
   368       if ( !$case_sensitive )
   324         $term = strtolower($term);
   369         $term = strtolower($term);
   325       $where_any[] = $term;
   370       $where_any[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )";
   326     }
   371     }
   327     
   372 
   328     $where_any = ( count($where_any) > 0 ) ? "( $text_col LIKE '%" . implode("%' OR $text_col LIKE '%", $where_any) . "%' )" : '';
   373     $where_any = ( count($where_any) > 0 ) ? implode(" OR\n  ", $where_any) : '';
   329     
   374 
   330     // Also do required columns, but use AND to ensure that all required terms are included
   375     // Also do required terms, but use AND to ensure that all required terms are included
   331     $where_req = array();
   376     $where_req = array();
   332     foreach ( $query_phrase['req'] as $term )
   377     foreach ( $query_phrase['req'] as $term )
   333     {
   378     {
   334       $term = escape_string_like($term);
   379       $term = escape_string_like($term);
   335       if ( !$case_sensitive )
   380       if ( !$case_sensitive )
   336         $term = strtolower($term);
   381         $term = strtolower($term);
   337       $where_req[] = $term;
   382       $where_req[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )";
   338     }
   383     }
   339     $and_clause = ( $where_any != '' ) ? 'AND ' : '';
   384     $and_clause = ( $where_any != '' ) ? 'AND ' : '';
   340     $where_req = ( count($where_req) > 0 ) ? "{$and_clause}$text_col LIKE '%" . implode("%' AND $text_col LIKE '%", $where_req) . "%'" : '';
   385     $where_req = ( count($where_req) > 0 ) ? "{$and_clause}" . implode(" AND\n  ", $where_req) : '';
   341     
   386 
   342     $sql = 'SELECT CONCAT("ns=",namespace,";pid=",page_id) AS id FROM ' . table_prefix . "page_text WHERE $where_any $where_req;";
   387     $sql = 'SELECT CONCAT("ns=",t.namespace,";pid=",t.page_id) AS id, p.name FROM ' . table_prefix . "page_text AS t\n"
       
   388             . "  LEFT JOIN " . table_prefix . "pages AS p\n"
       
   389             . "    ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n"
       
   390             . "  WHERE\n  $where_any\n  $where_req;";
   343     if ( !($q = $db->sql_unbuffered_query($sql)) )
   391     if ( !($q = $db->sql_unbuffered_query($sql)) )
   344       $db->_die('Error is in perform_search(), includes/search.php, query 2. Parsed query dump follows:<pre>(indexable) ' . htmlspecialchars(print_r($query, true)) . '(non-indexable) ' . htmlspecialchars(print_r($query_phrase, true)) . '</pre>');
   392       $db->_die('Error is in perform_search(), includes/search.php, query 2. Parsed query dump follows:<pre>(indexable) ' . htmlspecialchars(print_r($query, true)) . '(non-indexable) ' . htmlspecialchars(print_r($query_phrase, true)) . '</pre>');
   345     
   393 
   346     if ( $row = $db->fetchrow() )
   394     if ( $row = $db->fetchrow() )
   347     {
   395     {
   348       do
   396       do
   349       {
   397       {
   350         $id =& $row['id'];
   398         $id =& $row['id'];
       
   399         $inc = 1;
       
   400 
       
   401         // Is this search term present in the page's title? If so, give extra points
       
   402         preg_match("/^ns=$ns_list;pid=(.+)$/", $id, $piecesparts);
       
   403         $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]);
       
   404         if ( isset($paths->pages[$pathskey]) )
       
   405         {
       
   406           $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';
       
   407           foreach ( array_merge($query_phrase['any'], $query_phrase['req']) as $term )
       
   408           {
       
   409             if ( $test_func($paths->pages[$pathskey]['name'], $term) || $test_func($paths->pages[$pathskey]['urlname_nons'], $term) )
       
   410             {
       
   411               $inc = 1.5;
       
   412               break;
       
   413             }
       
   414           }
       
   415         }
   351         if ( isset($scores[$id]) )
   416         if ( isset($scores[$id]) )
   352         {
   417         {
   353           $scores[$id]++;
   418           $scores[$id] = $scores[$id] + $inc;
   354         }
   419         }
   355         else
   420         else
   356         {
   421         {
   357           $scores[$id] = 1;
   422           $scores[$id] = $inc;
   358         }
   423         }
   359       }
   424       }
   360       while ( $row = $db->fetchrow() );
   425       while ( $row = $db->fetchrow() );
   361     }
   426     }
   362     $db->free_result();
   427     $db->free_result();
   363   }
   428   }
   364   
   429 
   365   //
   430   //
   366   // STAGE 4 - SELECT PAGE TEXT AND ELIMINATE NOTS
   431   // STAGE 4 - SELECT PAGE TEXT AND ELIMINATE NOTS
   367   // At this point, we have a complete list of all the possible pages. Now we want to obtain the page text, and within the same query
   432   // At this point, we have a complete list of all the possible pages. Now we want to obtain the page text, and within the same query
   368   // eliminate any terms that shouldn't be in there.
   433   // eliminate any terms that shouldn't be in there.
   369   //
   434   //
   370   
   435 
   371   // Generate master word list for the highlighter
   436   // Generate master word list for the highlighter
   372   $word_list = array_values(array_merge($query['any'], $query['req'], $query_phrase['any'], $query_phrase['req']));
   437   $word_list = array_values(array_merge($query['any'], $query['req'], $query_phrase['any'], $query_phrase['req']));
   373   
   438 
   374   $text_where = array();
   439   $text_where = array();
   375   foreach ( $scores as $page_id => $_ )
   440   foreach ( $scores as $page_id => $_ )
   376   {
   441   {
   377     $text_where[] = $db->escape($page_id);
   442     $text_where[] = $db->escape($page_id);
   378   }
   443   }
   379   $text_where = '( CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'' . implode('\' OR CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'', $text_where) . '\' )';
   444   $text_where = '( CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'' . implode('\' OR CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'', $text_where) . '\' )';
   380   
   445 
   381   if ( count($query['not']) > 0 )
   446   if ( count($query['not']) > 0 )
   382     $text_where .= ' AND';
   447     $text_where .= ' AND';
   383   
   448 
   384   $where_not = array();
   449   $where_not = array();
   385   foreach ( $query['not'] as $term )
   450   foreach ( $query['not'] as $term )
   386   {
   451   {
   387     $term = escape_string_like($term);
   452     $term = escape_string_like($term);
   388     if ( !$case_sensitive )
   453     if ( !$case_sensitive )
   389       $term = strtolower($term);
   454       $term = strtolower($term);
   390     $where_not[] = $term;
   455     $where_not[] = $term;
   391   }
   456   }
   392   $where_not = ( count($where_not) > 0 ) ? "$text_col NOT LIKE '%" . implode("%' AND $text_col NOT LIKE '%", $where_not) . "%'" : '';
   457   $where_not = ( count($where_not) > 0 ) ? "$text_col NOT LIKE '%" . implode("%' AND $text_col NOT LIKE '%", $where_not) . "%'" : '';
   393   
   458 
   394   $sql = 'SELECT CONCAT("ns=",t.namespace,";pid=",t.page_id) AS id, t.page_id, t.namespace, CHAR_LENGTH(t.page_text) AS page_length, t.page_text, p.name AS page_name FROM ' . table_prefix . "page_text AS t
   459   $sql = 'SELECT CONCAT("ns=",t.namespace,";pid=",t.page_id) AS id, t.page_id, t.namespace, CHAR_LENGTH(t.page_text) AS page_length, t.page_text, p.name AS page_name FROM ' . table_prefix . "page_text AS t
   395             LEFT JOIN " . table_prefix . "pages AS p
   460             LEFT JOIN " . table_prefix . "pages AS p
   396               ON ( p.urlname = t.page_id AND p.namespace = t.namespace )
   461               ON ( p.urlname = t.page_id AND p.namespace = t.namespace )
   397             WHERE $text_where $where_not;";
   462             WHERE $text_where $where_not;";
   398   if ( !($q = $db->sql_unbuffered_query($sql)) )
   463   if ( !($q = $db->sql_unbuffered_query($sql)) )
   399     $db->_die('Error is in perform_search(), includes/search.php, query 3');
   464     $db->_die('Error is in perform_search(), includes/search.php, query 3');
   400   
   465 
   401   $page_data = array();
   466   $page_data = array();
   402   if ( $row = $db->fetchrow() )
   467   if ( $row = $db->fetchrow() )
   403   {
   468   {
   404     do
   469     do
   405     {
   470     {
   406       $row['page_text'] = htmlspecialchars($row['page_text']);
   471       $row['page_text'] = htmlspecialchars($row['page_text']);
   407       $row['page_name'] = htmlspecialchars($row['page_name']);
   472       $row['page_name'] = htmlspecialchars($row['page_name']);
   408       
   473 
   409       // Highlight results (this is wonderfully automated)
   474       // Highlight results (this is wonderfully automated)
   410       $row['page_text'] = highlight_and_clip_search_result($row['page_text'], $word_list, $case_sensitive);
   475       $row['page_text'] = highlight_and_clip_search_result($row['page_text'], $word_list, $case_sensitive);
   411       if ( strlen($row['page_text']) > 250 && !preg_match('/^\.\.\.(.+)\.\.\.$/', $row['page_text']) )
   476       if ( strlen($row['page_text']) > 250 && !preg_match('/^\.\.\.(.+)\.\.\.$/', $row['page_text']) )
   412       {
   477       {
   413         $row['page_text'] = substr($row['page_text'], 0, 150) . '...';
   478         $row['page_text'] = substr($row['page_text'], 0, 150) . '...';
   414       }
   479       }
   415       $row['page_name'] = highlight_search_result($row['page_name'], $word_list, $case_sensitive);
   480       $row['page_name'] = highlight_search_result($row['page_name'], $word_list, $case_sensitive);
   416       
   481 
   417       $page_data[$row['id']] = $row;
   482       $page_data[$row['id']] = $row;
   418     }
   483     }
   419     while ( $row = $db->fetchrow() );
   484     while ( $row = $db->fetchrow() );
   420   }
   485   }
   421   $db->free_result();
   486   $db->free_result();
   422   
   487   
   423   //
   488   //
   424   // STAGE 5 - SPECIAL PAGE TITLE SEARCH
   489   // STAGE 5 - SPECIAL PAGE TITLE SEARCH
   425   // Iterate through $paths->pages and check the titles for search terms. Score accordingly.
   490   // Iterate through $paths->pages and check the titles for search terms. Score accordingly.
   426   //
   491   //
   427   
   492 
   428   foreach ( $paths->pages as $page )
   493   foreach ( $paths->pages as $id => $page )
   429   {
   494   {
   430     if ( $page['namespace'] != 'Special' )
   495     if ( $page['namespace'] != 'Special' )
   431       continue;
   496       continue;
       
   497     if ( !is_int($id) )
       
   498       continue;
   432     $idstring = 'ns=' . $page['namespace'] . ';pid=' . $page['urlname_nons'];
   499     $idstring = 'ns=' . $page['namespace'] . ';pid=' . $page['urlname_nons'];
   433     $any = array_merge($query['any'], $query_phrase['any']);
   500     $any = array_values(array_unique(array_merge($query['any'], $query_phrase['any'])));
   434     foreach ( $any as $term )
   501     foreach ( $any as $term )
   435     {
   502     {
   436       if ( $case_sensitive )
   503       if ( $case_sensitive )
   437       {
   504       {
   438         if ( strstr($page['name'], $term) || strstr($page['urlname_nons'], $term) )
   505         if ( strstr($page['name'], $term) || strstr($page['urlname_nons'], $term) )
   439         {
   506         {
   440           ( isset($scores[$idstring]) ) ? $scores[$idstring]++ : $scores[$idstring] = 1;
   507           ( isset($scores[$idstring]) ) ? $scores[$idstring] = $scores[$idstring] + 1.5 : $scores[$idstring] = 1.5;
   441         }
   508         }
   442       }
   509       }
   443       else
   510       else
   444       {
   511       {
   445         if ( strstr(strtolower($page['name']), strtolower($term)) || strstr(strtolower($page['urlname_nons']), strtolower($term)) )
   512         if ( stristr($page['name'], $term) || stristr($page['urlname_nons'], $term) )
   446         {
   513         {
   447           ( isset($scores[$idstring]) ) ? $scores[$idstring]++ : $scores[$idstring] = 1;
   514           ( isset($scores[$idstring]) ) ? $scores[$idstring] = $scores[$idstring] + 1.5 : $scores[$idstring] = 1.5;
   448         }
   515         }
   449       }
   516       }
   450     }
   517     }
   451     if ( isset($scores[$idstring]) )
   518     if ( isset($scores[$idstring]) )
   452     {
   519     {
   464   
   531   
   465   //
   532   //
   466   // STAGE 6 - SECOND ELIMINATION ROUND
   533   // STAGE 6 - SECOND ELIMINATION ROUND
   467   // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
   534   // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
   468   //
   535   //
   469   
   536 
   470   $required = array_merge($query['req'], $query_phrase['req']);
   537   $required = array_merge($query['req'], $query_phrase['req']);
   471   foreach ( $required as $term )
   538   foreach ( $required as $term )
   472   {
   539   {
   473     foreach ( $page_data as $id => $page )
   540     foreach ( $page_data as $id => $page )
   474     {
   541     {
   476       {
   543       {
   477         unset($page_data[$id]);
   544         unset($page_data[$id]);
   478       }
   545       }
   479     }
   546     }
   480   }
   547   }
   481   
   548 
   482   // At this point, all of our normal results are in. However, we can also allow plugins to hook into the system and score their own
   549   // At this point, all of our normal results are in. However, we can also allow plugins to hook into the system and score their own
   483   // pages and add text, etc. as necessary.
   550   // pages and add text, etc. as necessary.
   484   // Plugins are COMPLETELY responsible for using the search terms and handling Boolean logic properly
   551   // Plugins are COMPLETELY responsible for using the search terms and handling Boolean logic properly
   485   
   552 
   486   $code = $plugins->setHook('search_global_inner');
   553   $code = $plugins->setHook('search_global_inner');
   487   foreach ( $code as $cmd )
   554   foreach ( $code as $cmd )
   488   {
   555   {
   489     eval($cmd);
   556     eval($cmd);
   490   }
   557   }
   491   
   558 
   492   // a marvelous debugging aid :-)
   559   // a marvelous debugging aid :-)
   493   // die('<pre>' . htmlspecialchars(print_r($page_data, true)) . '</pre>');
   560   // die('<pre>' . htmlspecialchars(print_r($page_data, true)) . '</pre>');
   494   
   561 
   495   //
   562   //
   496   // STAGE 7 - HIGHLIGHT, TRIM, AND SCORE RESULTS
   563   // STAGE 7 - HIGHLIGHT, TRIM, AND SCORE RESULTS
   497   // We now have the complete results of the search. We need to trim text down to show only portions of the page containing search
   564   // We now have the complete results of the search. We need to trim text down to show only portions of the page containing search
   498   // terms, highlight any search terms within the page, and sort the final results array in descending order of score.
   565   // terms, highlight any search terms within the page, and sort the final results array in descending order of score.
   499   //
   566   //
   500   
   567 
   501   // Sort scores array
   568   // Sort scores array
   502   arsort($scores);
   569   arsort($scores);
   503   
   570 
   504   // Divisor for calculating relevance scores
   571   // Divisor for calculating relevance scores
   505   $divisor = count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query_phrase['not']);
   572   $divisor = ( count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query_phrase['not']) ) * 1.5;
   506   
   573 
   507   foreach ( $scores as $page_id => $score )
   574   foreach ( $scores as $page_id => $score )
   508   {
   575   {
   509     if ( !isset($page_data[$page_id]) )
   576     if ( !isset($page_data[$page_id]) )
   510       // It's possible that $scores contains a score for a page that was later eliminated because it contained a disallowed term
   577       // It's possible that $scores contains a score for a page that was later eliminated because it contained a disallowed term
   511       continue;
   578       continue;
   512       
   579 
   513     // Make a copy of the datum, then delete the original (it frees up a LOT of RAM)
   580     // Make a copy of the datum, then delete the original (it frees up a LOT of RAM)
   514     $datum = $page_data[$page_id];
   581     $datum = $page_data[$page_id];
   515     unset($page_data[$page_id]);
   582     unset($page_data[$page_id]);
   516     
   583 
   517     // This is an internal value used for sorting - it's no longer needed.
   584     // This is an internal value used for sorting - it's no longer needed.
   518     unset($datum['id']);
   585     unset($datum['id']);
   519     
   586 
   520     // Calculate score
   587     // Calculate score
   521     if ( $score > $divisor )
   588     // if ( $score > $divisor )
   522       $score = $divisor;
   589     //   $score = $divisor;
   523     $datum['score'] = round($score / $divisor, 2) * 100;
   590     $datum['score'] = round($score / $divisor, 2) * 100;
   524     
   591 
   525     // Store it in our until-now-unused results array
   592     // Store it in our until-now-unused results array
   526     $results[] = $datum;
   593     $results[] = $datum;
   527   }
   594   }
   528   
   595 
   529   // Our work here is done. :-D
   596   // Our work here is done. :-D
   530   return $results;
   597   return $results;
   531 }
   598 }
   532 
   599 
   533 /**
   600 /**
   556   for ( $i = 0; $i < strlen($query); $i++ )
   623   for ( $i = 0; $i < strlen($query); $i++ )
   557   {
   624   {
   558     $chr = $query{$i};
   625     $chr = $query{$i};
   559     $prev = ( $i > 0 ) ? $query{ $i - 1 } : '';
   626     $prev = ( $i > 0 ) ? $query{ $i - 1 } : '';
   560     $next = ( ( $i + 1 ) < strlen($query) ) ? $query{ $i + 1 } : '';
   627     $next = ( ( $i + 1 ) < strlen($query) ) ? $query{ $i + 1 } : '';
   561     
   628 
   562     if ( ( $chr == ' ' && !$in_quote ) || ( $i + 1 == strlen ( $query ) ) )
   629     if ( ( $chr == ' ' && !$in_quote ) || ( $i + 1 == strlen ( $query ) ) )
   563     {
   630     {
   564       $len = ( $next == '' ) ? $i + 1 : $i - $start_term;
   631       $len = ( $next == '' ) ? $i + 1 : $i - $start_term;
   565       $word = substr ( $query, $start_term, $len );
   632       $word = substr ( $query, $start_term, $len );
   566       $terms[] = $word;
   633       $terms[] = $word;
   567       $start_term = $i + 1;
   634       $start_term = $i + 1;
   568     }
   635     }
   569     
   636 
   570     elseif ( $chr == '"' && $in_quote && $prev != '\\' )
   637     elseif ( $chr == '"' && $in_quote && $prev != '\\' )
   571     {
   638     {
   572       $word = substr ( $query, $start_term, $i - $start_term + 1 );
   639       $word = substr ( $query, $start_term, $i - $start_term + 1 );
   573       $start_pos = ( $next == ' ' ) ? $i + 2 : $i + 1;
   640       $start_pos = ( $next == ' ' ) ? $i + 2 : $i + 1;
   574       $in_quote = false;
   641       $in_quote = false;
   575     }
   642     }
   576     
   643 
   577     elseif ( $chr == '"' && !$in_quote )
   644     elseif ( $chr == '"' && !$in_quote )
   578     {
   645     {
   579       $in_quote = true;
   646       $in_quote = true;
   580       $start_pos = $i;
   647       $start_pos = $i;
   581     }
   648     }
   582     
   649 
   583   }
   650   }
   584   
   651 
   585   $ticker = 0;
   652   $ticker = 0;
   586   
   653 
   587   foreach ( $terms as $element => $__unused )
   654   foreach ( $terms as $element => $__unused )
   588   {
   655   {
   589     $atom =& $terms[$element];
   656     $atom =& $terms[$element];
   590     
   657 
   591     $ticker++;
   658     $ticker++;
   592     
   659 
   593     if ( $ticker == 20 )
   660     if ( $ticker == 20 )
   594     {
   661     {
   595       $warnings[] = 'Some of your search terms were excluded because searches are limited to 20 terms to prevent excessive server load.';
   662       $warnings[] = 'Some of your search terms were excluded because searches are limited to 20 terms to prevent excessive server load.';
   596       break;
   663       break;
   597     }
   664     }
   598     
   665 
   599     if ( substr ( $atom, 0, 2 ) == '+"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' )
   666     if ( substr ( $atom, 0, 2 ) == '+"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' )
   600     {
   667     {
   601       $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) );
   668       $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) );
   602       if ( strlen ( $word ) < 2 || in_array($word, $stopwords) )
   669       if ( strlen ( $word ) < 2 || in_array($word, $stopwords) )
   603       {
   670       {
   730   for ( $i = 0; $i < sizeof($words); $i++)
   797   for ( $i = 0; $i < sizeof($words); $i++)
   731   {
   798   {
   732     if(!empty($words[$i]))
   799     if(!empty($words[$i]))
   733       $words2[] = preg_quote($words[$i]);
   800       $words2[] = preg_quote($words[$i]);
   734   }
   801   }
   735   
   802 
   736   $flag = ( $case_sensitive ) ? '' : 'i';
   803   $flag = ( $case_sensitive ) ? '' : 'i';
   737   $regex = '/(' . implode('|', $words2) . ')/' . $flag;
   804   $regex = '/(' . implode('|', $words2) . ')/' . $flag;
   738   $pt = preg_replace($regex, '<highlight>\\1</highlight>', $pt);
   805   $pt = preg_replace($regex, '<highlight>\\1</highlight>', $pt);
   739   
   806 
   740   return $pt;
   807   return $pt;
   741 }
   808 }
   742 
   809 
   743 /**
   810 /**
   744  * Wraps <highlight></highlight> tags around all words in both the specified array and the specified text and clips the text to
   811  * Wraps <highlight></highlight> tags around all words in both the specified array and the specified text and clips the text to
   750  */
   817  */
   751 
   818 
   752 function highlight_and_clip_search_result($pt, $words, $case_sensitive = false)
   819 function highlight_and_clip_search_result($pt, $words, $case_sensitive = false)
   753 {
   820 {
   754   $cut_off = false;
   821   $cut_off = false;
   755   
   822 
   756   $space_chars = Array("\t", "\n", "\r", " ");
   823   $space_chars = Array("\t", "\n", "\r", " ");
   757   
   824 
   758   $pt = highlight_search_result($pt, $words, $case_sensitive);
   825   $pt = highlight_search_result($pt, $words, $case_sensitive);
   759   
   826 
   760   foreach ( $words as $word )
   827   foreach ( $words as $word )
   761   {
   828   {
   762     // Boldface searched words
   829     // Boldface searched words
   763     $ptlen = strlen($pt);
   830     $ptlen = strlen($pt);
   764     for ( $i = 0; $i < $ptlen; $i++ )
   831     for ( $i = 0; $i < $ptlen; $i++ )
   787                 $final_chunk = substr($chunk, $j + 1);
   854                 $final_chunk = substr($chunk, $j + 1);
   788                 break;
   855                 break;
   789               }
   856               }
   790             }
   857             }
   791             $mid_chunk = substr($pt, ( $i - 75 ), 75);
   858             $mid_chunk = substr($pt, ( $i - 75 ), 75);
   792             
   859 
   793             $clipped = '...' . $final_chunk . $mid_chunk . $chunk2;
   860             $clipped = '...' . $final_chunk . $mid_chunk . $chunk2;
   794             
   861 
   795             $chunk = substr($pt, ( $i + strlen($chunk2) + 75 ));
   862             $chunk = substr($pt, ( $i + strlen($chunk2) + 75 ));
   796             $final_chunk = $chunk;
   863             $final_chunk = $chunk;
   797             for ( $j = 0; $j < strlen($chunk); $j++ )
   864             for ( $j = 0; $j < strlen($chunk); $j++ )
   798             {
   865             {
   799               if ( in_array($chunk{$j}, $space_chars) )
   866               if ( in_array($chunk{$j}, $space_chars) )
   800               {
   867               {
   801                 $final_chunk = substr($chunk, 0, $j);
   868                 $final_chunk = substr($chunk, 0, $j);
   802                 break;
   869                 break;
   803               }
   870               }
   804             }
   871             }
   805             
   872 
   806             $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 );
   873             $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 );
   807             
   874 
   808             $clipped .= $end_chunk . $final_chunk . '...';
   875             $clipped .= $end_chunk . $final_chunk . '...';
   809             
   876 
   810             $pt = $clipped;
   877             $pt = $clipped;
   811           }
   878           }
   812           else if ( strlen($pt) > 200 )
   879           else if ( strlen($pt) > 200 )
   813           {
   880           {
   814             $mid_chunk = substr($pt, ( $i - 75 ), 75);
   881             $mid_chunk = substr($pt, ( $i - 75 ), 75);
   815             
   882 
   816             $clipped = $chunk1 . $chunk2;
   883             $clipped = $chunk1 . $chunk2;
   817             
   884 
   818             $chunk = substr($pt, ( $i + strlen($chunk2) + 75 ));
   885             $chunk = substr($pt, ( $i + strlen($chunk2) + 75 ));
   819             $final_chunk = $chunk;
   886             $final_chunk = $chunk;
   820             for ( $j = 0; $j < strlen($chunk); $j++ )
   887             for ( $j = 0; $j < strlen($chunk); $j++ )
   821             {
   888             {
   822               if ( in_array($chunk{$j}, $space_chars) )
   889               if ( in_array($chunk{$j}, $space_chars) )
   823               {
   890               {
   824                 $final_chunk = substr($chunk, 0, $j);
   891                 $final_chunk = substr($chunk, 0, $j);
   825                 break;
   892                 break;
   826               }
   893               }
   827             }
   894             }
   828             
   895 
   829             $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 );
   896             $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 );
   830             
   897 
   831             $clipped .= $end_chunk . $final_chunk . '...';
   898             $clipped .= $end_chunk . $final_chunk . '...';
   832             
   899 
   833             $pt = $clipped;
   900             $pt = $clipped;
   834             
   901 
   835           }
   902           }
   836           break 2;
   903           break 2;
   837         }
   904         }
   838       }
   905       }
   839     }
   906     }
   851 function get_stopwords()
   918 function get_stopwords()
   852 {
   919 {
   853   static $stopwords;
   920   static $stopwords;
   854   if ( is_array($stopwords) )
   921   if ( is_array($stopwords) )
   855     return $stopwords;
   922     return $stopwords;
   856   
   923 
   857   $stopwords = array('a\'s', 'able', 'after', 'afterwards', 'again',
   924   $stopwords = array('a\'s', 'able', 'after', 'afterwards', 'again',
   858                      'against', 'ain\'t', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always',
   925                      'against', 'ain\'t', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always',
   859                      'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway',
   926                      'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway',
   860                      'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'aren\'t', 'around', 'as', 'aside',
   927                      'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'aren\'t', 'around', 'as', 'aside',
   861                      'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes',
   928                      'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes',
   887                      'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems',
   954                      'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems',
   888                      'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should',
   955                      'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should',
   889                      'shouldn\'t', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes',
   956                      'shouldn\'t', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes',
   890                      'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup',
   957                      'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup',
   891                      'sure', 't\'s', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'that\'s',
   958                      'sure', 't\'s', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'that\'s',
   892                      'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'there\'s', 'thereafter',
   959                      'thats', 'the', 'their', 'theirs', 'them', 'then', 'thence', 'there', 'there\'s', 'thereafter',
   893                      'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re',
   960                      'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re',
   894                      'they\'ve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout',
   961                      'they\'ve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout',
   895                      'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying',
   962                      'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying',
   896                      'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use',
   963                      'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'upon', 'use',
   897                      'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants',
   964                      'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very',
   898                      'was', 'wasn\'t', 'way', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'welcome', 'well', 'went', 'were', 'weren\'t',
   965                      'was', 'wasn\'t', 'way', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'welcome', 'well', 'went', 'were', 'weren\'t',
   899                      'what', 'what\'s', 'whatever', 'when', 'whence', 'whenever', 'where', 'where\'s', 'whereafter', 'whereas',
   966                      'what', 'what\'s', 'whatever', 'when', 'whence', 'whenever', 'where', 'where\'s', 'whereafter', 'whereas',
   900                      'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'who\'s', 'whoever',
   967                      'which', 'while', 'who', 'who\'s', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within',
   901                      'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'won\'t', 'wonder',
   968                      'without', 'won\'t', 'wonder', 'would', 'would', 'wouldn\'t', 'yes', 'yet', 'you', 'you\'d', 'you\'ll', 'you\'re',
   902                      'would', 'would', 'wouldn\'t', 'yes', 'yet', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours',
   969                      'you\'ve', 'your', 'yours', 'zero');
   903                      'yourself', 'yourselves', 'zero');
       
   904   return $stopwords;
   970   return $stopwords;
   905 }
   971 }
   906 
   972 
   907 ?>
   973 ?>