includes/paths.php
changeset 461 717e71109645
parent 458 c433348f3628
child 463 0857911fb7f1
--- a/includes/paths.php	Sun Feb 24 12:52:07 2008 -0500
+++ b/includes/paths.php	Sun Feb 24 17:50:38 2008 -0500
@@ -583,81 +583,168 @@
   }
   
   /**
-   * Rebuilds the search index
-   * @param bool If true, prints out status messages
+   * Builds a word list for search indexing.
+   * @param string Text to index
+   * @param string Page ID of the page being indexed
+   * @param string Title of the page being indexed
+   * @return array List of words
    */
-   
-  function rebuild_search_index($verbose = false)
+  
+  function calculate_word_list($text, $page_id, $page_name)
+  {
+    // The page ID and title are indexed along with the body text. Every character other
+    // than a letter, digit or apostrophe becomes a space, i.e. a word separator.
+    $text .= ' ' . dirtify_page_id($page_id) . ' ' . $page_name;
+    $text = preg_replace('/[^a-z0-9\']/i', ' ', $text);
+    $words = array();
+    foreach ( explode(' ', $text) as $word )
+    {
+      // Strip runs of two or more apostrophes; single apostrophes stay part of the word
+      $word = preg_replace("/[']{2,}/", '', $word);
+      // Words shorter than 2 characters are dropped; keying by word removes duplicates
+      if ( strlen($word) >= 2 )
+        $words[$word] = true;
+    }
+    // PHP turns numeric-looking keys into integers, so cast every key back to a string
+    return array_map('strval', array_keys($words));
+  }
+  
+  /**
+   * Rebuilds the site's entire search index. Considerably more exciting if run from the command line.
+   * @param bool If true, verbose output.
+   * @param bool If true, verbose + debugging output.
+   */
+  
+  function rebuild_search_index($verbose = false, $debug = false)
   {
     global $db, $session, $paths, $template, $plugins; // Common objects
-    $search = new Searcher();
-    if ( $verbose )
+    // Indexing every page can take a long time, so lift PHP's execution time limit
+    @set_time_limit(0);
+    
+    $q = $db->sql_query('DELETE FROM ' . table_prefix . 'search_index;');
+    if ( !$q )
+      $db->_die();
+    
+    // Used below to match pages whose stored password is the SHA-1 hash of an empty string
+    $sha1_blank = sha1('');
+    
+    // Index $pages_in_batch pages at a time. Each batch is a LIMIT/OFFSET query over a fixed
+    // ordering, so that no page is skipped or indexed twice between batches.
+    $pages_in_batch = 15;
+    
+    // First find out how many pages there are
+    $q = $db->sql_query('SELECT COUNT(p.urlname) AS num_pages FROM ' . table_prefix . "page_text AS t\n"
+                      . "  LEFT JOIN " . table_prefix . "pages AS p\n"
+                      . "    ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n"
+                      . "  WHERE ( p.password = '' OR p.password = '$sha1_blank' )\n"
+                      . "    AND ( p.visible = 1 );");
+    if ( !$q )
+      $db->_die();
+    
+    list($num_pages) = $db->fetchrow_num();
+    $num_pages = intval($num_pages);
+    $loops = ceil($num_pages / $pages_in_batch);
+    $master_word_list = array(); // words already written to the index, stored as keys
+    $stopwords = get_stopwords();
+    
+    for ( $j = 0; $j < $loops; )
     {
-      echo '<p>';
-    }
-    $texts = Array();
-    $textq = $db->sql_unbuffered_query($this->fetch_page_search_resource());
-    if(!$textq) $db->_die('');
-    while($row = $db->fetchrow())
-    {
-      if ( $verbose )
+      $offset = $j * $pages_in_batch;
+      
+      $j++;
+      
+      if ( $verbose && $debug )
       {
-        ob_start();
-        echo "Indexing page " . $this->nslist[$row['namespace']] . sanitize_page_id($row['page_id']) . "<br />";
-        ob_flush();
-        while (@ob_end_flush());
-        flush();
+        echo "Running indexing round $j of $loops (offset $offset)\n" . ( isset($_SERVER['REQUEST_URI']) ? '<br />' : '' );
       }
-      if ( isset($this->nslist[$row['namespace']]) )
+      
+      $texts = $db->sql_query('SELECT p.name, t.page_id, t.namespace, t.page_text FROM ' . table_prefix . "page_text AS t\n"
+                            . "  LEFT JOIN " . table_prefix . "pages AS p\n"
+                            . "    ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n"
+                            . "  WHERE ( p.password = '' OR p.password = '$sha1_blank' )\n"
+                            . "    AND ( p.visible = 1 )\n"
+                            . "  ORDER BY t.namespace, t.page_id\n"
+                            . "  LIMIT $pages_in_batch OFFSET $offset;", false);
+      if ( !$texts )
+        $db->_die();
+      
+      $k = $offset;
+      
+      if ( $row = $db->fetchrow($texts) )
       {
-        $idstring = $this->nslist[$row['namespace']] . sanitize_page_id($row['page_id']);
-        if ( isset($this->pages[$idstring]) )
-        {
-          $page = $this->pages[$idstring];
-        }
-        else
+        do
         {
-          $page = array('name' => dirtify_page_id($row['page_id']));
+          $k++;
+          if ( $verbose )
+          {
+            $mu = memory_get_usage();
+            echo "  Indexing page $k of $num_pages: {$row['namespace']}:{$row['page_id']}";
+            if ( $debug )
+              echo ", mem = $mu...";
+            flush();
+          }
+          
+          // Indexing identifier for the page in the DB
+          $page_uniqid = "ns={$row['namespace']};pid=" . sanitize_page_id($row['page_id']);
+          $page_uniqid = $db->escape($page_uniqid);
+          
+          // List of words on the page
+          $wordlist = $this->calculate_word_list($row['page_text'], $row['page_id'], $row['name']);
+          
+          // New words are batched into one multi-row INSERT; known words get this page appended
+          $inserts = array();
+          foreach ( $wordlist as $word )
+          {
+            if ( in_array($word, $stopwords) || strval(intval($word)) === $word || strlen($word) < 3 )
+              continue;
+            $word_db = $db->escape($word);
+            if ( !isset($master_word_list[$word]) )
+            {
+              $inserts[] = "( '$word_db', '$page_uniqid' )";
+              $master_word_list[$word] = true;
+            }
+            else
+            {
+              if ( $verbose && $debug )
+                echo '.';
+              $pid_col = ( ENANO_DBLAYER == 'MYSQL' ) ?
+                          "CONCAT( page_names, ',$page_uniqid' )":
+                          "page_names || ',$page_uniqid'";
+              $q = $db->sql_query('UPDATE ' . table_prefix . "search_index SET page_names = $pid_col WHERE word = '$word_db';", false);
+              if ( !$q )
+                $db->_die();
+            }
+          }
+          if ( count($inserts) > 0 )
+          {
+            if ( $verbose && $debug )
+              echo 'i';
+            $inserts = implode(",\n  ", $inserts);
+            $q = $db->sql_query('INSERT INTO ' . table_prefix . "search_index(word, page_names) VALUES\n  $inserts;", false);
+            if ( !$q )
+              $db->_die();
+          }
+          
+          if ( $verbose )
+          {
+            if ( isset($_SERVER['REQUEST_URI']) )
+              echo '<br />';
+            echo "\n";
+          }
+          unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word, $row);
         }
+        while ( $row = $db->fetchrow($texts) );
       }
-      else
-      {
-        $page = array('name' => dirtify_page_id($row['page_id']));
-      }
-      $texts[(string)$row['page_idstring']] = $row['page_text'] . ' ' . $page['name'];
+      $db->free_result($texts);
     }
     if ( $verbose )
     {
-      ob_start();
-      echo "Calculating word list...";
-      ob_flush();
-      while (@ob_end_flush());
-      flush();
-    }
-    $search->buildIndex($texts);
-    if ( $verbose )
-    {
-      echo '</p>';
+      echo "Indexing complete.";
+      if ( isset($_SERVER['REQUEST_URI']) )
+        echo '<br />';
+      echo "\n";
     }
-    // echo '<pre>'.print_r($search->index, true).'</pre>';
-    // return;
-    $q = $db->sql_query('DELETE FROM '.table_prefix.'search_index');
-    if(!$q) return false;
-    $secs = Array();
-    $q = 'INSERT INTO '.table_prefix.'search_index(word,page_names) VALUES';
-    foreach($search->index as $word => $pages)
-    {
-      $secs[] = '(\''.$db->escape($word).'\', \''.$db->escape($pages).'\')';
-    }
-    $q .= implode(',', $secs);
-    unset($secs);
-    $q .= ';';
-    $result = $db->sql_query($q);
-    $db->free_result();
-    if($result)
-      return true;
-    else
-      $db->_die('The search index was trying to rebuild itself when the error occured.');
+    return true;
   }
   
   /**