includes/paths.php
changeset 461 717e71109645
parent 458 c433348f3628
child 463 0857911fb7f1
equal deleted inserted replaced
460:3a1c99845ca8 461:717e71109645
   581                              AND p.visible=1;'; // Only indexes "visible" pages
   581                              AND p.visible=1;'; // Only indexes "visible" pages
   582     return $texts;
   582     return $texts;
   583   }
   583   }
   584   
   584   
   585   /**
   585   /**
   586    * Rebuilds the search index
   586    * Builds a word list for search indexing.
   587    * @param bool If true, prints out status messages
   587    * @param string Text to index
       
   588    * @param string Page ID of the page being indexed
       
   589    * @param string Title of the page being indexed
       
   590    * @return array List of words
   588    */
   591    */
   589    
   592   
   590   function rebuild_search_index($verbose = false)
   function calculate_word_list($text, $page_id, $page_name)
   {
     // The page ID is passed through dirtify_page_id() before being tokenized.
     $page_id = dirtify_page_id($page_id);
     
     // Anything that is not a letter, digit or apostrophe becomes a word separator.
     $separator_regex = '/[^a-z0-9\']/i';
     $haystack = preg_replace($separator_regex, ' ', $text) . ' '
               . preg_replace($separator_regex, ' ', $page_id) . ' '
               . preg_replace($separator_regex, ' ', $page_name);
     
     // Build a fresh list instead of unsetting elements of the array being walked by
     // reference; this avoids both the dangling reference and holes in the key sequence.
     $words = array();
     foreach ( explode(' ', $haystack) as $word )
     {
       // Strip runs of two or more apostrophes; single apostrophes are kept.
       if ( strpos($word, "''") !== false )
         $word = preg_replace("/[']{2,}/", '', $word);
       
       // Drop empty fragments and one-character words
       if ( strlen($word) < 2 )
         continue;
       
       $words[] = $word;
     }
     
     // array_unique() preserves keys, so reindex *after* de-duplicating in order to
     // return a true list (0..n-1) in first-occurrence order.
     return array_values(array_unique($words));
   }
       
   611   
       
   612   /**
       
   613    * Rebuilds the site's entire search index. Considerably more exciting if run from the command line.
       
   614    * @param bool If true, verbose output.
       
   615    * @param bool If true, verbose + debugging output.
       
   616    */
       
   617   
       
   function rebuild_search_index($verbose = false, $debug = false)
   {
     global $db, $session, $paths, $template, $plugins; // Common objects
     
     // A full rebuild can run for a long time; lift the execution time limit where allowed.
     @set_time_limit(0);
     
     // Start from an empty index. The table name carries the configured prefix, like
     // the UPDATE and INSERT statements issued further down.
     $q = $db->sql_query('DELETE FROM ' . table_prefix . 'search_index;');
     if ( !$q )
       $db->_die();
     
     $sha1_blank = sha1('');
     $is_mysql = ( ENANO_DBLAYER == 'MYSQL' );
     $from_web = isset($_SERVER['REQUEST_URI']);
     
     //
     // Index $pages_in_batch pages at a time
     //
     $pages_in_batch = 15;
     
     // Only visible pages whose password is empty (or the SHA1 of an empty string) are indexed.
     $page_filter = "  LEFT JOIN " . table_prefix . "pages AS p\n"
                  . "    ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n"
                  . "  WHERE ( p.password = '' OR p.password = '$sha1_blank' )\n"
                  . "    AND ( p.visible = 1 )";
     
     // First find out how many pages there are
     $q = $db->sql_query('SELECT COUNT(p.urlname) AS num_pages FROM ' . table_prefix . "page_text AS t\n"
                       . $page_filter . ";");
     if ( !$q )
       $db->_die();
     
     list($num_pages) = $db->fetchrow_num();
     $num_pages = intval($num_pages);
     $loops = ceil($num_pages / $pages_in_batch);
     
     // Words that already have a row in the index, stored as array keys so that the
     // membership test is an exact string match (a loose in_array() treats numeric-looking
     // words such as "007" and "7" as equal) and does not rescan a growing list.
     $indexed_words = array();
     $stopwords = get_stopwords();
     
     for ( $round = 1; $round <= $loops; $round++ )
     {
       $offset = ( $round - 1 ) * $pages_in_batch;
       
       if ( $verbose && $debug )
       {
         echo "Running indexing round $round of $loops (offset $offset)\n" . ( $from_web ? '<br />' : '' );
       }
       
       // "LIMIT count OFFSET start" is used rather than the MySQL-only "LIMIT start, count"
       // form because this method also runs on the non-MySQL backend. The ORDER BY gives
       // the batches a stable order so that consecutive windows neither overlap nor skip rows.
       $texts = $db->sql_query('SELECT p.name, t.page_id, t.namespace, t.page_text FROM ' . table_prefix . "page_text AS t\n"
                             . $page_filter . "\n"
                             . "  ORDER BY t.namespace, t.page_id\n"
                             . "  LIMIT $pages_in_batch OFFSET $offset;", false);
       if ( !$texts )
         $db->_die();
       
       $k = $offset;
       
       while ( $row = $db->fetchrow($texts) )
       {
         $k++;
         if ( $verbose )
         {
           $mu = memory_get_usage();
           echo "  Indexing page $k of $num_pages: {$row['namespace']}:{$row['page_id']}";
           if ( $debug )
             echo ", mem = $mu...";
           flush();
         }
         
         // Indexing identifier for the page in the DB
         $page_uniqid = "ns={$row['namespace']};pid=" . sanitize_page_id($row['page_id']);
         $page_uniqid = $db->escape($page_uniqid);
         
         // List of words on the page
         $wordlist = $this->calculate_word_list($row['page_text'], $row['page_id'], $row['name']);
         
         // Index calculation complete -- run inserts
         $inserts = array();
         foreach ( $wordlist as $word )
         {
           // Skip stopwords, plain integers and words shorter than three characters
           if ( in_array($word, $stopwords) || strval(intval($word)) === $word || strlen($word) < 3 )
             continue;
           
           $word_db = $db->escape($word);
           if ( !isset($indexed_words[$word]) )
           {
             // First sighting of this word: queue a new row for the batched INSERT below
             $inserts[] = "( '$word_db', '$page_uniqid' )";
             $indexed_words[$word] = true;
           }
           else
           {
             // Word already has a row: append this page to its comma-separated page list
             if ( $verbose && $debug )
               echo '.';
             $pid_col = $is_mysql ?
                         "CONCAT( page_names, ',$page_uniqid' )":
                         "page_names || ',$page_uniqid'";
             $q = $db->sql_query('UPDATE ' . table_prefix . "search_index SET page_names = $pid_col WHERE word = '$word_db';", false);
             if ( !$q )
               $db->_die();
           }
         }
         
         if ( count($inserts) > 0 )
         {
           if ( $verbose && $debug )
             echo 'i';
           $inserts = implode(",\n  ", $inserts);
           $q = $db->sql_query('INSERT INTO ' . table_prefix . "search_index(word, page_names) VALUES\n  $inserts;", false);
           if ( !$q )
             $db->_die();
         }
         
         if ( $verbose )
         {
           if ( $from_web )
             echo '<br />';
           echo "\n";
         }
         
         // Release the per-page data before fetching the next row
         unset($inserts, $wordlist, $page_uniqid);
       }
       $db->free_result($texts);
     }
     
     if ( $verbose )
     {
       echo "Indexing complete.";
       if ( $from_web )
         echo '<br />';
       echo "\n";
     }
     return true;
   }
   662   
   749   
   663   /**
   750   /**
   664    * Partially rebuilds the search index, removing/inserting entries only for the current page
   751    * Partially rebuilds the search index, removing/inserting entries only for the current page
   665    * @param string $page_id
   752    * @param string $page_id