/**
 * Get the unique words on a page. Returns an array listing all items in small array $arr1 that are not in very
 * large array $arr2. Values are compared strictly (===). Every occurrence of a word that exists in $arr2 is
 * dropped, including repeats inside $arr1. Surviving items keep their relative order and are reindexed from 0.
 * @param array Small list of candidate words (the caller passes the word list of a single page)
 * @param array Large list of already-known words (the caller passes the master word list)
 * @return array Items of $arr1 that do not occur in $arr2
 */

function get_unique_words($arr1, $arr2)
{
    $total = count($arr1);

    // Walk the large array once and flag every key of the small array that holds the same value.
    // array_keys() with a search value returns ALL matching keys; array_search() stops at the first
    // match, which would let later duplicates in $arr1 slip through as "unique".
    $no = array();
    foreach ( $arr2 as $w )
    {
        foreach ( array_keys($arr1, $w, true) as $k )
        {
            $no[$k] = true;
        }
        // Nothing left to exclude once every key of $arr1 has been flagged.
        if ( count($no) === $total )
            break;
    }

    // Collect whatever was not flagged, preserving the original order of $arr1.
    $ret = array();
    foreach ( $arr1 as $k => $w )
    {
        if ( !isset($no[$k]) )
            $ret[] = $w;
    }
    return $ret;
}
'; if ( count($inserts) > 0 ) { if ( $verbose && $debug ) @@ -899,14 +933,14 @@ $db->_die(); } - $master_word_list = array_unique(array_merge($master_word_list, $wordlist)); + $master_word_list = array_merge($master_word_list, $unique_words); if ( $verbose ) { if ( isset($_SERVER['REQUEST_URI']) ) echo '
'; echo "\n"; } - unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word, $row); + unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word, $row, $unique_words); } while ( $row = $db->fetchrow($texts) ); }