<?php

/*
 * includes/search.php
 *
 * NOTE(review): this text was recovered from a whitespace-collapsed changeset diff
 * (diff -r 902822492a68 -r fe660c52c48f, Wed Jun 13 16:07:17 2007 -0400, all 636 lines added).
 * Everything between the start of the file and the body of the first function was
 * missing from the recovered text, so the original file header is not reproduced here.
 */

/**
 * Merges two arrays while keeping their keys. Entries of $arr2 overwrite entries of
 * $arr1 that share a key; keys that only exist in $arr1 keep their position.
 *
 * NOTE(review): only the loop body and the return statement survived in the recovered
 * text. The signature and the initial copy of $arr1 are reconstructed from that body and
 * from the two-argument call sites in Searcher - confirm against the upstream file.
 *
 * @param array $arr1 Base array
 * @param array $arr2 Array whose entries are written on top of $arr1
 * @return array
 */
function enano_safe_array_merge($arr1, $arr2)
{
  $arr3 = $arr1;
  foreach($arr2 as $k => $v)
  {
    $arr3[$k] = $v;
  }
  return $arr3;
}

/**
 * Algorithm to actually do the searching. This system usually works pretty fast (tested and developed on a site with 22 pages) but one
 * caveat of this algorithm is that it has to load the entire index into memory. It also requires manual parsing of the search query
 * which can be quite CPU-intensive. On the flip side this algorithm is extremely flexible and can be adapted for other uses very easily.
 *
 * Most of the time, this system is disabled. It is only used when MySQL can't or won't allow FULLTEXT indices.
 *
 * @package Enano
 * @subpackage Page management frontend
 * @license GNU General Public License http://www.enanocms.org/Special:GNU_General_Public_License
 */

class Searcher
{

  /**
   * Results of the last search()/searchMySQL() call: result key => highlighted excerpt.
   * @var array
   */
  var $results;

  /**
   * Word index built by buildIndex(): word => comma-separated list of text keys.
   * @var array
   */
  var $index;

  /**
   * De-duplicated list of human-readable warnings raised while parsing queries.
   * @var array
   */
  var $warnings;

  /**
   * When false (the default) all term comparisons are done on lower-cased text.
   * @var bool
   */
  var $match_case = false;

  function __construct()
  {
    $this->warnings = Array();
  }

  /**
   * Old-style constructor name, kept so existing callers of Searcher() keep working.
   */
  function Searcher()
  {
    $this->__construct();
  }

  /**
   * Records a warning, ignoring messages that were already recorded.
   * @param string $t Warning text
   */
  function warn($t)
  {
    if(!in_array($t, $this->warnings, true))
      $this->warnings[] = $t;
  }

  /**
   * Normalizes text for comparison: returned untouched when $match_case is on, lower-cased otherwise.
   * @param string $text
   * @return string
   */
  function convertCase($text)
  {
    return ( $this->match_case ) ? $text : strtolower($text);
  }

  /**
   * Rebuilds $this->index from scratch. Every word of at least 4 characters is mapped to
   * the comma-joined keys of the texts that contain it. Words keep their original case.
   * @param array $texts Associative array: text key => text
   */
  function buildIndex($texts)
  {
    $this->index = Array();

    foreach($texts as $i => $text)
    {
      // Hide apostrophes behind a random, purely alphanumeric placeholder so that the
      // punctuation pass below does not split words such as "it's".
      $seed = md5(microtime(true) . mt_rand());
      $placeholder = 'xxxApoS' . $seed . 'xxx';
      $text = str_replace("'", $placeholder, $text);
      // Every run of non-word characters (and underscores) becomes a single space
      $text = preg_replace('#[\W_]+#', ' ', $text);
      $text = str_replace($placeholder, "'", $text);
      // Strip runs of two or more apostrophes. The previous code attempted this while the
      // apostrophes were still masked by the placeholder, so the pattern could never match.
      $text = preg_replace('#\'{2,}#', '', $text);
      // Keep ASCII letters, digits, apostrophes and spaces only. One regex pass replaces
      // the former per-character in_array() filter and yields the same string.
      $text = preg_replace('#[^A-Za-z0-9\' ]#', '', $text);

      // Text keys are unique, so de-duplicating the words of one text is enough to
      // guarantee that a key is listed at most once per word.
      $seen = Array();
      foreach(explode(' ', $text) as $word)
      {
        if(strlen($word) < 4 || isset($seen[$word]))
          continue;
        $seen[$word] = true;
        $this->index[$word][] = $i;
      }
    }

    foreach($this->index as $word => $keys)
    {
      $this->index[$word] = implode(',', $keys);
    }
  }

  /**
   * Searches an in-memory set of texts and stores the highlighted hits in $this->results.
   *
   * Procedure:
   *  * search for items that match all the terms in the correct order
   *  * search for items that match all the terms in any order
   *  * eliminate the last optional term and repeat
   *  * last resort: search for any of the terms instead of all of them
   *
   * @param string $query Raw search query
   * @param array $texts Associative array: text key => text (buildIndex() must have been called on it)
   */
  function search($query, $texts)
  {
    $this->_runSearch($query, $texts, 'performCoreSearch');
  }

  /**
   * Same procedure as search(), but $texts is a textual MySQL query.
   * See performCoreSearchMySQL() for the expected column layout.
   * @todo document
   * @param string $query Raw search query
   * @param string $texts SQL text
   */
  function searchMySQL($query, $texts)
  {
    $this->_runSearch($query, $texts, 'performCoreSearchMySQL');
  }

  /**
   * Shared driver behind search() and searchMySQL().
   * @param string $query Raw search query
   * @param mixed $texts Passed through to the core search method
   * @param string $method Name of the core search method to call on $this
   * @access private
   */
  function _runSearch($query, $texts, $method)
  {
    $this->results = Array();
    $parsed = $this->parseQuery($query);
    $working = $parsed;

    // Earlier passes are stricter; enano_safe_array_merge() keeps a key at the position
    // where it was first inserted, so the strictest hits stay at the top of the list.
    for($i = sizeof($working['any']) - 1; $i >= 0; $i--)
    {
      $res = $this->$method($working, $texts, true);
      $this->results = enano_safe_array_merge($this->results, $res);
      $res = $this->$method($working, $texts, false);
      $this->results = enano_safe_array_merge($this->results, $res);
      unset($working['any'][$i]);
    }

    // Last resort - search for any of the terms instead of all of 'em
    $res = $this->$method($parsed, $texts, false, true);
    $this->results = enano_safe_array_merge($this->results, $res);

    $this->highlightResults($parsed);
  }

  /**
   * This method assumes that $query is already parsed and $texts is an (associative) array of possible results
   * @param array $query A search query parsed with Searcher::parseQuery()
   * @param array $texts The list of possible results
   * @param bool $exact_order If true, only matches results with the terms in the same order as the terms in the query
   * @param bool $any If true, the optional ("any") terms are not required to be present in a result
   * @return array An associative array of results
   * @access private
   */
  function performCoreSearch($query, $texts, $exact_order = false, $any = false)
  {
    $results = Array();
    if($exact_order)
    {
      $query = $this->concatQueryTerms($query);
    }
    $query['trm'] = array_merge($query['any'], $query['req']);

    # Find all remotely possible results first
    // Single-word terms, looked up through the index
    foreach($this->_matchingIndexKeys($query['trm']) as $idxkey => $unused)
    {
      if(isset($texts[$idxkey]))
        $results[$idxkey] = $texts[$idxkey];
    }
    // Quoted terms (anything containing whitespace or punctuation) cannot be in the
    // word index, so they are looked for in the full texts.
    foreach($query['trm'] as $userterm)
    {
      if(!$this->_isPhraseTerm($userterm))
        continue;
      foreach($texts as $k => $t)
      {
        if(!isset($results[$k]) && $this->_hasTerm((string) $this->convertCase($t), $userterm))
          $results[$k] = $t;
      }
    }

    return $this->_filterResults($results, $query, $any);
  }

  /**
   * This is the same as performCoreSearch, but $texts is a textual MySQL query whose rows are streamed
   * instead of being held in an array. This can save tremendous amounts of memory on large sites.
   * @param array $query A search query parsed with Searcher::parseQuery()
   * @param string $texts A text MySQL query that selects the text as the first column and the index key as the second column
   * @param bool $exact_order If true, only matches results with the terms in the same order as the terms in the query
   * @param bool $any If true, the optional ("any") terms are not required to be present in a result
   * @return array An associative array of results
   * @access private
   */
  function performCoreSearchMySQL($query, $texts, $exact_order = false, $any = false)
  {
    global $db;
    $results = Array();
    if($exact_order)
    {
      $query = $this->concatQueryTerms($query);
    }
    $query['trm'] = array_merge($query['any'], $query['req']);

    # Find all remotely possible results first
    $texts = $db->sql_query($texts);
    if ( !$texts )
      $db->_die('The error is in the search engine.');

    // Both lookups below only depend on the query, so they are computed once instead of
    // rescanning the whole index for every row.
    $candidates = $this->_matchingIndexKeys($query['trm']);
    $phrases = Array();
    foreach($query['trm'] as $userterm)
    {
      if($this->_isPhraseTerm($userterm))
        $phrases[] = $userterm;
    }

    while( $r = $db->fetchrow_num($texts) )
    {
      if(!empty($candidates))
      {
        if(!isset($r[1]))
        {
          // A row without a key column cannot be matched against the index
          echo('PHP PARSER BUG: $row[1] is set but not set... includes/search.php:'.__LINE__);
          $GLOBALS['template']->footer();
          exit;
        }
        if(isset($candidates[$r[1]]))
          $results[$r[1]] = $r[0];
      }
      // Quoted terms
      foreach($phrases as $userterm)
      {
        if(!isset($results[$r[1]]) && $this->_hasTerm((string) $this->convertCase($r[0]), $userterm))
          $results[$r[1]] = $r[0];
      }
    }

    return $this->_filterResults($results, $query, $any);
  }

  /**
   * Collects the text keys listed in the index under any of the given terms.
   * Terms are compared as whole strings (after convertCase()), never numerically.
   * @param array $terms User-supplied terms
   * @return array Text key => true, in index order
   * @access private
   */
  function _matchingIndexKeys($terms)
  {
    $keys = Array();
    $wanted = Array();
    foreach($terms as $term)
    {
      $wanted[$this->convertCase($term)] = true;
    }
    if(empty($wanted) || !is_array($this->index))
      return $keys;

    foreach($this->index as $word => $joined)
    {
      if(!isset($wanted[$this->convertCase($word)]))
        continue;
      foreach(explode(',', $joined) as $key)
      {
        $keys[$key] = true;
      }
    }
    return $keys;
  }

  /**
   * Tells whether a term contains whitespace or punctuation and therefore has to be
   * searched in the full text instead of the word index.
   * @param string $term
   * @return bool
   * @access private
   */
  function _isPhraseTerm($term)
  {
    return (bool) preg_match('/[\s"\'~`!@#\$%\^&\*\(\)\{\}:;<>,.\/\?_-]/', $term);
  }

  /**
   * Substring test used by the result filters.
   * @param string $haystack Text that already went through convertCase()
   * @param string $term Raw term; convertCase() is applied here
   * @return bool False for an empty term
   * @access private
   */
  function _hasTerm($haystack, $term)
  {
    $needle = (string) $this->convertCase($term);
    if($needle === '')
      return false;
    return strpos($haystack, $needle) !== false;
  }

  /**
   * Drops results that contain an excluded term or lack a mandatory one.
   * Required terms are always mandatory; optional terms are mandatory unless $any is true.
   * @param array $results Result key => text
   * @param array $query Parsed query with 'any', 'req' and 'not' lists
   * @param bool $any
   * @return array The surviving results, order preserved
   * @access private
   */
  function _filterResults($results, $query, $any)
  {
    $mandatory = $any ? $query['req'] : array_merge($query['any'], $query['req']);

    foreach($results as $k => $r)
    {
      $haystack = (string) $this->convertCase($r);
      $keep = true;
      foreach($query['not'] as $not)
      {
        if($this->_hasTerm($haystack, $not))
        {
          $keep = false;
          break;
        }
      }
      if($keep)
      {
        foreach($mandatory as $term)
        {
          if(!$this->_hasTerm($haystack, $term))
          {
            $keep = false;
            break;
          }
        }
      }
      if(!$keep)
        unset($results[$k]);
    }
    return $results;
  }

  /**
   * Collapses all optional terms into one space-separated phrase, which is how the
   * "terms in the correct order" pass is expressed.
   * @param array $query Parsed query
   * @return array The query with 'any' replaced by a one-element list
   */
  function concatQueryTerms($query)
  {
    $query['any'] = Array(0 => implode(' ', $query['any']));
    return $query;
  }

  /**
   * Builds a basic assoc array with a more organized version of the query.
   *
   * Terms are separated by spaces; double quotes keep a phrase together. A leading "+"
   * makes a term required, a leading "-" excludes it, everything else is optional.
   * Terms shorter than 4 characters and duplicates are dropped with a warning, and at
   * most 20 terms are accepted.
   *
   * @param string $query Raw search query
   * @return array Keys 'any', 'req' and 'not', each a list of terms
   */
  function parseQuery($query)
  {
    $ret = array(
      'any' => array(),
      'req' => array(),
      'not' => array()
    );

    // Pass 1: cut the query at every space that is not inside double quotes
    $terms = array();
    $in_quote = false;
    $start_term = 0;
    $length = strlen($query);
    for ( $i = 0; $i < $length; $i++ )
    {
      $chr = $query[$i];
      $prev = ( $i > 0 ) ? $query[$i - 1] : '';
      $is_separator = ( $chr == ' ' && !$in_quote );
      $is_last = ( $i + 1 == $length );

      if ( $is_separator || $is_last )
      {
        // A separating space is never part of the term, not even when it is the last
        // character of the query; any other final character is.
        $len = $is_separator ? $i - $start_term : $i - $start_term + 1;
        $terms[] = (string) substr ( $query, $start_term, $len );
        $start_term = $i + 1;
      }
      elseif ( $chr == '"' && $in_quote && $prev != '\\' )
      {
        $in_quote = false;
      }
      elseif ( $chr == '"' && !$in_quote )
      {
        $in_quote = true;
      }
    }

    // Pass 2: classify each term by its prefix and strip the decoration
    $accepted = 0;
    foreach ( $terms as $atom )
    {
      if ( $accepted >= 20 )
      {
        $this->warn('Some of your search terms were excluded because searches are limited to 20 terms to prevent excessive server load.');
        break;
      }

      $first = (string) substr ( $atom, 0, 1 );
      $quoted_tail = ( substr ( $atom, -1 ) == '"' );

      if ( ( $first == '+' || $first == '-' ) && substr ( $atom, 1, 1 ) == '"' && $quoted_tail )
      {
        // +"phrase" or -"phrase"
        $bucket = ( $first == '+' ) ? 'req' : 'not';
        $word = substr ( $atom, 2, strlen ( $atom ) - 3 );
      }
      elseif ( $first == '+' || $first == '-' )
      {
        $bucket = ( $first == '+' ) ? 'req' : 'not';
        $word = substr ( $atom, 1 );
      }
      elseif ( $first == '"' && $quoted_tail )
      {
        $bucket = 'any';
        $word = substr ( $atom, 1, strlen ( $atom ) - 2 );
      }
      else
      {
        $bucket = 'any';
        $word = $atom;
      }

      if ( $this->_acceptTerm($ret[$bucket], (string) $word) )
        $accepted++;
    }
    return $ret;
  }

  /**
   * Appends $word to $bucket unless it is too short or already present; rejections are
   * reported through warn().
   * @param array $bucket One of the term lists of the parsed query (modified in place)
   * @param string $word
   * @return bool True when the word was added
   * @access private
   */
  function _acceptTerm(&$bucket, $word)
  {
    if ( strlen ( $word ) < 4 )
    {
      $this->warn('One or more of your search terms was excluded because terms must be at least 4 characters in length.');
      return false;
    }
    if ( in_array($word, $bucket, true) )
    {
      $this->warn('One or more of your search terms was excluded because duplicate terms were encountered.');
      return false;
    }
    $bucket[] = $word;
    return true;
  }

  /**
   * Locates the first occurrence of $needle in $haystack, honouring $match_case.
   * strtolower() keeps the byte length, so the offset is valid for the original text.
   * @param string $haystack
   * @param string $needle
   * @return int|false Byte offset, or false when absent or when $needle is empty
   * @access private
   */
  function _findTerm($haystack, $needle)
  {
    $needle = (string) $this->convertCase($needle);
    if($needle === '')
      return false;
    return strpos((string) $this->convertCase($haystack), $needle);
  }

  /**
   * Turns every entry of $this->results into a short excerpt around the first occurrence
   * of a search term, wrapping that occurrence in $starttag/$endtag. Terms are processed
   * one after another, each on the excerpts left by the previous one; entries in which a
   * term does not occur are left untouched by that term.
   *
   * NOTE(review): both tag defaults are empty strings here, so by default nothing is
   * wrapped around the match - confirm that this is intended.
   *
   * @param array $query Parsed query (optional and required terms are highlighted)
   * @param string $starttag Text inserted before the match
   * @param string $endtag Text inserted after the match
   */
  function highlightResults($query, $starttag = '', $endtag = '')
  {
    $terms = array_merge($query['any'], $query['req']);
    foreach($terms as $q)
    {
      $qlen = strlen($q);
      foreach($this->results as $k => $r)
      {
        $pos = $this->_findTerm($r, $q);
        if($pos === false)
          continue;

        // Wrap the text as it appears in the result, not as it was typed in the query
        $marked = $starttag . substr($r, $pos, $qlen) . $endtag;
        $full = substr($r, 0, $pos) . $marked . substr($r, $pos + $qlen);
        // Excerpt: start up to 75 bytes before the match, 150 bytes plus the marked term long
        $startplace = max(0, $pos - 75);
        $this->results[$k] = '...'.trim(substr($full, $startplace, strlen($marked) + 150)).'...';
      }
    }
  }

}

/**
 * Developer-friendly way to do searches. :-) Uses the MySQL FULLTEXT index type.
 * @package Enano
 * @subpackage Search
 */

class MySQL_Fulltext_Search {

  /**
   * Performs a search.
   * @param string The search query
   * @return resource MySQL result resource - this is an UNBUFFERED query.
   */

  function search($query)
  {
    global $db;

    $fulltext_col = 'MATCH(t.page_id,t.namespace,p.name,t.page_text) AGAINST (\'' . $db->escape($query) . '\' IN BOOLEAN MODE)';
    $sql = "SELECT t.page_text,CONCAT('ns=',t.namespace,';pid=',t.page_id) AS page_identifier, $fulltext_col AS score, CHAR_LENGTH(t.page_text) AS length FROM ".table_prefix."page_text AS t
              LEFT JOIN ".table_prefix."pages AS p
                ON ( p.urlname=t.page_id AND p.namespace=t.namespace)
              WHERE $fulltext_col > 0
                AND p.visible=1
              ORDER BY score DESC;";
    $q = $db->sql_unbuffered_query($sql);
    if ( !$q )
      $db->_die();

    return $q;
  }

  /**
   * Builds a highlighted excerpt of one result text for a raw search query.
   * @param string $query Raw search query; parsed with Searcher::parseQuery()
   * @param string $result Full text of the result
   * @return string|false Excerpt, or false when none of the terms occurs in the text
   */
  function highlight_result($query, $result)
  {
    $search = new Searcher();
    $parsed_query = $search->parseQuery($query);
    // The inner method needs the parsed term lists, not the raw query string
    return $this->highlight_result_inner($parsed_query, $result);
  }

  /**
   * Builds an excerpt of $fulltext around the first occurrence of a search term, with
   * that occurrence wrapped in $starttag/$endtag. Matching is case-insensitive. Every
   * term is looked up in the unmodified $fulltext, so when several terms occur the
   * excerpt of the last one is returned.
   *
   * @param array $query Parsed query as returned by Searcher::parseQuery()
   * @param string $fulltext Full text of the result
   * @param string $starttag Text inserted before the match
   * @param string $endtag Text inserted after the match
   * @return string|false Excerpt, or false when none of the terms occurs in the text
   */
  function highlight_result_inner($query, $fulltext, $starttag = '', $endtag = '')
  {
    $result = false;
    $terms = array_merge($query['any'], $query['req']);
    // strtolower() keeps the byte length, so offsets found here are valid in $fulltext
    $haystack = strtolower($fulltext);
    foreach($terms as $q)
    {
      $needle = strtolower($q);
      if($needle === '')
        continue;
      $pos = strpos($haystack, $needle);
      if($pos === false)
        continue;

      $qlen = strlen($q);
      $marked = $starttag . substr($fulltext, $pos, $qlen) . $endtag;
      $full = substr($fulltext, 0, $pos) . $marked . substr($fulltext, $pos + $qlen);
      $startplace = max(0, $pos - 75);
      $result = '...'.trim(substr($full, $startplace, strlen($marked) + 150)).'...';
    }
    return $result;
  }

}