Improved captcha word generation; fixed duplicate auth parameter in Special:Login privileged login; improved search indexer performance on websites with lots of words
author Dan
Mon, 26 Jul 2010 20:10:01 -0400
changeset 1266 f3933b355229
parent 1265 7091cff2ca01
child 1267 31ff2e5351b0
Improved captcha word generation; fixed duplicate auth parameter in Special:Login privileged login; improved search indexer performance on websites with lots of words
includes/functions.php
includes/paths.php
includes/sessions.php
plugins/SpecialUserFuncs.php
--- a/includes/functions.php	Sun Jul 25 11:23:09 2010 -0400
+++ b/includes/functions.php	Mon Jul 26 20:10:01 2010 -0400
@@ -2545,14 +2545,14 @@
 }
 
 /**
- * Paginates (breaks into multiple pages) a MySQL result resource, which is treated as unbuffered.
- * @param resource The MySQL result resource. This should preferably be an unbuffered query.
+ * Paginates (breaks into multiple pages) a database result resource, which is treated as unbuffered.
+ * @param resource The result resource. This should preferably be an unbuffered query, which allows scalability across very large result sets.
  * @param string A template, with variables being named after the column name
  * @param int The number of total results. This should be determined by a second query.
  * @param string sprintf-style formatting string for URLs for result pages. First parameter will be start offset.
  * @param int Optional. Start offset in individual results. Defaults to 0.
  * @param int Optional. The number of results per page. Defualts to 10.
- * @param int Optional. An associative array of functions to call, with key names being column names, and values being function names. Values can also be an array with key 0 being either an object or a string(class name) and key 1 being a [static] method.
+ * @param array Optional. An associative array of functions to call, with key names being column names, and values being callbacks (string or array(string, string) or array(object, string)). They can also be closures if you're OK with incompatibility with PHP <5.3.0.
  * @param string Optional. The text to be sent before the result list, only if there are any results. Possibly the start of a table.
  * @param string Optional. The text to be sent after the result list, only if there are any results. Possibly the end of a table.
  * @return string
--- a/includes/paths.php	Sun Jul 25 11:23:09 2010 -0400
+++ b/includes/paths.php	Mon Jul 26 20:10:01 2010 -0400
@@ -726,6 +726,32 @@
 	}
 	
 	/**
+	 * Get the unique words on a page. Returns a reindexed list, in original order, of all items in small array $arr1 that are not in very large array $arr2 (strict comparison). NOTE(review): array_search() reports only the first matching key, so a word repeated in $arr1 is filtered once and later copies are still returned - confirm callers pass de-duplicated lists.
+	 * @param array Small list to filter; the indexer passes the word list of the page being indexed.
+	 * @param array Very large list of already-known words; the indexer passes its master word list.
+	 * @return array Values of the first array whose keys were not matched by any entry of the second array.
+	 */
+	
+	function get_unique_words($arr1, $arr2)
+	{
+		$no = array(); // keys of $arr1 whose word was found in $arr2
+		foreach ( $arr2 as $w ) // walk the large list once, searching the small list for each of its words
+		{
+			if ( ($k = array_search($w, $arr1, true)) !== false ) // strict match; !== false because key 0 is a valid hit
+			{
+				$no[$k] = true;
+			}
+		}
+		$ret = array(); // surviving words, appended in $arr1 order with fresh numeric keys
+		foreach ( $arr1 as $k => $w )
+		{
+			if ( !isset($no[$k]) ) // this key was never matched against $arr2
+				$ret[] = $w;
+		}
+		return $ret;
+	}
+	
+	/**
  	* Builds a word list for search indexing.
  	* @param string Text to index
  	* @param string Page ID of the page being indexed
@@ -863,17 +889,22 @@
 					$page_uniqid = $db->escape($page_uniqid);
 					
 					// List of words on the page
+					if ( $debug )
+						echo "wordlist...";
 					$wordlist = $this->calculate_word_list($row['page_text'], $row['page_id'], $row['name']);
 					
 					// Index calculation complete -- run inserts
 					$inserts = array();
+					$qt = array();
+					$unique_words = $this->get_unique_words($wordlist, $master_word_list);
 					foreach ( $wordlist as $word )
 					{
+						$qs = microtime_float();
 						if ( in_array($word, $stopwords) || strval(intval($word)) === $word || strlen($word) < 3 )
 							continue;
 						$word_db = $db->escape($word);
 						$word_db_lc = $db->escape(strtolower($word));
-						if ( !in_array($word, $master_word_list) )
+						if ( in_array($word, $unique_words) )
 						{
 							$inserts[] = "( '$word_db', '$word_db_lc', '$page_uniqid' )";
 						}
@@ -888,7 +919,10 @@
 							if ( !$q )
 								$db->_die();
 						}
+						$qt[] = microtime_float() - $qs;
 					}
+					if ( $debug && count($qt) > 0 )
+						echo "QT: " . number_format(array_sum($qt) / count($qt), 4) . " * " . count($qt) . '; wl_len: ' . count($master_word_list) .' ';
 					if ( count($inserts) > 0 )
 					{
 						if ( $verbose && $debug )
@@ -899,14 +933,14 @@
 							$db->_die();
 					}
 					
-					$master_word_list = array_unique(array_merge($master_word_list, $wordlist));
+					$master_word_list = array_merge($master_word_list, $unique_words);
 					if ( $verbose )
 					{
 						if ( isset($_SERVER['REQUEST_URI']) )
 							echo '<br />';
 						echo "\n";
 					}
-					unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word, $row);
+					unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word, $row, $unique_words);
 				}
 				while ( $row = $db->fetchrow($texts) );
 			}
--- a/includes/sessions.php	Sun Jul 25 11:23:09 2010 -0400
+++ b/includes/sessions.php	Mon Jul 26 20:10:01 2010 -0400
@@ -3481,6 +3481,12 @@
 					$word .= 't';
 				else if ( $prev_l == 'p' && mt_rand(0, 5) == 1 )
 					$word .= 'h';
+				// this rule allows "ck" which can result in the occasional "dick", "fuck", etc. that tends
+				// to end up on 4chan, but I decided to keep it, because it increases word complexity.
+				else if ( $prev_l == 'c' && mt_rand(0, 3) == 1 )
+					$word .= 'k';
+				else if ( $prev_l == 'q' && mt_rand(0, 5) != 1 )
+					$word .= 'u';
 				else
 					$word .= $vowels{mt_rand(0, (strlen($vowels)-1))};
 			}
--- a/plugins/SpecialUserFuncs.php	Sun Jul 25 11:23:09 2010 -0400
+++ b/plugins/SpecialUserFuncs.php	Mon Jul 26 20:10:01 2010 -0400
@@ -391,7 +391,8 @@
 					$get_add = '';
 					foreach ( $get_fwd as $key => $value )
 					{
-						$get_add .= "&{$key}=" . urlencode($value);
+						if ( $key != 'auth' )
+							$get_add .= "&{$key}=" . urlencode($value);
 					}
 					$get_add = ltrim($get_add, '&');
 				}