Modified Text_Wiki parser to fully support UTF-8 strings; several other UTF-8 fixes, international characters seem to work reasonably well now
author Dan
Sun, 29 Jul 2007 17:40:36 -0400
changeset 78 4df25dfdde63
parent 77 63ca29eda873
child 79 5faff33a6580
Modified Text_Wiki parser to fully support UTF-8 strings; several other UTF-8 fixes, international characters seem to work reasonably well now
includes/clientside/static/ajax.js
includes/comment.php
includes/functions.php
includes/pageutils.php
includes/wikiengine/Render/Xhtml.php
includes/wikiformat.php
schema.sql
--- a/includes/clientside/static/ajax.js	Sat Jul 28 18:11:14 2007 -0400
+++ b/includes/clientside/static/ajax.js	Sun Jul 29 17:40:36 2007 -0400
@@ -164,7 +164,7 @@
   // IE <6 pseudo-compatibility
   if ( KILL_SWITCH )
     return true;
-  goBusy('Saving page...');
+  //goBusy('Saving page...');
   var text = ajaxEscape($('ajaxEditArea').getContent());
   if(document.mdgAjaxEditor.minor.checked) minor='&minor';
   else minor='';
--- a/includes/comment.php	Sat Jul 28 18:11:14 2007 -0400
+++ b/includes/comment.php	Sun Jul 29 17:40:36 2007 -0400
@@ -82,6 +82,7 @@
     global $db, $session, $paths, $template, $plugins; // Common objects
     $parser = new Services_JSON(SERVICES_JSON_LOOSE_TYPE);
     $data = $parser->decode($json);
+    $data = decode_unicode_array($data);
     if ( !isset($data['mode']) )
     {
       return $parser->encode(Array('mode'=>'error','error'=>'No mode defined!'));
--- a/includes/functions.php	Sat Jul 28 18:11:14 2007 -0400
+++ b/includes/functions.php	Sun Jul 29 17:40:36 2007 -0400
@@ -1418,6 +1418,10 @@
   $_GET     = strip_nul_chars($_GET);
   $_COOKIE  = strip_nul_chars($_COOKIE);
   $_REQUEST = strip_nul_chars($_REQUEST);
+  $_POST    = decode_unicode_array($_POST);
+  $_GET     = decode_unicode_array($_GET);
+  $_COOKIE  = decode_unicode_array($_COOKIE);
+  $_REQUEST = decode_unicode_array($_REQUEST);
 }
 
 /**
@@ -2578,6 +2582,80 @@
   return $haystack;
 }
 
+/**
+ * Decodes %uXXXX escape sequences (the format produced by JavaScript's escape())
+ * into UTF-8; from http://us2.php.net/urldecode. Malformed sequences are kept as-is.
+ * @param string The string possibly containing %uXXXX sequences
+ * @return string The input with every valid %uXXXX replaced by its UTF-8 bytes
+ */
+
+function decode_unicode_url($str)
+{
+  $res = '';
+  $i = 0;
+  // A complete escape needs 6 bytes, so the last possible start is strlen - 6
+  $max = strlen($str) - 6;
+  while ($i <= $max)
+  {
+    $character = $str[$i];
+    // strspn() == 4 ensures all four characters are hex digits; hexdec() alone
+    // skips invalid characters, so "100%unique" used to decode to a NUL byte
+    if ($character == '%' && $str[$i + 1] == 'u'
+        && strspn($str, '0123456789abcdefABCDEF', $i + 2, 4) == 4)
+    {
+      $value = hexdec(substr($str, $i + 2, 4));
+      $i += 6;
+      if ($value < 0x0080)
+      {
+        // 1 byte: 0xxxxxxx
+        $character = chr($value);
+      }
+      else if ($value < 0x0800)
+      {
+        // 2 bytes: 110xxxxx 10xxxxxx
+        $character = chr((($value & 0x07c0) >> 6) | 0xc0)
+                   . chr(($value & 0x3f) | 0x80);
+      }
+      else
+      {
+        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+        $character = chr((($value & 0xf000) >> 12) | 0xe0)
+                   . chr((($value & 0x0fc0) >> 6) | 0x80)
+                   . chr(($value & 0x3f) | 0x80);
+      }
+    }
+    else
+    {
+      $i++;
+    }
+    $res .= $character;
+  }
+  // Fewer than 6 bytes remain, so no further escape can start; copy verbatim
+  return $res . substr($str, $i);
+}
+
+/**
+ * Recursively decodes %uXXXX sequences in every string of a (nested) array.
+ * Values that are neither strings nor arrays are passed through unchanged.
+ * @param array Can be multi-depth
+ * @return array
+ */
+
+function decode_unicode_array($array)
+{
+  // A non-array argument would make foreach emit a warning; return it untouched
+  if ( !is_array($array) ) return $array;
+  foreach ( $array as $i => $val )
+  {
+    if ( is_string($val) )
+      $array[$i] = decode_unicode_url($val);
+    else if ( is_array($val) )
+      $array[$i] = decode_unicode_array($val);
+    // anything else (int, bool, NULL, object) is kept as-is
+  }
+  return $array;
+}
+
 //die('<pre>Original:  01010101010100101010100101010101011010'."\nProcessed: ".uncompress_bitfield(compress_bitfield('01010101010100101010100101010101011010')).'</pre>');
 
 ?>
--- a/includes/pageutils.php	Sat Jul 28 18:11:14 2007 -0400
+++ b/includes/pageutils.php	Sun Jul 29 17:40:36 2007 -0400
@@ -838,9 +838,10 @@
       $_ob .= '<p>There are currently no comments on this '.strtolower($namespace).'';
       if($namespace != 'Article') $_ob .= ' page';
       $_ob .= '.</p>';
-    } else $_ob .= '<p>There '.$s.' on this article.</p>';
+    } else $_ob .= '<p>There '.$s.' on this article.';
     if($session->get_permissions('mod_comments') && $num_unapp > 0) $_ob .= ' <span style="color: #D84308">'.$num_unapp.' of those are unapproved.</span>';
     elseif(!$session->get_permissions('mod_comments') && $num_unapp > 0) { $u = ($num_unapp == 1) ? "is $num_unapp comment" : "are $num_unapp comments"; $_ob .= ' However, there ' . $u . ' awating approval.'; }
+    $_ob .= '</p>';
     $list = 'list = { ';
     // _die(htmlspecialchars($ttext));
     $i = -1;
@@ -895,10 +896,10 @@
         if($session->get_permissions('edit_comments'))
         {
           // Edit link
-          $strings['EDIT_LINK'] = '<a href="'.makeUrlNS($namespace, $page_id, 'do=comments&amp;sub=editcomment&amp;id='.$row['comment_id']).'" onclick="editComment(\''.$i.'\'); return false;" id="editbtn_'.$i.'">edit</a>';
+          $strings['EDIT_LINK'] = '<a href="'.makeUrlNS($namespace, $page_id, 'do=comments&amp;sub=editcomment&amp;id='.$row['comment_id']).'" id="editbtn_'.$i.'">edit</a>';
         
           // Delete link
-          $strings['DELETE_LINK'] = '<a href="'.makeUrlNS($namespace, $page_id, 'do=comments&amp;sub=deletecomment&amp;id='.$row['comment_id']).'" onclick="ajaxDeleteComment(\''.$i.'\'); return false;">delete</a>';
+          $strings['DELETE_LINK'] = '<a href="'.makeUrlNS($namespace, $page_id, 'do=comments&amp;sub=deletecomment&amp;id='.$row['comment_id']).'">delete</a>';
         }
         else
         {
@@ -917,12 +918,12 @@
         
         // Mod links
         $applink = '';
-        $applink .= '<a href="'.makeUrlNS($namespace, $page_id, 'do=comments&amp;sub=admin&amp;action=approve&amp;id='.$row['comment_id']).'" onclick="ajaxCommentAdmin(\'approve\', \''.$i.'\'); return false;" id="mdgApproveLink'.$i.'">';
+        $applink .= '<a href="'.makeUrlNS($namespace, $page_id, 'do=comments&amp;sub=admin&amp;action=approve&amp;id='.$row['comment_id']).'" id="mdgApproveLink'.$i.'">';
         if($row['approved']) $applink .= 'Unapprove';
         else $applink .= 'Approve';
         $applink .= '</a>';
         $strings['MOD_APPROVE_LINK'] = $applink; unset($applink);
-        $strings['MOD_DELETE_LINK'] = '<a href="'.makeUrlNS($namespace, $page_id, 'do=comments&amp;sub=admin&amp;action=delete&amp;id='.$row['comment_id']).'" onclick="ajaxCommentAdmin(\'delete\', \''.$i.'\'); return false;">Delete</a>';
+        $strings['MOD_DELETE_LINK'] = '<a href="'.makeUrlNS($namespace, $page_id, 'do=comments&amp;sub=admin&amp;action=delete&amp;id='.$row['comment_id']).'">Delete</a>';
         
         // Signature
         $strings['SIGNATURE'] = '';
--- a/includes/wikiengine/Render/Xhtml.php	Sat Jul 28 18:11:14 2007 -0400
+++ b/includes/wikiengine/Render/Xhtml.php	Sun Jul 29 17:40:36 2007 -0400
@@ -61,11 +61,12 @@
 
         // have to check null and false because HTML_ENTITIES is a zero
         if ($type === HTML_ENTITIES) {
+          /*
 
             // keep a copy of the translated version of the delimiter
             // so we can convert it back.
             $new_delim = htmlentities($this->wiki->delim, $quotes, $charset);
-      
+            
             // convert the entities.  we silence the call here so that
             // errors about charsets don't pop up, per counsel from
             // Jan at Horde.  (http://pear.php.net/bugs/bug.php?id=4474)
@@ -84,6 +85,7 @@
             $text = str_replace(
               $new_delim, $this->wiki->delim, $text
             );
+          */
 
         } elseif ($type === HTML_SPECIALCHARS) {
     
--- a/includes/wikiformat.php	Sat Jul 28 18:11:14 2007 -0400
+++ b/includes/wikiformat.php	Sun Jul 29 17:40:36 2007 -0400
@@ -379,7 +379,7 @@
       if ($this->isError($result)) {
         return $result;
       }
-
+      
       if (is_object($this->formatObj[$format])) {
         $output .= $this->formatObj[$format]->pre();
       }
@@ -387,7 +387,7 @@
       foreach (array_keys($this->_countRulesTokens) as $rule) {
         $this->loadRenderObj($format, $rule);
       }
-
+      
       $k = strlen($this->source);
       for ($i = 0; $i < $k; $i++) {
 
--- a/schema.sql	Sat Jul 28 18:11:14 2007 -0400
+++ b/schema.sql	Sun Jul 29 17:40:36 2007 -0400
@@ -14,7 +14,7 @@
   page_id varchar(64),
   namespace varchar(64),
   category_id varchar(64)
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}comments(
   comment_id int(12) NOT NULL auto_increment,
@@ -27,12 +27,12 @@
   user_id mediumint(8) NOT NULL DEFAULT -1,
   time int(12) NOT NULL DEFAULT 0,
   PRIMARY KEY ( comment_id )
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}config(
   config_name varchar(63),
   config_value text
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}logs(
   log_type varchar(16),
@@ -46,14 +46,14 @@
   author varchar(63),
   edit_summary text,
   minor_edit tinyint(1)
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}page_text(
   page_id varchar(63),
   namespace varchar(16) NOT NULL default 'Article',
   page_text text,
   char_tag varchar(63)
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}pages(
   page_order int(8),
@@ -68,7 +68,7 @@
   delvotes int(10) NOT NULL default 0,
   password varchar(40) NOT NULL DEFAULT '',
   delvote_ips text NOT NULL
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}session_keys(
   session_key varchar(32),
@@ -77,7 +77,7 @@
   auth_level tinyint(1) NOT NULL default '0',
   source_ip varchar(10) default '0x7f000001',
   time bigint(15) default '0'
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}themes(
   theme_id varchar(63),
@@ -85,7 +85,7 @@
   theme_order smallint(5) NOT NULL default '1',
   default_style varchar(63) NOT NULL DEFAULT '',
   enabled tinyint(1) NOT NULL default '1'
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}users(
   user_id mediumint(8) NOT NULL auto_increment,
@@ -105,7 +105,7 @@
   temp_password_time int(12) NOT NULL DEFAULT 0,
   user_coppa tinyint(1) NOT NULL DEFAULT 0,
   PRIMARY KEY  (user_id)
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}users_extra(
   user_id mediumint(8) NOT NULL,
@@ -119,7 +119,7 @@
   user_hobbies text,
   email_public tinyint(1) NOT NULL DEFAULT 0,
   PRIMARY KEY ( user_id ) 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}banlist(
   ban_id mediumint(8) NOT NULL auto_increment,
@@ -128,7 +128,7 @@
   is_regex tinyint(1) DEFAULT 0,
   reason text,
   PRIMARY KEY ( ban_id ) 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}files(
   file_id int(12) NOT NULL auto_increment,
@@ -140,7 +140,7 @@
   file_extension varchar(8) default NULL,
   file_key varchar(32) NOT NULL,
   PRIMARY KEY (file_id) 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}buddies(
   buddy_id int(15) NOT NULL auto_increment,
@@ -148,7 +148,7 @@
   buddy_user_id mediumint(8),
   is_friend tinyint(1) NOT NULL default '1',
   PRIMARY KEY  (buddy_id) 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}privmsgs(
   message_id int(15) NOT NULL auto_increment,
@@ -160,7 +160,7 @@
   folder_name varchar(63),
   message_read tinyint(1) NOT NULL DEFAULT 0,
   PRIMARY KEY  (message_id) 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}sidebar(
   item_id smallint(3) NOT NULL auto_increment,
@@ -171,7 +171,7 @@
   block_type tinyint(1) NOT NULL DEFAULT 0,
   block_content text,
   PRIMARY KEY ( item_id )
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}hits(
   hit_id bigint(20) NOT NULL auto_increment,
@@ -180,13 +180,13 @@
   page_id varchar(63),
   namespace varchar(63),
   PRIMARY KEY ( hit_id ) 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}search_index(
   word varbinary(64) NOT NULL,
   page_names text,
   PRIMARY KEY ( word ) 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}groups(
   group_id mediumint(5) UNSIGNED NOT NULL auto_increment,
@@ -194,7 +194,7 @@
   group_type tinyint(1) NOT NULL DEFAULT 1,
   PRIMARY KEY ( group_id ),
   system_group tinyint(1) NOT NULL DEFAULT 0 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}group_members(
   member_id int(12) UNSIGNED NOT NULL auto_increment,
@@ -203,7 +203,7 @@
   is_mod tinyint(1) NOT NULL DEFAULT 0,
   pending tinyint(1) NOT NULL DEFAULT 0,
   PRIMARY KEY ( member_id ) 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}acl(
   rule_id int(12) UNSIGNED NOT NULL auto_increment,
@@ -213,7 +213,7 @@
   namespace varchar(24),
   rules text,
   PRIMARY KEY ( rule_id ) 
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 CREATE TABLE {{TABLE_PREFIX}}search_cache(
   search_id int(15) NOT NULL auto_increment,
@@ -221,7 +221,7 @@
   query text,
   results longblob,
   PRIMARY KEY ( search_id )
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 -- Added in 1.0.1
 
@@ -231,7 +231,7 @@
   pg_name varchar(255) NOT NULL DEFAULT '',
   pg_target varchar(255) DEFAULT NULL,
   PRIMARY KEY ( pg_id )
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 -- Added in 1.0.1
 
@@ -241,7 +241,7 @@
   page_id varchar(63) NOT NULL,
   namespace varchar(63) NOT NULL DEFAULT 'Article',
   PRIMARY KEY ( pg_member_id )
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 -- Added in 1.0.1
 
@@ -252,7 +252,7 @@
   namespace varchar(255) NOT NULL,
   user mediumint(8) NOT NULL DEFAULT 1,
   PRIMARY KEY ( tag_id )
-) CHARACTER SET `utf8` COLLATE `utf8_bin`;
+) CHARACTER SET `utf8`;
 
 INSERT INTO {{TABLE_PREFIX}}config(config_name, config_value) VALUES
   ('site_name', '{{SITE_NAME}}'),