includes/diff.php
changeset 1 fe660c52c48f
child 536 218a627eb53e
equal deleted inserted replaced
0:902822492a68 1:fe660c52c48f
       
     1 <?php
       
     2 
       
     3 // A PHP diff engine for phpwiki. (Taken from phpwiki-1.3.3)
       
     4 //
       
     5 // Copyright (C) 2000, 2001 Geoffrey T. Dairiki <dairiki@dairiki.org>
       
     6 // You may copy this code freely under the conditions of the GPL.
       
     7 //
       
     8 
       
     9 define('USE_ASSERTS', function_exists('assert'));
       
    10 
       
    11 /**
       
    12  * @todo document
       
    13  * @package Enano
       
    14  * @subpackage DifferenceEngine
       
    15  */
       
    16 class _DiffOp {
       
    17 	var $type;
       
    18 	var $orig;
       
    19 	var $closing;
       
    20 
       
    21 	function reverse() {
       
    22 		trigger_error('pure virtual', E_USER_ERROR);
       
    23 	}
       
    24 
       
    25 	function norig() {
       
    26 		return $this->orig ? sizeof($this->orig) : 0;
       
    27 	}
       
    28 
       
    29 	function nclosing() {
       
    30 		return $this->closing ? sizeof($this->closing) : 0;
       
    31 	}
       
    32 }
       
    33 
       
    34 /**
       
    35  * @todo document
       
    36  * @package Enano
       
    37  * @subpackage DifferenceEngine
       
    38  */
       
    39 class _DiffOp_Copy extends _DiffOp {
       
    40 	var $type = 'copy';
       
    41 
       
    42 	function _DiffOp_Copy ($orig, $closing = false) {
       
    43 		if (!is_array($closing))
       
    44 			$closing = $orig;
       
    45 		$this->orig = $orig;
       
    46 		$this->closing = $closing;
       
    47 	}
       
    48 
       
    49 	function reverse() {
       
    50 		return new _DiffOp_Copy($this->closing, $this->orig);
       
    51 	}
       
    52 }
       
    53 
       
    54 /**
       
    55  * @todo document
       
    56  * @package Enano
       
    57  * @subpackage DifferenceEngine
       
    58  */
       
    59 class _DiffOp_Delete extends _DiffOp {
       
    60 	var $type = 'delete';
       
    61 
       
    62 	function _DiffOp_Delete ($lines) {
       
    63 		$this->orig = $lines;
       
    64 		$this->closing = false;
       
    65 	}
       
    66 
       
    67 	function reverse() {
       
    68 		return new _DiffOp_Add($this->orig);
       
    69 	}
       
    70 }
       
    71 
       
    72 /**
       
    73  * @todo document
       
    74  * @package Enano
       
    75  * @subpackage DifferenceEngine
       
    76  */
       
    77 class _DiffOp_Add extends _DiffOp {
       
    78 	var $type = 'add';
       
    79 
       
    80 	function _DiffOp_Add ($lines) {
       
    81 		$this->closing = $lines;
       
    82 		$this->orig = false;
       
    83 	}
       
    84 
       
    85 	function reverse() {
       
    86 		return new _DiffOp_Delete($this->closing);
       
    87 	}
       
    88 }
       
    89 
       
    90 /**
       
    91  * @todo document
       
    92  * @package Enano
       
    93  * @subpackage DifferenceEngine
       
    94  */
       
    95 class _DiffOp_Change extends _DiffOp {
       
    96 	var $type = 'change';
       
    97 
       
    98 	function _DiffOp_Change ($orig, $closing) {
       
    99 		$this->orig = $orig;
       
   100 		$this->closing = $closing;
       
   101 	}
       
   102 
       
   103 	function reverse() {
       
   104 		return new _DiffOp_Change($this->closing, $this->orig);
       
   105 	}
       
   106 }
       
   107 
       
   108 
       
   109 /**
       
   110  * Class used internally by Diff to actually compute the diffs.
       
   111  *
       
   112  * The algorithm used here is mostly lifted from the perl module
       
   113  * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
       
   114  *	 http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
       
   115  *
       
   116  * More ideas are taken from:
       
   117  *	 http://www.ics.uci.edu/~eppstein/161/960229.html
       
   118  *
       
   119  * Some ideas are (and a bit of code) are from from analyze.c, from GNU
       
   120  * diffutils-2.7, which can be found at:
       
   121  *	 ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
       
   122  *
       
   123  * closingly, some ideas (subdivision by NCHUNKS > 2, and some optimizations)
       
   124  * are my own.
       
   125  *
       
   126  * Line length limits for robustness added by Tim Starling, 2005-08-31
       
   127  *
       
   128  * @author Geoffrey T. Dairiki, Tim Starling
       
   129  * @package Enano
       
   130  * @subpackage DifferenceEngine
       
   131  */
       
   132 define('MAX_XREF_LENGTH', 10000);
       
   133 class _DiffEngine
       
   134 {
       
   135 	function diff ($from_lines, $to_lines) {
       
   136 		$fname = '_DiffEngine::diff';
       
   137 		// wfProfileIn( $fname );
       
   138 
       
   139 		$n_from = sizeof($from_lines);
       
   140 		$n_to = sizeof($to_lines);
       
   141 
       
   142 		$this->xchanged = $this->ychanged = array();
       
   143 		$this->xv = $this->yv = array();
       
   144 		$this->xind = $this->yind = array();
       
   145 		unset($this->seq);
       
   146 		unset($this->in_seq);
       
   147 		unset($this->lcs);
       
   148 
       
   149 		// Skip leading common lines.
       
   150 		for ($skip = 0; $skip < $n_from && $skip < $n_to; $skip++) {
       
   151 			if ($from_lines[$skip] !== $to_lines[$skip])
       
   152 				break;
       
   153 			$this->xchanged[$skip] = $this->ychanged[$skip] = false;
       
   154 		}
       
   155 		// Skip trailing common lines.
       
   156 		$xi = $n_from; $yi = $n_to;
       
   157 		for ($endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++) {
       
   158 			if ($from_lines[$xi] !== $to_lines[$yi])
       
   159 				break;
       
   160 			$this->xchanged[$xi] = $this->ychanged[$yi] = false;
       
   161 		}
       
   162 
       
   163 		// Ignore lines which do not exist in both files.
       
   164 		for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
       
   165 			$xhash[$this->_line_hash($from_lines[$xi])] = 1;
       
   166 		}
       
   167 
       
   168 		for ($yi = $skip; $yi < $n_to - $endskip; $yi++) {
       
   169 			$line = $to_lines[$yi];
       
   170 			if ( ($this->ychanged[$yi] = empty($xhash[$this->_line_hash($line)])) )
       
   171 				continue;
       
   172 			$yhash[$this->_line_hash($line)] = 1;
       
   173 			$this->yv[] = $line;
       
   174 			$this->yind[] = $yi;
       
   175 		}
       
   176 		for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
       
   177 			$line = $from_lines[$xi];
       
   178 			if ( ($this->xchanged[$xi] = empty($yhash[$this->_line_hash($line)])) )
       
   179 				continue;
       
   180 			$this->xv[] = $line;
       
   181 			$this->xind[] = $xi;
       
   182 		}
       
   183 
       
   184 		// Find the LCS.
       
   185 		$this->_compareseq(0, sizeof($this->xv), 0, sizeof($this->yv));
       
   186 
       
   187 		// Merge edits when possible
       
   188 		$this->_shift_boundaries($from_lines, $this->xchanged, $this->ychanged);
       
   189 		$this->_shift_boundaries($to_lines, $this->ychanged, $this->xchanged);
       
   190 
       
   191 		// Compute the edit operations.
       
   192 		$edits = array();
       
   193 		$xi = $yi = 0;
       
   194 		while ($xi < $n_from || $yi < $n_to) {
       
   195 			USE_ASSERTS && assert($yi < $n_to || $this->xchanged[$xi]);
       
   196 			USE_ASSERTS && assert($xi < $n_from || $this->ychanged[$yi]);
       
   197 
       
   198 			// Skip matching "snake".
       
   199 			$copy = array();
       
   200 			while ( $xi < $n_from && $yi < $n_to
       
   201 					&& !$this->xchanged[$xi] && !$this->ychanged[$yi]) {
       
   202 				$copy[] = $from_lines[$xi++];
       
   203 				++$yi;
       
   204 			}
       
   205 			if ($copy)
       
   206 				$edits[] = new _DiffOp_Copy($copy);
       
   207 
       
   208 			// Find deletes & adds.
       
   209 			$delete = array();
       
   210 			while ($xi < $n_from && $this->xchanged[$xi])
       
   211 				$delete[] = $from_lines[$xi++];
       
   212 
       
   213 			$add = array();
       
   214 			while ($yi < $n_to && $this->ychanged[$yi])
       
   215 				$add[] = $to_lines[$yi++];
       
   216 
       
   217 			if ($delete && $add)
       
   218 				$edits[] = new _DiffOp_Change($delete, $add);
       
   219 			elseif ($delete)
       
   220 				$edits[] = new _DiffOp_Delete($delete);
       
   221 			elseif ($add)
       
   222 				$edits[] = new _DiffOp_Add($add);
       
   223 		}
       
   224 		// wfProfileOut( $fname );
       
   225 		return $edits;
       
   226 	}
       
   227 
       
   228 	/**
       
   229 	 * Returns the whole line if it's small enough, or the MD5 hash otherwise
       
   230 	 */
       
   231 	function _line_hash( $line ) {
       
   232 		if ( strlen( $line ) > MAX_XREF_LENGTH ) {
       
   233 			return md5( $line );
       
   234 		} else {
       
   235 			return $line;
       
   236 		}
       
   237 	}
       
   238 
       
   239 
       
   240 	/* Divide the Largest Common Subsequence (LCS) of the sequences
       
   241 	 * [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally
       
   242 	 * sized segments.
       
   243 	 *
       
   244 	 * Returns (LCS, PTS).	LCS is the length of the LCS. PTS is an
       
   245 	 * array of NCHUNKS+1 (X, Y) indexes giving the diving points between
       
   246 	 * sub sequences.  The first sub-sequence is contained in [X0, X1),
       
   247 	 * [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on.  Note
       
   248 	 * that (X0, Y0) == (XOFF, YOFF) and
       
   249 	 * (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
       
   250 	 *
       
   251 	 * This function assumes that the first lines of the specified portions
       
   252 	 * of the two files do not match, and likewise that the last lines do not
       
   253 	 * match.  The caller must trim matching lines from the beginning and end
       
   254 	 * of the portions it is going to specify.
       
   255 	 */
       
   256 	function _diag ($xoff, $xlim, $yoff, $ylim, $nchunks) {
       
   257 		$fname = '_DiffEngine::_diag';
       
   258 		// wfProfileIn( $fname );
       
   259 		$flip = false;
       
   260 
       
   261 		if ($xlim - $xoff > $ylim - $yoff) {
       
   262 			// Things seems faster (I'm not sure I understand why)
       
   263 				// when the shortest sequence in X.
       
   264 				$flip = true;
       
   265 			list ($xoff, $xlim, $yoff, $ylim)
       
   266 			= array( $yoff, $ylim, $xoff, $xlim);
       
   267 		}
       
   268 
       
   269 		if ($flip)
       
   270 			for ($i = $ylim - 1; $i >= $yoff; $i--)
       
   271 				$ymatches[$this->xv[$i]][] = $i;
       
   272 		else
       
   273 			for ($i = $ylim - 1; $i >= $yoff; $i--)
       
   274 				$ymatches[$this->yv[$i]][] = $i;
       
   275 
       
   276 		$this->lcs = 0;
       
   277 		$this->seq[0]= $yoff - 1;
       
   278 		$this->in_seq = array();
       
   279 		$ymids[0] = array();
       
   280 
       
   281 		$numer = $xlim - $xoff + $nchunks - 1;
       
   282 		$x = $xoff;
       
   283 		for ($chunk = 0; $chunk < $nchunks; $chunk++) {
       
   284 			// wfProfileIn( "$fname-chunk" );
       
   285 			if ($chunk > 0)
       
   286 				for ($i = 0; $i <= $this->lcs; $i++)
       
   287 					$ymids[$i][$chunk-1] = $this->seq[$i];
       
   288 
       
   289 			$x1 = $xoff + (int)(($numer + ($xlim-$xoff)*$chunk) / $nchunks);
       
   290 			for ( ; $x < $x1; $x++) {
       
   291 				$line = $flip ? $this->yv[$x] : $this->xv[$x];
       
   292 					if (empty($ymatches[$line]))
       
   293 						continue;
       
   294 				$matches = $ymatches[$line];
       
   295 				reset($matches);
       
   296 				while (list ($junk, $y) = each($matches))
       
   297 					if (empty($this->in_seq[$y])) {
       
   298 						$k = $this->_lcs_pos($y);
       
   299 						USE_ASSERTS && assert($k > 0);
       
   300 						$ymids[$k] = $ymids[$k-1];
       
   301 						break;
       
   302 					}
       
   303 				while (list ($junk, $y) = each($matches)) {
       
   304 					if ($y > $this->seq[$k-1]) {
       
   305 						USE_ASSERTS && assert($y < $this->seq[$k]);
       
   306 						// Optimization: this is a common case:
       
   307 						//	next match is just replacing previous match.
       
   308 						$this->in_seq[$this->seq[$k]] = false;
       
   309 						$this->seq[$k] = $y;
       
   310 						$this->in_seq[$y] = 1;
       
   311 					} else if (empty($this->in_seq[$y])) {
       
   312 						$k = $this->_lcs_pos($y);
       
   313 						USE_ASSERTS && assert($k > 0);
       
   314 						$ymids[$k] = $ymids[$k-1];
       
   315 					}
       
   316 				}
       
   317 			}
       
   318 			// wfProfileOut( "$fname-chunk" );
       
   319 		}
       
   320 
       
   321 		$seps[] = $flip ? array($yoff, $xoff) : array($xoff, $yoff);
       
   322 		$ymid = $ymids[$this->lcs];
       
   323 		for ($n = 0; $n < $nchunks - 1; $n++) {
       
   324 			$x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks);
       
   325 			$y1 = $ymid[$n] + 1;
       
   326 			$seps[] = $flip ? array($y1, $x1) : array($x1, $y1);
       
   327 		}
       
   328 		$seps[] = $flip ? array($ylim, $xlim) : array($xlim, $ylim);
       
   329 
       
   330 		// wfProfileOut( $fname );
       
   331 		return array($this->lcs, $seps);
       
   332 	}
       
   333 
       
   334 	function _lcs_pos ($ypos) {
       
   335 		$fname = '_DiffEngine::_lcs_pos';
       
   336 		// wfProfileIn( $fname );
       
   337 
       
   338 		$end = $this->lcs;
       
   339 		if ($end == 0 || $ypos > $this->seq[$end]) {
       
   340 			$this->seq[++$this->lcs] = $ypos;
       
   341 			$this->in_seq[$ypos] = 1;
       
   342 			// wfProfileOut( $fname );
       
   343 			return $this->lcs;
       
   344 		}
       
   345 
       
   346 		$beg = 1;
       
   347 		while ($beg < $end) {
       
   348 			$mid = (int)(($beg + $end) / 2);
       
   349 			if ( $ypos > $this->seq[$mid] )
       
   350 				$beg = $mid + 1;
       
   351 			else
       
   352 				$end = $mid;
       
   353 		}
       
   354 
       
   355 		USE_ASSERTS && assert($ypos != $this->seq[$end]);
       
   356 
       
   357 		$this->in_seq[$this->seq[$end]] = false;
       
   358 		$this->seq[$end] = $ypos;
       
   359 		$this->in_seq[$ypos] = 1;
       
   360 		// wfProfileOut( $fname );
       
   361 		return $end;
       
   362 	}
       
   363 
       
   364 	/* Find LCS of two sequences.
       
   365 	 *
       
   366 	 * The results are recorded in the vectors $this->{x,y}changed[], by
       
   367 	 * storing a 1 in the element for each line that is an insertion
       
   368 	 * or deletion (ie. is not in the LCS).
       
   369 	 *
       
   370 	 * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
       
   371 	 *
       
   372 	 * Note that XLIM, YLIM are exclusive bounds.
       
   373 	 * All line numbers are origin-0 and discarded lines are not counted.
       
   374 	 */
       
   375 	function _compareseq ($xoff, $xlim, $yoff, $ylim) {
       
   376 		$fname = '_DiffEngine::_compareseq';
       
   377 		// wfProfileIn( $fname );
       
   378 
       
   379 		// Slide down the bottom initial diagonal.
       
   380 		while ($xoff < $xlim && $yoff < $ylim
       
   381 			   && $this->xv[$xoff] == $this->yv[$yoff]) {
       
   382 			++$xoff;
       
   383 			++$yoff;
       
   384 		}
       
   385 
       
   386 		// Slide up the top initial diagonal.
       
   387 		while ($xlim > $xoff && $ylim > $yoff
       
   388 			   && $this->xv[$xlim - 1] == $this->yv[$ylim - 1]) {
       
   389 			--$xlim;
       
   390 			--$ylim;
       
   391 		}
       
   392 
       
   393 		if ($xoff == $xlim || $yoff == $ylim)
       
   394 			$lcs = 0;
       
   395 		else {
       
   396 			// This is ad hoc but seems to work well.
       
   397 			//$nchunks = sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5);
       
   398 			//$nchunks = max(2,min(8,(int)$nchunks));
       
   399 			$nchunks = min(7, $xlim - $xoff, $ylim - $yoff) + 1;
       
   400 			list ($lcs, $seps)
       
   401 			= $this->_diag($xoff,$xlim,$yoff, $ylim,$nchunks);
       
   402 		}
       
   403 
       
   404 		if ($lcs == 0) {
       
   405 			// X and Y sequences have no common subsequence:
       
   406 			// mark all changed.
       
   407 			while ($yoff < $ylim)
       
   408 				$this->ychanged[$this->yind[$yoff++]] = 1;
       
   409 			while ($xoff < $xlim)
       
   410 				$this->xchanged[$this->xind[$xoff++]] = 1;
       
   411 		} else {
       
   412 			// Use the partitions to split this problem into subproblems.
       
   413 			reset($seps);
       
   414 			$pt1 = $seps[0];
       
   415 			while ($pt2 = next($seps)) {
       
   416 				$this->_compareseq ($pt1[0], $pt2[0], $pt1[1], $pt2[1]);
       
   417 				$pt1 = $pt2;
       
   418 			}
       
   419 		}
       
   420 		// wfProfileOut( $fname );
       
   421 	}
       
   422 
       
   423 	/* Adjust inserts/deletes of identical lines to join changes
       
   424 	 * as much as possible.
       
   425 	 *
       
   426 	 * We do something when a run of changed lines include a
       
   427 	 * line at one end and has an excluded, identical line at the other.
       
   428 	 * We are free to choose which identical line is included.
       
   429 	 * `compareseq' usually chooses the one at the beginning,
       
   430 	 * but usually it is cleaner to consider the following identical line
       
   431 	 * to be the "change".
       
   432 	 *
       
   433 	 * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
       
   434 	 */
       
   435 	function _shift_boundaries ($lines, &$changed, $other_changed) {
       
   436 		$fname = '_DiffEngine::_shift_boundaries';
       
   437 		// wfProfileIn( $fname );
       
   438 		$i = 0;
       
   439 		$j = 0;
       
   440 
       
   441 		USE_ASSERTS && assert('sizeof($lines) == sizeof($changed)');
       
   442 		$len = sizeof($lines);
       
   443 		$other_len = sizeof($other_changed);
       
   444 
       
   445 		while (1) {
       
   446 			/*
       
   447 			 * Scan forwards to find beginning of another run of changes.
       
   448 			 * Also keep track of the corresponding point in the other file.
       
   449 			 *
       
   450 			 * Throughout this code, $i and $j are adjusted together so that
       
   451 			 * the first $i elements of $changed and the first $j elements
       
   452 			 * of $other_changed both contain the same number of zeros
       
   453 			 * (unchanged lines).
       
   454 			 * Furthermore, $j is always kept so that $j == $other_len or
       
   455 			 * $other_changed[$j] == false.
       
   456 			 */
       
   457 			while ($j < $other_len && $other_changed[$j])
       
   458 				$j++;
       
   459 
       
   460 			while ($i < $len && ! $changed[$i]) {
       
   461 				USE_ASSERTS && assert('$j < $other_len && ! $other_changed[$j]');
       
   462 				$i++; $j++;
       
   463 				while ($j < $other_len && $other_changed[$j])
       
   464 					$j++;
       
   465 			}
       
   466 
       
   467 			if ($i == $len)
       
   468 				break;
       
   469 
       
   470 			$start = $i;
       
   471 
       
   472 			// Find the end of this run of changes.
       
   473 			while (++$i < $len && $changed[$i])
       
   474 				continue;
       
   475 
       
   476 			do {
       
   477 				/*
       
   478 				 * Record the length of this run of changes, so that
       
   479 				 * we can later determine whether the run has grown.
       
   480 				 */
       
   481 				$runlength = $i - $start;
       
   482 
       
   483 				/*
       
   484 				 * Move the changed region back, so long as the
       
   485 				 * previous unchanged line matches the last changed one.
       
   486 				 * This merges with previous changed regions.
       
   487 				 */
       
   488 				while ($start > 0 && $lines[$start - 1] == $lines[$i - 1]) {
       
   489 					$changed[--$start] = 1;
       
   490 					$changed[--$i] = false;
       
   491 					while ($start > 0 && $changed[$start - 1])
       
   492 						$start--;
       
   493 					USE_ASSERTS && assert('$j > 0');
       
   494 					while ($other_changed[--$j])
       
   495 						continue;
       
   496 					USE_ASSERTS && assert('$j >= 0 && !$other_changed[$j]');
       
   497 				}
       
   498 
       
   499 				/*
       
   500 				 * Set CORRESPONDING to the end of the changed run, at the last
       
   501 				 * point where it corresponds to a changed run in the other file.
       
   502 				 * CORRESPONDING == LEN means no such point has been found.
       
   503 				 */
       
   504 				$corresponding = $j < $other_len ? $i : $len;
       
   505 
       
   506 				/*
       
   507 				 * Move the changed region forward, so long as the
       
   508 				 * first changed line matches the following unchanged one.
       
   509 				 * This merges with following changed regions.
       
   510 				 * Do this second, so that if there are no merges,
       
   511 				 * the changed region is moved forward as far as possible.
       
   512 				 */
       
   513 				while ($i < $len && $lines[$start] == $lines[$i]) {
       
   514 					$changed[$start++] = false;
       
   515 					$changed[$i++] = 1;
       
   516 					while ($i < $len && $changed[$i])
       
   517 						$i++;
       
   518 
       
   519 					USE_ASSERTS && assert('$j < $other_len && ! $other_changed[$j]');
       
   520 					$j++;
       
   521 					if ($j < $other_len && $other_changed[$j]) {
       
   522 						$corresponding = $i;
       
   523 						while ($j < $other_len && $other_changed[$j])
       
   524 							$j++;
       
   525 					}
       
   526 				}
       
   527 			} while ($runlength != $i - $start);
       
   528 
       
   529 			/*
       
   530 			 * If possible, move the fully-merged run of changes
       
   531 			 * back to a corresponding run in the other file.
       
   532 			 */
       
   533 			while ($corresponding < $i) {
       
   534 				$changed[--$start] = 1;
       
   535 				$changed[--$i] = 0;
       
   536 				USE_ASSERTS && assert('$j > 0');
       
   537 				while ($other_changed[--$j])
       
   538 					continue;
       
   539 				USE_ASSERTS && assert('$j >= 0 && !$other_changed[$j]');
       
   540 			}
       
   541 		}
       
   542 		// wfProfileOut( $fname );
       
   543 	}
       
   544 }
       
   545 
       
   546 /**
       
   547  * Class representing a 'diff' between two sequences of strings.
       
   548  * @todo document
       
   549  * @package Enano
       
   550  * @subpackage DifferenceEngine
       
   551  */
       
   552 class Diff
       
   553 {
       
   554 	var $edits;
       
   555 
       
   556 	/**
       
   557 	 * Constructor.
       
   558 	 * Computes diff between sequences of strings.
       
   559 	 *
       
   560 	 * @param $from_lines array An array of strings.
       
   561 	 *		  (Typically these are lines from a file.)
       
   562 	 * @param $to_lines array An array of strings.
       
   563 	 */
       
   564 	function Diff($from_lines, $to_lines) {
       
   565 		$eng = new _DiffEngine;
       
   566 		$this->edits = $eng->diff($from_lines, $to_lines);
       
   567 		//$this->_check($from_lines, $to_lines);
       
   568 	}
       
   569 
       
   570 	/**
       
   571 	 * Compute reversed Diff.
       
   572 	 *
       
   573 	 * SYNOPSIS:
       
   574 	 *
       
   575 	 *	$diff = new Diff($lines1, $lines2);
       
   576 	 *	$rev = $diff->reverse();
       
   577 	 * @return object A Diff object representing the inverse of the
       
   578 	 *				  original diff.
       
   579 	 */
       
   580 	function reverse () {
       
   581 		$rev = $this;
       
   582 		$rev->edits = array();
       
   583 		foreach ($this->edits as $edit) {
       
   584 			$rev->edits[] = $edit->reverse();
       
   585 		}
       
   586 		return $rev;
       
   587 	}
       
   588 
       
   589 	/**
       
   590 	 * Check for empty diff.
       
   591 	 *
       
   592 	 * @return bool True iff two sequences were identical.
       
   593 	 */
       
   594 	function isEmpty () {
       
   595 		foreach ($this->edits as $edit) {
       
   596 			if ($edit->type != 'copy')
       
   597 				return false;
       
   598 		}
       
   599 		return true;
       
   600 	}
       
   601 
       
   602 	/**
       
   603 	 * Compute the length of the Longest Common Subsequence (LCS).
       
   604 	 *
       
   605 	 * This is mostly for diagnostic purposes.
       
   606 	 *
       
   607 	 * @return int The length of the LCS.
       
   608 	 */
       
   609 	function lcs () {
       
   610 		$lcs = 0;
       
   611 		foreach ($this->edits as $edit) {
       
   612 			if ($edit->type == 'copy')
       
   613 				$lcs += sizeof($edit->orig);
       
   614 		}
       
   615 		return $lcs;
       
   616 	}
       
   617 
       
   618 	/**
       
   619 	 * Get the original set of lines.
       
   620 	 *
       
   621 	 * This reconstructs the $from_lines parameter passed to the
       
   622 	 * constructor.
       
   623 	 *
       
   624 	 * @return array The original sequence of strings.
       
   625 	 */
       
   626 	function orig() {
       
   627 		$lines = array();
       
   628 
       
   629 		foreach ($this->edits as $edit) {
       
   630 			if ($edit->orig)
       
   631 				array_splice($lines, sizeof($lines), 0, $edit->orig);
       
   632 		}
       
   633 		return $lines;
       
   634 	}
       
   635 
       
   636 	/**
       
   637 	 * Get the closing set of lines.
       
   638 	 *
       
   639 	 * This reconstructs the $to_lines parameter passed to the
       
   640 	 * constructor.
       
   641 	 *
       
   642 	 * @return array The sequence of strings.
       
   643 	 */
       
   644 	function closing() {
       
   645 		$lines = array();
       
   646 
       
   647 		foreach ($this->edits as $edit) {
       
   648 			if ($edit->closing)
       
   649 				array_splice($lines, sizeof($lines), 0, $edit->closing);
       
   650 		}
       
   651 		return $lines;
       
   652 	}
       
   653 
       
   654 	/**
       
   655 	 * Check a Diff for validity.
       
   656 	 *
       
   657 	 * This is here only for debugging purposes.
       
   658 	 */
       
   659 	function _check ($from_lines, $to_lines) {
       
   660 		$fname = 'Diff::_check';
       
   661 		// wfProfileIn( $fname );
       
   662 		if (serialize($from_lines) != serialize($this->orig()))
       
   663 			trigger_error("Reconstructed original doesn't match", E_USER_ERROR);
       
   664 		if (serialize($to_lines) != serialize($this->closing()))
       
   665 			trigger_error("Reconstructed closing doesn't match", E_USER_ERROR);
       
   666 
       
   667 		$rev = $this->reverse();
       
   668 		if (serialize($to_lines) != serialize($rev->orig()))
       
   669 			trigger_error("Reversed original doesn't match", E_USER_ERROR);
       
   670 		if (serialize($from_lines) != serialize($rev->closing()))
       
   671 			trigger_error("Reversed closing doesn't match", E_USER_ERROR);
       
   672 
       
   673 
       
   674 		$prevtype = 'none';
       
   675 		foreach ($this->edits as $edit) {
       
   676 			if ( $prevtype == $edit->type )
       
   677 				trigger_error("Edit sequence is non-optimal", E_USER_ERROR);
       
   678 			$prevtype = $edit->type;
       
   679 		}
       
   680 
       
   681 		$lcs = $this->lcs();
       
   682 		trigger_error('Diff okay: LCS = '.$lcs, E_USER_NOTICE);
       
   683 		// wfProfileOut( $fname );
       
   684 	}
       
   685 }
       
   686 
       
   687 /**
       
   688  * FIXME: bad name.
       
   689  * @todo document
       
   690  * @package Enano
       
   691  * @subpackage DifferenceEngine
       
   692  */
       
   693 class MappedDiff extends Diff
       
   694 {
       
   695 	/**
       
   696 	 * Constructor.
       
   697 	 *
       
   698 	 * Computes diff between sequences of strings.
       
   699 	 *
       
   700 	 * This can be used to compute things like
       
   701 	 * case-insensitve diffs, or diffs which ignore
       
   702 	 * changes in white-space.
       
   703 	 *
       
   704 	 * @param $from_lines array An array of strings.
       
   705 	 *	(Typically these are lines from a file.)
       
   706 	 *
       
   707 	 * @param $to_lines array An array of strings.
       
   708 	 *
       
   709 	 * @param $mapped_from_lines array This array should
       
   710 	 *	have the same size number of elements as $from_lines.
       
   711 	 *	The elements in $mapped_from_lines and
       
   712 	 *	$mapped_to_lines are what is actually compared
       
   713 	 *	when computing the diff.
       
   714 	 *
       
   715 	 * @param $mapped_to_lines array This array should
       
   716 	 *	have the same number of elements as $to_lines.
       
   717 	 */
       
   718 	function MappedDiff($from_lines, $to_lines,
       
   719 						$mapped_from_lines, $mapped_to_lines) {
       
   720 		$fname = 'MappedDiff::MappedDiff';
       
   721 		// wfProfileIn( $fname );
       
   722 
       
   723 		assert(sizeof($from_lines) == sizeof($mapped_from_lines));
       
   724 		assert(sizeof($to_lines) == sizeof($mapped_to_lines));
       
   725 
       
   726 		$this->Diff($mapped_from_lines, $mapped_to_lines);
       
   727 
       
   728 		$xi = $yi = 0;
       
   729 		for ($i = 0; $i < sizeof($this->edits); $i++) {
       
   730 			$orig = &$this->edits[$i]->orig;
       
   731 			if (is_array($orig)) {
       
   732 				$orig = array_slice($from_lines, $xi, sizeof($orig));
       
   733 				$xi += sizeof($orig);
       
   734 			}
       
   735 
       
   736 			$closing = &$this->edits[$i]->closing;
       
   737 			if (is_array($closing)) {
       
   738 				$closing = array_slice($to_lines, $yi, sizeof($closing));
       
   739 				$yi += sizeof($closing);
       
   740 			}
       
   741 		}
       
   742 		// wfProfileOut( $fname );
       
   743 	}
       
   744 }
       
   745 
       
   746 /**
       
   747  * A class to format Diffs
       
   748  *
       
   749  * This class formats the diff in classic diff format.
       
   750  * It is intended that this class be customized via inheritance,
       
   751  * to obtain fancier outputs.
       
   752  * @todo document
       
   753  * @package Enano
       
   754  * @subpackage DifferenceEngine
       
   755  */
       
   756 class DiffFormatter
       
   757 {
       
   758 	/**
       
   759 	 * Number of leading context "lines" to preserve.
       
   760 	 *
       
   761 	 * This should be left at zero for this class, but subclasses
       
   762 	 * may want to set this to other values.
       
   763 	 */
       
   764 	var $leading_context_lines = 0;
       
   765 
       
   766 	/**
       
   767 	 * Number of trailing context "lines" to preserve.
       
   768 	 *
       
   769 	 * This should be left at zero for this class, but subclasses
       
   770 	 * may want to set this to other values.
       
   771 	 */
       
   772 	var $trailing_context_lines = 0;
       
   773 
       
   774 	/**
       
   775 	 * Format a diff.
       
   776 	 *
       
   777 	 * @param $diff object A Diff object.
       
   778 	 * @return string The formatted output.
       
   779 	 */
       
   780 	function format($diff) {
       
   781 		$fname = 'DiffFormatter::format';
       
   782 		// wfProfileIn( $fname );
       
   783 
       
   784 		$xi = $yi = 1;
       
   785 		$block = false;
       
   786 		$context = array();
       
   787 
       
   788 		$nlead = $this->leading_context_lines;
       
   789 		$ntrail = $this->trailing_context_lines;
       
   790 
       
   791 		$this->_start_diff();
       
   792 
       
   793 		foreach ($diff->edits as $edit) {
       
   794 			if ($edit->type == 'copy') {
       
   795 				if (is_array($block)) {
       
   796 					if (sizeof($edit->orig) <= $nlead + $ntrail) {
       
   797 						$block[] = $edit;
       
   798 					}
       
   799 					else{
       
   800 						if ($ntrail) {
       
   801 							$context = array_slice($edit->orig, 0, $ntrail);
       
   802 							$block[] = new _DiffOp_Copy($context);
       
   803 						}
       
   804 						$this->_block($x0, $ntrail + $xi - $x0,
       
   805 									  $y0, $ntrail + $yi - $y0,
       
   806 									  $block);
       
   807 						$block = false;
       
   808 					}
       
   809 				}
       
   810 				$context = $edit->orig;
       
   811 			}
       
   812 			else {
       
   813 				if (! is_array($block)) {
       
   814 					$context = array_slice($context, sizeof($context) - $nlead);
       
   815 					$x0 = $xi - sizeof($context);
       
   816 					$y0 = $yi - sizeof($context);
       
   817 					$block = array();
       
   818 					if ($context)
       
   819 						$block[] = new _DiffOp_Copy($context);
       
   820 				}
       
   821 				$block[] = $edit;
       
   822 			}
       
   823 
       
   824 			if ($edit->orig)
       
   825 				$xi += sizeof($edit->orig);
       
   826 			if ($edit->closing)
       
   827 				$yi += sizeof($edit->closing);
       
   828 		}
       
   829 
       
   830 		if (is_array($block))
       
   831 			$this->_block($x0, $xi - $x0,
       
   832 						  $y0, $yi - $y0,
       
   833 						  $block);
       
   834 
       
   835 		$end = $this->_end_diff();
       
   836 		// wfProfileOut( $fname );
       
   837 		return $end;
       
   838 	}
       
   839 
       
   840 	function _block($xbeg, $xlen, $ybeg, $ylen, &$edits) {
       
   841 		$fname = 'DiffFormatter::_block';
       
   842 		// wfProfileIn( $fname );
       
   843 		$this->_start_block($this->_block_header($xbeg, $xlen, $ybeg, $ylen));
       
   844 		foreach ($edits as $edit) {
       
   845 			if ($edit->type == 'copy')
       
   846 				$this->_context($edit->orig);
       
   847 			elseif ($edit->type == 'add')
       
   848 				$this->_added($edit->closing);
       
   849 			elseif ($edit->type == 'delete')
       
   850 				$this->_deleted($edit->orig);
       
   851 			elseif ($edit->type == 'change')
       
   852 				$this->_changed($edit->orig, $edit->closing);
       
   853 			else
       
   854 				trigger_error('Unknown edit type', E_USER_ERROR);
       
   855 		}
       
   856 		$this->_end_block();
       
   857 		// wfProfileOut( $fname );
       
   858 	}
       
   859 
       
   860 	function _start_diff() {
       
   861 		ob_start();
       
   862 	}
       
   863 
       
   864 	function _end_diff() {
       
   865 		$val = ob_get_contents();
       
   866 		ob_end_clean();
       
   867 		return $val;
       
   868 	}
       
   869 
       
   870 	function _block_header($xbeg, $xlen, $ybeg, $ylen) {
       
   871 		if ($xlen > 1)
       
   872 			$xbeg .= "," . ($xbeg + $xlen - 1);
       
   873 		if ($ylen > 1)
       
   874 			$ybeg .= "," . ($ybeg + $ylen - 1);
       
   875 
       
   876 		return $xbeg . ($xlen ? ($ylen ? 'c' : 'd') : 'a') . $ybeg;
       
   877 	}
       
   878 
       
   879 	function _start_block($header) {
       
   880 		echo $header;
       
   881 	}
       
   882 
       
   883 	function _end_block() {
       
   884 	}
       
   885 
       
   886 	function _lines($lines, $prefix = ' ') {
       
   887 		foreach ($lines as $line)
       
   888 			echo "$prefix $line\n";
       
   889 	}
       
   890 
       
   891 	function _context($lines) {
       
   892 		$this->_lines($lines);
       
   893 	}
       
   894 
       
   895 	function _added($lines) {
       
   896 		$this->_lines($lines, '>');
       
   897 	}
       
   898 	function _deleted($lines) {
       
   899 		$this->_lines($lines, '<');
       
   900 	}
       
   901 
       
   902 	function _changed($orig, $closing) {
       
   903 		$this->_deleted($orig);
       
   904 		echo "---\n";
       
   905 		$this->_added($closing);
       
   906 	}
       
   907 }
       
   908 
       
   909 
       
   910 /**
       
   911  *	Additions by Axel Boldt follow, partly taken from diff.php, phpwiki-1.3.3
       
   912  *
       
   913  */
       
   914 
       
   915 define('NBSP', '&#160;');			// iso-8859-x non-breaking space.
       
   916 
       
   917 /**
       
   918  * @todo document
       
   919  * @package Enano
       
   920  * @subpackage DifferenceEngine
       
   921  */
       
   922 class _HWLDF_WordAccumulator {
       
   923 	function _HWLDF_WordAccumulator () {
       
   924 		$this->_lines = array();
       
   925 		$this->_line = '';
       
   926 		$this->_group = '';
       
   927 		$this->_tag = '';
       
   928 	}
       
   929 
       
   930 	function _flushGroup ($new_tag) {
       
   931 		if ($this->_group !== '') {
       
   932 			if ($this->_tag == 'mark')
       
   933 				$this->_line .= '<span class="diffchange">' .
       
   934 					htmlspecialchars ( $this->_group ) . '</span>';
       
   935 			else
       
   936 				$this->_line .= htmlspecialchars ( $this->_group );
       
   937 		}
       
   938 		$this->_group = '';
       
   939 		$this->_tag = $new_tag;
       
   940 	}
       
   941 
       
   942 	function _flushLine ($new_tag) {
       
   943 		$this->_flushGroup($new_tag);
       
   944 		if ($this->_line != '')
       
   945 			array_push ( $this->_lines, $this->_line );
       
   946 		else
       
   947 			# make empty lines visible by inserting an NBSP
       
   948 			array_push ( $this->_lines, NBSP );
       
   949 		$this->_line = '';
       
   950 	}
       
   951 
       
   952 	function addWords ($words, $tag = '') {
       
   953 		if ($tag != $this->_tag)
       
   954 			$this->_flushGroup($tag);
       
   955 
       
   956 		foreach ($words as $word) {
       
   957 			// new-line should only come as first char of word.
       
   958 			if ($word == '')
       
   959 				continue;
       
   960 			if ($word[0] == "\n") {
       
   961 				$this->_flushLine($tag);
       
   962 				$word = substr($word, 1);
       
   963 			}
       
   964 			assert(!strstr($word, "\n"));
       
   965 			$this->_group .= $word;
       
   966 		}
       
   967 	}
       
   968 
       
   969 	function getLines() {
       
   970 		$this->_flushLine('~done');
       
   971 		return $this->_lines;
       
   972 	}
       
   973 }
       
   974 
       
   975 /**
       
   976  * @todo document
       
   977  * @package Enano
       
   978  * @subpackage DifferenceEngine
       
   979  */
       
   980 define('MAX_LINE_LENGTH', 10000);
       
   981 class WordLevelDiff extends MappedDiff
       
   982 {
       
   983 	function WordLevelDiff ($orig_lines, $closing_lines) {
       
   984 		$fname = 'WordLevelDiff::WordLevelDiff';
       
   985 		// wfProfileIn( $fname );
       
   986 
       
   987 		list ($orig_words, $orig_stripped) = $this->_split($orig_lines);
       
   988 		list ($closing_words, $closing_stripped) = $this->_split($closing_lines);
       
   989 
       
   990 		$this->MappedDiff($orig_words, $closing_words,
       
   991 						  $orig_stripped, $closing_stripped);
       
   992 		// wfProfileOut( $fname );
       
   993 	}
       
   994 
       
   995 	function _split($lines) {
       
   996 		$fname = 'WordLevelDiff::_split';
       
   997 		// wfProfileIn( $fname );
       
   998 
       
   999 		$words = array();
       
  1000 		$stripped = array();
       
  1001 		$first = true;
       
  1002 		foreach ( $lines as $line ) {
       
  1003 			# If the line is too long, just pretend the entire line is one big word
       
  1004 			# This prevents resource exhaustion problems
       
  1005 			if ( $first ) {
       
  1006 				$first = false;
       
  1007 			} else {
       
  1008 				$words[] = "\n";
       
  1009 				$stripped[] = "\n";
       
  1010 			}
       
  1011 			if ( strlen( $line ) > MAX_LINE_LENGTH ) {
       
  1012 				$words[] = $line;
       
  1013 				$stripped[] = $line;
       
  1014 			} else {
       
  1015 				if (preg_match_all('/ ( [^\S\n]+ | [0-9_A-Za-z\x80-\xff]+ | . ) (?: (?!< \n) [^\S\n])? /xs',
       
  1016 					$line, $m))
       
  1017 				{
       
  1018 					$words = array_merge( $words, $m[0] );
       
  1019 					$stripped = array_merge( $stripped, $m[1] );
       
  1020 				}
       
  1021 			}
       
  1022 		}
       
  1023 		// wfProfileOut( $fname );
       
  1024 		return array($words, $stripped);
       
  1025 	}
       
  1026 
       
  1027 	function orig () {
       
  1028 		$fname = 'WordLevelDiff::orig';
       
  1029 		// wfProfileIn( $fname );
       
  1030 		$orig = new _HWLDF_WordAccumulator;
       
  1031 
       
  1032 		foreach ($this->edits as $edit) {
       
  1033 			if ($edit->type == 'copy')
       
  1034 				$orig->addWords($edit->orig);
       
  1035 			elseif ($edit->orig)
       
  1036 				$orig->addWords($edit->orig, 'mark');
       
  1037 		}
       
  1038 		$lines = $orig->getLines();
       
  1039 		// wfProfileOut( $fname );
       
  1040 		return $lines;
       
  1041 	}
       
  1042 
       
  1043 	function closing () {
       
  1044 		$fname = 'WordLevelDiff::closing';
       
  1045 		// wfProfileIn( $fname );
       
  1046 		$closing = new _HWLDF_WordAccumulator;
       
  1047 
       
  1048 		foreach ($this->edits as $edit) {
       
  1049 			if ($edit->type == 'copy')
       
  1050 				$closing->addWords($edit->closing);
       
  1051 			elseif ($edit->closing)
       
  1052 				$closing->addWords($edit->closing, 'mark');
       
  1053 		}
       
  1054 		$lines = $closing->getLines();
       
  1055 		// wfProfileOut( $fname );
       
  1056 		return $lines;
       
  1057 	}
       
  1058 }
       
  1059 
       
  1060 /**
       
  1061  *	Wikipedia Table style diff formatter.
       
  1062  * @todo document
       
  1063  * @package Enano
       
  1064  * @subpackage DifferenceEngine
       
  1065  */
       
  1066 class TableDiffFormatter extends DiffFormatter
       
  1067 {
       
  1068 	function TableDiffFormatter() {
       
  1069 		$this->leading_context_lines = 2;
       
  1070 		$this->trailing_context_lines = 2;
       
  1071 	}
       
  1072 
       
  1073 	function _block_header( $xbeg, $xlen, $ybeg, $ylen ) {
       
  1074 		$r = '<tr><td colspan="2" align="left"><strong><!--LINE '.$xbeg."--></strong></td>\n" .
       
  1075 		  '<td colspan="2" align="left"><strong><!--LINE '.$ybeg."--></strong></td></tr>\n";
       
  1076 		return $r;
       
  1077 	}
       
  1078 
       
  1079 	function _start_block( $header ) {
       
  1080 		echo $header;
       
  1081 	}
       
  1082 
       
  1083 	function _end_block() {
       
  1084 	}
       
  1085 
       
  1086 	function _lines( $lines, $prefix=' ', $color='white' ) {
       
  1087 	}
       
  1088 
       
  1089 	# HTML-escape parameter before calling this
       
  1090 	function addedLine( $line ) {
       
  1091 		return "<td>+</td><td class='diff-addedline'>{$line}</td>";
       
  1092 	}
       
  1093 
       
  1094 	# HTML-escape parameter before calling this
       
  1095 	function deletedLine( $line ) {
       
  1096 		return "<td>-</td><td class='diff-deletedline'>{$line}</td>";
       
  1097 	}
       
  1098 
       
  1099 	# HTML-escape parameter before calling this
       
  1100 	function contextLine( $line ) {
       
  1101 		return "<td> </td><td class='diff-context'>{$line}</td>";
       
  1102 	}
       
  1103 
       
  1104 	function emptyLine() {
       
  1105 		return '<td colspan="2">&nbsp;</td>';
       
  1106 	}
       
  1107 
       
  1108 	function _added( $lines ) {
       
  1109 		foreach ($lines as $line) {
       
  1110 			echo '<tr>' . $this->emptyLine() .
       
  1111 				$this->addedLine( htmlspecialchars ( $line ) ) . "</tr>\n";
       
  1112 		}
       
  1113 	}
       
  1114 
       
  1115 	function _deleted($lines) {
       
  1116 		foreach ($lines as $line) {
       
  1117 			echo '<tr>' . $this->deletedLine( htmlspecialchars ( $line ) ) .
       
  1118 			  $this->emptyLine() . "</tr>\n";
       
  1119 		}
       
  1120 	}
       
  1121 
       
  1122 	function _context( $lines ) {
       
  1123 		foreach ($lines as $line) {
       
  1124 			echo '<tr>' .
       
  1125 				$this->contextLine( htmlspecialchars ( $line ) ) .
       
  1126 				$this->contextLine( htmlspecialchars ( $line ) ) . "</tr>\n";
       
  1127 		}
       
  1128 	}
       
  1129 
       
  1130 	function _changed( $orig, $closing ) {
       
  1131 		$fname = 'TableDiffFormatter::_changed';
       
  1132 		// wfProfileIn( $fname );
       
  1133 
       
  1134 		$diff = new WordLevelDiff( $orig, $closing );
       
  1135 		$del = $diff->orig();
       
  1136 		$add = $diff->closing();
       
  1137 
       
  1138 		# Notice that WordLevelDiff returns HTML-escaped output.
       
  1139 		# Hence, we will be calling addedLine/deletedLine without HTML-escaping.
       
  1140 
       
  1141 		while ( $line = array_shift( $del ) ) {
       
  1142 			$aline = array_shift( $add );
       
  1143 			echo '<tr>' . $this->deletedLine( $line ) .
       
  1144 				$this->addedLine( $aline ) . "</tr>\n";
       
  1145 		}
       
  1146 		foreach ($add as $line) {	# If any leftovers
       
  1147 			echo '<tr>' . $this->emptyLine() .
       
  1148 				$this->addedLine( $line ) . "</tr>\n";
       
  1149 		}
       
  1150 		// wfProfileOut( $fname );
       
  1151 	}
       
  1152 }
       
  1153