Skip to content

Instantly share code, notes, and snippets.

@Varriount
Created October 21, 2013 05:59
Show Gist options
  • Save Varriount/7079279 to your computer and use it in GitHub Desktop.
Save Varriount/7079279 to your computer and use it in GitHub Desktop.
I present to you... Nimrod-Sharp
/*
* Copyright 2008 Google Inc. All Rights Reserved.
* Author: [email protected] (Neil Fraser)
* Author: [email protected] (Matthaeus G. Chajdas)
*
* Licensed under the Apache License, Version 2.0 (the "License")
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Diff Match and Patch
* http://code.google.com/p/google-diff-match-patch/
*/
using System
using System.Collections.Generic
using System.Linq
using System.Text
using System.Text.RegularExpressions
using System.Web
namespace DiffMatchPatch
internal static class CompatibilityExtensions
# JScript splice function
public static seq[T] Splice<T>(this seq[T] input, int start, int count,
params T[] objects)
seq[T] deletedRange = input.GetRange(start, count)
input.RemoveRange(start, count)
input.InsertRange(start, objects)
return deletedRange
# Java substring function
public static string JavaSubstring(this string s, int begin, int end)
return s[begin .. end - begin]
/**
* The data structure representing a diff is a List of Diff objects
* Diff(deletionOp, "Hello"), Diff(insertionOp, "Goodbye"),
* Diff(equalityOp, " world.")
* which means delete "Hello", add "Goodbye" and keep " world."
*/
public enum Operation
DELETE, INSERT, EQUAL
/**
* Class representing one diff operation.
*/
public class Diff
public Operation operation
# One of: INSERT, DELETE or EQUAL.
public string text
# The text associated with this diff operation.
##
## Constructor. Initializes the diff with the provided values.
## @param operation One of INSERT, DELETE or EQUAL.
## @param text The text being applied.
##
public Diff(Operation operation, string text)
# Construct a diff with the specified operation and text.
this.operation = operation
this.text = text
##
## Display a human-readable version of this Diff.
## @return text version.
##
public override string ToString()
string prettyText = this.text.Replace('\n', '\u00b6')
return "Diff(" + this.operation + ",\"" + prettyText + "\")"
##
## Is this Diff equivalent to another Diff?
## @param d Another Diff to compare against.
## @return true or false.
##
public override bool Equals(Object obj)
# If parameter is null return false.
if (obj == null)
return false
# If parameter cannot be cast to Diff return false.
Diff p = obj as Diff
if ((System.Object)p == null)
return false
# Return true if the fields match.
return p.operation == this.operation and p.text == this.text
public bool Equals(Diff obj)
# If parameter is null return false.
if (obj == null)
return false
# Return true if the fields match.
return obj.operation == this.operation and obj.text == this.text
public override int GetHashCode()
return text.GetHashCode() ^ operation.GetHashCode()
/**
* Class representing one patch operation.
*/
public class Patch
public seq[Diff] diffs = newSeq[Diff]()
public int startOne
public int startTwo
public int lengthOne
public int lengthTwo
##
## Emmulate GNU diff's format.
## Header: @@ -382,8 +481,9 @@
## Indicies are printed as 1-based, not 0-based.
## @return The GNU diff string.
##
public override string ToString()
string coordsOne, coordsTwo
if (this.lenOne == 0)
coordsOne = this.startOne + ",0"
elif (this.lenOne == 1)
coordsOne = Convert.ToString(this.startOne + 1)
else
coordsOne = (this.startOne + 1) + "," + this.lenOne
if (this.lenTwo == 0)
coordsTwo = this.startTwo + ",0"
elif (this.lenTwo == 1)
coordsTwo = Convert.ToString(this.startTwo + 1)
else
coordsTwo = (this.startTwo + 1) + "," + this.lenTwo
rope text = new rope()
text.add("@@ -").add(coordsOne).add(" +").add(coordsTwo)
.add(" @@\n")
# Escape the body of the patch with %xx notation.
for (Diff aDiff in this.diffs)
switch (aDiff.operation)
case insertionOp:
text.add('+')
break
case deletionOp:
text.add('-')
break
case equalityOp:
text.add(' ')
break
text.add(HttpUtility.UrlEncode(aDiff.text,
new UTF8Encoding()).Replace('+', ' ')).add("\n")
return diff_match_patch.unescapeForEncodeUriCompatability(
text.ToString())
/**
* Class containing the diff, match and patch methods.
* Also Contains the behaviour settings.
*/
public class diff_match_patch
# Defaults.
# Set these on your diff_match_patch instance to override the defaults.
# Number of seconds to map a diff before giving up (0 for infinity).
public float DiffTimeout = 1.0f
# Cost of an empty edit operation in terms of edit characters.
public short Diff_EditCost = 4
# At what point is no match declared (0.0 = perfection, 1.0 = very loose).
public float MatchThreshold = 0.5f
# How far to search for a match (0 = exact location, 1000+ = broad match).
# A match this many characters away from the expected location will add
# 1.0 to the score (0.0 is a perfect match).
public int MatchDistance = 1000
# When deleting a large block of text (over ~64 characters), how close
# do the contents have to be to match the expected contents. (0.0 =
# perfection, 1.0 = very loose). Note that MatchThreshold controls
# how closely the end points of a delete need to match.
public float PatchDeleteThreshold = 0.5f
# Chunk size for context length.
public short Patch_Margin = 4
# The number of bits in an int.
private short Match_MaxBits = 32
# DIFF FUNCTIONS
##
## Find the differences between two texts.
## Run a faster, slightly less optimal diff.
## This method allows the 'checkLines' of diff_main() to be optional.
## Most of the time checkLines is wanted, so default to true.
## @param textOne Old string to be diffed.
## @param textTwo New string to be diffed.
## @return List of Diff objects.
##
public seq[Diff] diff_main(string textOne, string textTwo)
return diff_main(textOne, textTwo, true)
##
## Find the differences between two texts.
## @param textOne Old string to be diffed.
## @param textTwo New string to be diffed.
## @param checkLines Speedup flag. If false, then don't run a
## line-level diff first to identify the changed areas.
## If true, then run a faster slightly less optimal diff.
## @return List of Diff objects.
##
public seq[Diff] diff_main(string textOne, string textTwo, bool checkLines)
# Set a deadline by which time the diff must be complete.
DateTime deadline
if (this.DiffTimeout <= 0)
deadline = DateTime.MaxValue
else
deadline = epochTime +
new TimeSpan(((long)(DiffTimeout * 1000)) * 10000)
return diff_main(textOne, textTwo, checkLines, deadline)
##
## Find the differences between two texts. Simplifies the problem by
## stripping any common prefix or suffix off the texts before diffing.
## @param textOne Old string to be diffed.
## @param textTwo New string to be diffed.
## @param checkLines Speedup flag. If false, then don't run a
## line-level diff first to identify the changed areas.
## If true, then run a faster slightly less optimal diff.
## @param deadline Time when the diff should be complete by. Used
## internally for recursive calls. Users should set DiffTimeout
## instead.
## @return List of Diff objects.
##
private seq[Diff] diff_main(string textOne, string textTwo, bool checkLines,
DateTime deadline)
# Check for null inputs not needed since null can't be passed in C#.
# Check for equality (speedup).
seq[Diff] diffs
if (textOne == textTwo)
diffs = newSeq[Diff]()
if (textOne.len != 0)
diffs.add(newDiff(equalityOp, textOne))
return diffs
# Trim off common prefix (speedup).
int commonLength = diff_commonPrefix(textOne, textTwo)
string commonprefix = textOne[0 .. commonLength]
textOne = textOne[commonLength .. len(textOne)]
textTwo = textTwo[commonLength .. len(textTwo)]
# Trim off common suffix (speedup).
commonLength = diff_commonSuffix(textOne, textTwo)
string commonsuffix = textOne.Substring(textOne.len - commonLength)
textOne = textOne.Substring(0, textOne.len - commonLength)
textTwo = textTwo.Substring(0, textTwo.len - commonLength)
# Compute the diff on the middle block.
diffs = diff_compute(textOne, textTwo, checkLines, deadline)
# Restore the prefix and suffix.
if (commonprefix.len != 0)
diffs.Insert(0, (newDiff(equalityOp, commonprefix)))
if (commonsuffix.len != 0)
diffs.add(newDiff(equalityOp, commonsuffix))
diff_cleanupMerge(diffs)
return diffs
##
## Find the differences between two texts. Assumes that the texts do not
## have any common prefix or suffix.
## @param textOne Old string to be diffed.
## @param textTwo New string to be diffed.
## @param checkLines Speedup flag. If false, then don't run a
## line-level diff first to identify the changed areas.
## If true, then run a faster slightly less optimal diff.
## @param deadline Time when the diff should be complete by.
## @return List of Diff objects.
##
private seq[Diff] diff_compute(string textOne, string textTwo,
bool checkLines, DateTime deadline)
seq[Diff] diffs = newSeq[Diff]()
if (textOne.len == 0)
# Just add some text (speedup).
diffs.add(newDiff(insertionOp, textTwo))
return diffs
if (textTwo.len == 0)
# Just delete some text (speedup).
diffs.add(newDiff(deletionOp, textOne))
return diffs
string longText = textOne.len > textTwo.len ? textOne : textTwo
string shortText = textOne.len > textTwo.len ? textTwo : textOne
int i = longText.find(shortText, StringComparison.Ordinal)
if (i != -1)
# Shorter text is inside the longer text (speedup).
Operation op = (textOne.len > textTwo.len) ?
deletionOp : insertionOp
diffs.add(newDiff(op, longText[0 .. i]))
diffs.add(newDiff(equalityOp, shortText))
diffs.add(newDiff(op, longText.Substring(i + shortText.len)))
return diffs
if (shortText.len == 1)
# Single character string.
# After the previous speedup, the character can't be an equality.
diffs.add(newDiff(deletionOp, textOne))
diffs.add(newDiff(insertionOp, textTwo))
return diffs
# Check to see if the problem can be split in two.
string[] hm = diff_halfMatch(textOne, textTwo)
if (hm != null)
# A half-match was found, sort out the return data.
string textOneA = hm[0]
string textOneB = hm[1]
string textTwoA = hm[2]
string textTwoB = hm[3]
string midCommon = hm[4]
# Send both pairs off for separate processing.
seq[Diff] diffsA = diff_main(textOneA, textTwoA, checkLines, deadline)
seq[Diff] diffsB = diff_main(textOneB, textTwoB, checkLines, deadline)
# Merge the results.
diffs = diffsA
diffs.add(newDiff(equalityOp, midCommon))
diffs.AddRange(diffsB)
return diffs
if (checkLines and textOne.len > 100 and textTwo.len > 100)
return diff_lineMode(textOne, textTwo, deadline)
return diff_bisect(textOne, textTwo, deadline)
##
## Do a quick line-level diff on both strings, then rediff the parts for
## greater accuracy.
## This speedup can produce non-minimal diffs.
## @param textOne Old string to be diffed.
## @param textTwo New string to be diffed.
## @param deadline Time when the diff should be complete by.
## @return List of Diff objects.
##
private seq[Diff] diff_lineMode(string textOne, string textTwo,
DateTime deadline)
# Scan the text on a line-by-line basis first.
Object[] b = diff_linesToChars(textOne, textTwo)
textOne = (string)b[0]
textTwo = (string)b[1]
seq[string] lineArray = (seq[string])b[2]
seq[Diff] diffs = diff_main(textOne, textTwo, false, deadline)
# Convert the diff back to original text.
diff_charsToLines(diffs, lineArray)
# Eliminate freak matches (e.g. blank lines)
diff_cleanupSemantic(diffs)
# Rediff any replacement blocks, this time character-by-character.
# Add a dummy entry at the end.
diffs.add(newDiff(equalityOp, ""))
int pointer = 0
int countDelete = 0
int countInsert = 0
string textDelete = ""
string textInsert = ""
while (pointer < diffs.len)
switch (diffs[pointer].operation)
case insertionOp:
countInsert += 1
textInsert += diffs[pointer].text
break
case deletionOp:
countDelete += 1
textDelete += diffs[pointer].text
break
case equalityOp:
# Upon reaching an equality, check for prior redundancies.
if (countDelete >= 1 and countInsert >= 1)
# Delete the offending records and add the merged ones.
diffs.RemoveRange(pointer - countDelete - countInsert,
countDelete + countInsert)
pointer = pointer - countDelete - countInsert
seq[Diff] a =
this.diff_main(textDelete, textInsert, false, deadline)
diffs.InsertRange(pointer, a)
pointer = pointer + a.len
countInsert = 0
countDelete = 0
textDelete = ""
textInsert = ""
break
pointer += 1
diffs.delete(diffs.len - 1); // Remove the dummy entry at the end.
return diffs
##
## Find the 'middle snake' of a diff, split the problem in two
## and return the recursively constructed diff.
## See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
## @param textOne Old string to be diffed.
## @param textTwo New string to be diffed.
## @param deadline Time at which to bail if not yet complete.
## @return List of Diff objects.
##
protected seq[Diff] diff_bisect(string textOne, string textTwo,
DateTime deadline)
# Cache the text lengths to prevent multiple calls.
int textOneLength = textOne.len
int textTwoLength = textTwo.len
int maxD = (textOneLength + textTwoLength + 1) / 2
int vOffset = maxD
int vLength = 2 * maxD
int[] vOne = new int[vLength]
int[] vTwo = new int[vLength]
for (int x = 0; x < vLength; x += 1)
vOne[x] = -1
vTwo[x] = -1
vOne[vOffset + 1] = 0
vTwo[vOffset + 1] = 0
int delta = textOneLength - textTwoLength
# If the total number of characters is odd, then the front path will
# collide with the reverse path.
bool front = (delta % 2 != 0)
# Offsets for start and end of k loop.
# Prevents mapping of space beyond the grid.
int kOneStart = 0
int kOneEnd = 0
int kTwoStart = 0
int kTwoEnd = 0
for (int d = 0; d < maxD; d += 1)
# Bail out if deadline is reached.
if (epochTime > deadline)
break
# Walk the front path one step.
for (int kOne = -d + kOneStart; kOne <= d - kOneEnd; kOne += 2)
int kOneOffset = vOffset + kOne
int xOne
if (kOne == -d or kOne != d and vOne[kOneOffset - 1] < vOne[kOneOffset + 1])
xOne = vOne[kOneOffset + 1]
else
xOne = vOne[kOneOffset - 1] + 1
int yOne = xOne - kOne
while (xOne < textOneLength and yOne < textTwoLength
and textOne[xOne] == textTwo[yOne])
xOne += 1
yOne += 1
vOne[kOneOffset] = xOne
if (xOne > textOneLength)
# Ran off the right of the graph.
kOneEnd += 2
elif (yOne > textTwoLength)
# Ran off the bottom of the graph.
kOneStart += 2
elif (front)
int kTwoOffset = vOffset + delta - kOne
if (kTwoOffset >= 0 and kTwoOffset < vLength and vTwo[kTwoOffset] != -1)
# Mirror xTwo onto top-left coordinate system.
int xTwo = textOneLength - vTwo[kTwoOffset]
if (xOne >= xTwo)
# Overlap detected.
return diff_bisectSplit(textOne, textTwo, xOne, yOne, deadline)
# Walk the reverse path one step.
for (int kTwo = -d + kTwoStart; kTwo <= d - kTwoEnd; kTwo += 2)
int kTwoOffset = vOffset + kTwo
int xTwo
if (kTwo == -d or kTwo != d and vTwo[kTwoOffset - 1] < vTwo[kTwoOffset + 1])
xTwo = vTwo[kTwoOffset + 1]
else
xTwo = vTwo[kTwoOffset - 1] + 1
int yTwo = xTwo - kTwo
while (xTwo < textOneLength and yTwo < textTwoLength
and textOne[textOneLength - xTwo - 1]
== textTwo[textTwoLength - yTwo - 1])
xTwo += 1
yTwo += 1
vTwo[kTwoOffset] = xTwo
if (xTwo > textOneLength)
# Ran off the left of the graph.
kTwoEnd += 2
elif (yTwo > textTwoLength)
# Ran off the top of the graph.
kTwoStart += 2
elif (!front)
int kOneOffset = vOffset + delta - kTwo
if (kOneOffset >= 0 and kOneOffset < vLength and vOne[kOneOffset] != -1)
int xOne = vOne[kOneOffset]
int yOne = vOffset + xOne - kOneOffset
# Mirror xTwo onto top-left coordinate system.
xTwo = textOneLength - vTwo[kTwoOffset]
if (xOne >= xTwo)
# Overlap detected.
return diff_bisectSplit(textOne, textTwo, xOne, yOne, deadline)
# Diff took too long and hit the deadline or
# number of diffs equals number of characters, no commonality at all.
seq[Diff] diffs = newSeq[Diff]()
diffs.add(newDiff(deletionOp, textOne))
diffs.add(newDiff(insertionOp, textTwo))
return diffs
##
## Given the location of the 'middle snake', split the diff in two parts
## and recurse.
## @param textOne Old string to be diffed.
## @param textTwo New string to be diffed.
## @param x Index of split point in textOne.
## @param y Index of split point in textTwo.
## @param deadline Time at which to bail if not yet complete.
## @return LinkedList of Diff objects.
##
private seq[Diff] diff_bisectSplit(string textOne, string textTwo,
int x, int y, DateTime deadline)
string textOneA = textOne[0 .. x]
string textTwoA = textTwo[0 .. y]
string textOneB = textOne[x .. len(textOne)]
string textTwoB = textTwo[y .. len(textTwo)]
# Compute both diffs serially.
seq[Diff] diffsA = diff_main(textOneA, textTwoA, false, deadline)
seq[Diff] diffsB = diff_main(textOneB, textTwoB, false, deadline)
diffsA.AddRange(diffsB)
return diffsA
##
## Split two texts into a list of strings. Reduce the texts to a string of
## hashes where each Unicode character represents one line.
## @param textOne First string.
## @param textTwo Second string.
## @return Three element Object array, containing the encoded textOne, the
## encoded textTwo and the List of unique strings. The zeroth element
## of the List of unique strings is intentionally blank.
##
protected Object[] diff_linesToChars(string textOne, string textTwo)
seq[string] lineArray = newSeq[string]()
Dictionary<string, int> lineHash = new Dictionary<string, int>()
# e.g. lineArray[4] == "Hello\n"
# e.g. linehash.get("Hello\n") == 4
# "\x00" is a valid character, but various debuggers don't like it.
# So we'll insert a junk entry to avoid generating a null character.
lineArray.add("")
string charsOne = diff_linesToCharsMunge(textOne, lineArray, lineHash)
string charsTwo = diff_linesToCharsMunge(textTwo, lineArray, lineHash)
return new Object[] charsOne, charsTwo, lineArray
##
## Split a text into a list of strings. Reduce the texts to a string of
## hashes where each Unicode character represents one line.
## @param text String to encode.
## @param lineArray List of unique strings.
## @param lineHash Map of strings to indices.
## @return Encoded string.
##
private string diff_linesToCharsMunge(string text, seq[string] lineArray,
Dictionary<string, int> lineHash)
int lineStart = 0
int lineEnd = -1
string line
rope chars = new rope()
# Walk the text, pulling out a Substring for each line.
# text.split('\n') would would temporarily double our memory footprint.
# Modifying text would create many large strings to garbage collect.
while (lineEnd < text.len - 1)
lineEnd = text.find('\n', lineStart)
if (lineEnd == -1)
lineEnd = text.len - 1
line = text.JavaSubstring(lineStart, lineEnd + 1)
lineStart = lineEnd + 1
if (lineHash.ContainsKey(line))
chars.add(((char)(int)lineHash[line]))
else
lineArray.add(line)
lineHash.add(line, lineArray.len - 1)
chars.add(((char)(lineArray.len - 1)))
return chars.ToString()
##
## Rehydrate the text in a diff from a string of line hashes to real lines
## of text.
## @param diffs List of Diff objects.
## @param lineArray List of unique strings.
##
protected void diff_charsToLines(ICollection<Diff> diffs,
seq[string] lineArray)
rope text
for (Diff diff in diffs)
text = new rope()
for (int y = 0; y < diff.text.len; y += 1)
text.add(lineArray[diff.text[y]])
diff.text = text.ToString()
##
## Determine the common prefix of two strings.
## @param textOne First string.
## @param textTwo Second string.
## @return The number of characters common to the start of each string.
##
public int diff_commonPrefix(string textOne, string textTwo)
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
int n = min(textOne.len, textTwo.len)
for (int i = 0; i < n; i += 1)
if (textOne[i] != textTwo[i])
return i
return n
##
## Determine the common suffix of two strings.
## @param textOne First string.
## @param textTwo Second string.
## @return The number of characters common to the end of each string.
##
public int diff_commonSuffix(string textOne, string textTwo)
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
int textOneLength = textOne.len
int textTwoLength = textTwo.len
int n = min(textOne.len, textTwo.len)
for (int i = 1; i <= n; i += 1)
if (textOne[textOneLength - i] != textTwo[textTwoLength - i])
return i - 1
return n
##
## Determine if the suffix of one string is the prefix of another.
## @param textOne First string.
## @param textTwo Second string.
## @return The number of characters common to the end of the first
## string and the start of the second string.
##
protected int diff_commonOverlap(string textOne, string textTwo)
# Cache the text lengths to prevent multiple calls.
int textOneLength = textOne.len
int textTwoLength = textTwo.len
# Eliminate the null case.
if (textOneLength == 0 or textTwoLength == 0)
return 0
# Truncate the longer string.
if (textOneLength > textTwoLength)
textOne = textOne[textOneLength - textTwoLength .. len(textOne)]
elif (textOneLength < textTwoLength)
textTwo = textTwo[0 .. textOneLength]
int textLength = min(textOneLength, textTwoLength)
# Quick check for the worst case.
if (textOne == textTwo)
return textLength
# Start by looking for a single character match
# and increase length until no match is found.
# Performance analysis: http://neil.fraser.name/news/2010/11/04/
int best = 0
int length = 1
while (true)
string pattern = textOne[textLength - length .. len(textOne)]
int found = textTwo.find(pattern, StringComparison.Ordinal)
if (found == -1)
return best
length += found
if (found == 0 or textOne[textLength - length .. len(textOne)] ==
textTwo[0 .. length])
best = length
length += 1
##
## Do the two texts share a Substring which is at least half the length of
## the longer text?
## This speedup can produce non-minimal diffs.
## @param textOne First string.
## @param textTwo Second string.
## @return Five element String array, containing the prefix of textOne, the
## suffix of textOne, the prefix of textTwo, the suffix of textTwo and the
## common middle. Or null if there was no match.
##
protected string[] diff_halfMatch(string textOne, string textTwo)
if (this.DiffTimeout <= 0)
# Don't risk returning a non-optimal diff if we have unlimited time.
return null
string longText = textOne.len > textTwo.len ? textOne : textTwo
string shortText = textOne.len > textTwo.len ? textTwo : textOne
if (longText.len < 4 or shortText.len * 2 < longText.len)
return null; // Pointless.
# First check if the second quarter is the seed for a half-match.
string[] hm1 = diff_halfMatchI(longText, shortText,
(longText.len + 3) / 4)
# Check again based on the third quarter.
string[] hm2 = diff_halfMatchI(longText, shortText,
(longText.len + 1) / 2)
string[] hm
if (hm1 == null and hm2 == null)
return null
elif (hm2 == null)
hm = hm1
elif (hm1 == null)
hm = hm2
else
# Both matched. Select the longest.
hm = hm1[4].len > hm2[4].len ? hm1 : hm2
# A half-match was found, sort out the return data.
if (textOne.len > textTwo.len)
return hm
#return new string[]hm[0], hm[1], hm[2], hm[3], hm[4]
else
return new string[] hm[2], hm[3], hm[0], hm[1], hm[4]
##
## Does a Substring of shortText exist within longText such that the
## Substring is at least half the length of longText?
## @param longText Longer string.
## @param shortText Shorter string.
## @param i Start index of quarter length Substring within longText.
## @return Five element string array, containing the prefix of longText, the
## suffix of longText, the prefix of shortText, the suffix of shortText
## and the common middle. Or null if there was no match.
##
private string[] diff_halfMatchI(string longText, string shortText, int i)
# Start with a 1/4 length Substring at position i as a seed.
string seed = longText.Substring(i, longText.len / 4)
int j = -1
string bestCommon = ""
string bestLongTextA = "", bestLongTextB = ""
string bestShortTextA = "", bestShortTextB = ""
while (j < shortText.len and (j = shortText.find(seed, j + 1,
StringComparison.Ordinal)) != -1)
int prefixLength = diff_commonPrefix(longText[i .. len(longText)],
shortText[j .. len(shortText)])
int suffixLength = diff_commonSuffix(longText[0 .. i],
shortText[0 .. j])
if (bestCommon.len < suffixLength + prefixLength)
bestCommon = shortText[j - suffixLength .. suffixLength]
+ shortText[j .. prefixLength]
bestLongTextA = longText[0 .. i - suffixLength]
bestLongTextB = longText.Substring(i + prefixLength)
bestShortTextA = shortText[0 .. j - suffixLength]
bestShortTextB = shortText.Substring(j + prefixLength)
if (bestCommon.len * 2 >= longText.len)
return new string[]bestLongTextA, bestLongTextB,
bestShortTextA, bestShortTextB, bestCommon
else
return null
##
## Reduce the number of edits by eliminating semantically trivial
## equalities.
## @param diffs List of Diff objects.
##
public void diff_cleanupSemantic(seq[Diff] diffs)
bool changes = false
# Stack of indices where equalities are found.
Stack<int> equalities = new Stack<int>()
# Always equal to equalities[equalitiesLength-1][1]
string lastEquality = null
int pointer = 0; // Index of current position.
# Number of characters that changed prior to the equality.
int lengthInsertionsOne = 0
int lengthDeletionsOne = 0
# Number of characters that changed after the equality.
int lengthInsertionsTwo = 0
int lengthDeletionsTwo = 0
while (pointer < diffs.len)
if (diffs[pointer].operation == equalityOp) // Equality found.
equalities.Push(pointer)
lengthInsertionsOne = lengthInsertionsTwo
lengthDeletionsOne = lengthDeletionsTwo
lengthInsertionsTwo = 0
lengthDeletionsTwo = 0
lastEquality = diffs[pointer].text
else // an insertion or deletion
if (diffs[pointer].operation == insertionOp)
lengthInsertionsTwo += diffs[pointer].text.len
else
lengthDeletionsTwo += diffs[pointer].text.len
# Eliminate an equality that is smaller or equal to the edits on both
# sides of it.
if (lastEquality != null and (lastEquality.len
<= max(lengthInsertionsOne, lengthDeletionsOne))
and (lastEquality.len
<= max(lengthInsertionsTwo, lengthDeletionsTwo)))
# Duplicate record.
diffs.Insert(equalities.Peek(),
newDiff(deletionOp, lastEquality))
# Change second copy to insert.
diffs[equalities.Peek() + 1].operation = insertionOp
# Throw away the equality we just deleted.
equalities.Pop()
if (equalities.len > 0)
equalities.Pop()
pointer = equalities.len > 0 ? equalities.Peek() : -1
lengthInsertionsOne = 0; // Reset the counters.
lengthDeletionsOne = 0
lengthInsertionsTwo = 0
lengthDeletionsTwo = 0
lastEquality = null
changes = true
pointer += 1
# Normalize the diff.
if (changes)
diff_cleanupMerge(diffs)
diff_cleanupSemanticLossless(diffs)
# Find any overlaps between deletions and insertions.
# e.g: <del>abcxxx</del><ins>xxxdef</ins>
# -> <del>abc</del>xxx<ins>def</ins>
# e.g: <del>xxxabc</del><ins>defxxx</ins>
# -> <ins>def</ins>xxx<del>abc</del>
# Only extract an overlap if it is as big as the edit ahead or behind it.
pointer = 1
while (pointer < diffs.len)
if (diffs[pointer - 1].operation == deletionOp and
diffs[pointer].operation == insertionOp)
string deletion = diffs[pointer - 1].text
string insertion = diffs[pointer].text
int overlapLengthOne = diff_commonOverlap(deletion, insertion)
int overlapLengthTwo = diff_commonOverlap(insertion, deletion)
if (overlapLengthOne >= overlapLengthTwo)
if (overlapLengthOne >= deletion.len / 2.0 or
overlapLengthOne >= insertion.len / 2.0)
# Overlap found.
# Insert an equality and trim the surrounding edits.
diffs.Insert(pointer, newDiff(equalityOp,
insertion[0 .. overlapLengthOne]))
diffs[pointer - 1].text =
deletion.Substring(0, deletion.len - overlapLengthOne)
diffs[pointer + 1].text = insertion[overlapLengthOne .. len(insertion)]
pointer += 1
else
if (overlapLengthTwo >= deletion.len / 2.0 or
overlapLengthTwo >= insertion.len / 2.0)
# Reverse overlap found.
# Insert an equality and swap and trim the surrounding edits.
diffs.Insert(pointer, newDiff(equalityOp,
deletion[0 .. overlapLengthTwo]))
diffs[pointer - 1].operation = insertionOp
diffs[pointer - 1].text =
insertion.Substring(0, insertion.len - overlapLengthTwo)
diffs[pointer + 1].operation = deletionOp
diffs[pointer + 1].text = deletion[overlapLengthTwo .. len(deletion)]
pointer += 1
pointer += 1
pointer += 1
##
## Look for single edits surrounded on both sides by equalities
## which can be shifted sideways to align the edit to a word boundary.
## e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
## @param diffs List of Diff objects.
##
public void diff_cleanupSemanticLossless(seq[Diff] diffs)
int pointer = 1
# Intentionally ignore the first and last element (don't need checking).
while (pointer < diffs.len - 1)
if (diffs[pointer - 1].operation == equalityOp and
diffs[pointer + 1].operation == equalityOp)
# This is a single edit surrounded by equalities.
string equalityOne = diffs[pointer - 1].text
string edit = diffs[pointer].text
string equalityTwo = diffs[pointer + 1].text
# First, shift the edit as far left as possible.
int commonOffset = this.diff_commonSuffix(equalityOne, edit)
if (commonOffset > 0)
string commonString = edit.Substring(edit.len - commonOffset)
equalityOne = equalityOne.Substring(0, equalityOne.len - commonOffset)
edit = commonString + edit.Substring(0, edit.len - commonOffset)
equalityTwo = commonString + equalityTwo
# Second, step character by character right,
# looking for the best fit.
string bestEqualityOne = equalityOne
string bestEdit = edit
string bestEqualityTwo = equalityTwo
int bestScore = diff_cleanupSemanticScore(equalityOne, edit) +
diff_cleanupSemanticScore(edit, equalityTwo)
while (edit.len != 0 and equalityTwo.len != 0
and edit[0] == equalityTwo[0])
equalityOne += edit[0]
edit = edit[1 .. len(edit)] + equalityTwo[0]
equalityTwo = equalityTwo[1 .. len(equalityTwo)]
int score = diff_cleanupSemanticScore(equalityOne, edit) +
diff_cleanupSemanticScore(edit, equalityTwo)
# The >= encourages trailing rather than leading whitespace on
# edits.
if (score >= bestScore)
bestScore = score
bestEqualityOne = equalityOne
bestEdit = edit
bestEqualityTwo = equalityTwo
if (diffs[pointer - 1].text != bestEqualityOne)
# We have an improvement, save it back to the diff.
if (bestEqualityOne.len != 0)
diffs[pointer - 1].text = bestEqualityOne
else
diffs.delete(pointer - 1)
pointer--
diffs[pointer].text = bestEdit
if (bestEqualityTwo.len != 0)
diffs[pointer + 1].text = bestEqualityTwo
else
diffs.delete(pointer + 1)
pointer--
pointer += 1
##
## Given two strings, comAdde a score representing whether the internal
## boundary falls on logical boundaries.
## Scores range from 6 (best) to 0 (worst).
## @param one First string.
## @param two Second string.
## @return The score.
##
private int diff_cleanupSemanticScore(string one, string two)
if (one.len == 0 or two.len == 0)
# Edges are the best.
return 6
# Each port of this function behaves slightly differently due to
# subtle differences in each language's definition of things like
# 'whitespace'. Since this function's purpose is largely cosmetic,
# the choice has been made to use each language's native features
# rather than force total conformity.
char charTwo = one[one.len - 1]
char charTwo = two[0]
bool nonAlphaNumericOne = !Char.IsLetterOrDigit(charTwo)
bool nonAlphaNumericTwo = !Char.IsLetterOrDigit(charTwo)
bool whitespaceOne = nonAlphaNumericOne and Char.IsWhiteSpace(charTwo)
bool whitespaceTwo = nonAlphaNumericTwo and Char.IsWhiteSpace(charTwo)
bool lineBreakOne = whitespaceOne and Char.IsControl(charTwo)
bool lineBreakTwo = whitespaceTwo and Char.IsControl(charTwo)
bool blankLineOne = lineBreakOne and BLANKLINEEND.IsMatch(one)
bool blankLineTwo = lineBreakTwo and BLANKLINESTART.IsMatch(two)
if (blankLineOne or blankLineTwo)
# Five points for blank lines.
return 5
elif (lineBreakOne or lineBreakTwo)
# Four points for line breaks.
return 4
elif (nonAlphaNumericOne and !whitespaceOne and whitespaceTwo)
# Three points for end of sentences.
return 3
elif (whitespaceOne or whitespaceTwo)
# Two points for whitespace.
return 2
elif (nonAlphaNumericOne or nonAlphaNumericTwo)
# One point for non-alphanumeric.
return 1
return 0
# Define some regex patterns for matching boundaries.
private Regex BLANKLINEEND = new Regex("\\n\\r?\\n\\Z")
private Regex BLANKLINESTART = new Regex("\\A\\r?\\n\\r?\\n")
##
## Reduce the number of edits by eliminating operationally trivial
## equalities.
## @param diffs List of Diff objects.
##
public void diff_cleanupEfficiency(seq[Diff] diffs)
bool changes = false
# Stack of indices where equalities are found.
Stack<int> equalities = new Stack<int>()
# Always equal to equalities[equalitiesLength-1][1]
string lastEquality = ""
int pointer = 0; // Index of current position.
# Is there an insertion operation before the last equality.
bool preIns = false
# Is there a deletion operation before the last equality.
bool preDel = false
# Is there an insertion operation after the last equality.
bool postIns = false
# Is there a deletion operation after the last equality.
bool postDel = false
while (pointer < diffs.len)
if (diffs[pointer].operation == equalityOp) // Equality found.
if (diffs[pointer].text.len < this.Diff_EditCost
and (postIns or postDel))
# Candidate found.
equalities.Push(pointer)
preIns = postIns
preDel = postDel
lastEquality = diffs[pointer].text
else
# Not a candidate, and can never become one.
equalities.Clear()
lastEquality = ""
postIns = postDel = false
else // An insertion or deletion.
if (diffs[pointer].operation == deletionOp)
postDel = true
else
postIns = true
/*
## Five types to be split:
## <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
## <ins>A</ins>X<ins>C</ins><del>D</del>
## <ins>A</ins><del>B</del>X<ins>C</ins>
## <ins>A</del>X<ins>C</ins><del>D</del>
## <ins>A</ins><del>B</del>X<del>C</del>
##
if ((lastEquality.len != 0)
and ((preIns and preDel and postIns and postDel)
or ((lastEquality.len < this.Diff_EditCost / 2)
and ((preIns ? 1 : 0) + (preDel ? 1 : 0) + (postIns ? 1 : 0)
+ (postDel ? 1 : 0)) == 3)))
# Duplicate record.
diffs.Insert(equalities.Peek(),
newDiff(deletionOp, lastEquality))
# Change second copy to insert.
diffs[equalities.Peek() + 1].operation = insertionOp
equalities.Pop(); // Throw away the equality we just deleted.
lastEquality = ""
if (preIns and preDel)
# No changes made which could affect previous entry, keep going.
postIns = postDel = true
equalities.Clear()
else
if (equalities.len > 0)
equalities.Pop()
pointer = equalities.len > 0 ? equalities.Peek() : -1
postIns = postDel = false
changes = true
pointer += 1
if (changes)
diff_cleanupMerge(diffs)
##
## Reorder and merge like edit sections. Merge equalities.
## Any edit section can move as long as it doesn't cross an equality.
## @param diffs List of Diff objects.
##
public void diff_cleanupMerge(seq[Diff] diffs)
# Add a dummy entry at the end.
diffs.add(newDiff(equalityOp, ""))
int pointer = 0
int countDelete = 0
int countInsert = 0
string textDelete = ""
string textInsert = ""
int commonLength
while (pointer < diffs.len)
switch (diffs[pointer].operation)
case insertionOp:
countInsert += 1
textInsert += diffs[pointer].text
pointer += 1
break
case deletionOp:
countDelete += 1
textDelete += diffs[pointer].text
pointer += 1
break
case equalityOp:
# Upon reaching an equality, check for prior redundancies.
if (countDelete + countInsert > 1)
if (countDelete != 0 and countInsert != 0)
# Factor out any common prefixies.
commonLength = this.diff_commonPrefix(textInsert, textDelete)
if (commonLength != 0)
if ((pointer - countDelete - countInsert) > 0 and
diffs[pointer - countDelete - countInsert - 1].operation
== equalityOp)
diffs[pointer - countDelete - countInsert - 1].text
+= textInsert[0 .. commonLength]
else
diffs.Insert(0, newDiff(equalityOp,
textInsert[0 .. commonLength]))
pointer += 1
textInsert = textInsert[commonLength .. len(textInsert)]
textDelete = textDelete[commonLength .. len(textDelete)]
# Factor out any common suffixies.
commonLength = this.diff_commonSuffix(textInsert, textDelete)
if (commonLength != 0)
diffs[pointer].text = textInsert.Substring(textInsert.len
- commonLength) + diffs[pointer].text
textInsert = textInsert.Substring(0, textInsert.len
- commonLength)
textDelete = textDelete.Substring(0, textDelete.len
- commonLength)
# Delete the offending records and add the merged ones.
if (countDelete == 0)
diffs.Splice(pointer - countInsert,
countDelete + countInsert,
newDiff(insertionOp, textInsert))
elif (countInsert == 0)
diffs.Splice(pointer - countDelete,
countDelete + countInsert,
newDiff(deletionOp, textDelete))
else
diffs.Splice(pointer - countDelete - countInsert,
countDelete + countInsert,
newDiff(deletionOp, textDelete),
newDiff(insertionOp, textInsert))
pointer = pointer - countDelete - countInsert +
(countDelete != 0 ? 1 : 0) + (countInsert != 0 ? 1 : 0) + 1
elif (pointer != 0
and diffs[pointer - 1].operation == equalityOp)
# Merge this equality with the previous one.
diffs[pointer - 1].text += diffs[pointer].text
diffs.delete(pointer)
else
pointer += 1
countInsert = 0
countDelete = 0
textDelete = ""
textInsert = ""
break
if (diffs[diffs.len - 1].text.len == 0)
diffs.delete(diffs.len - 1); // Remove the dummy entry at the end.
# Second pass: look for single edits surrounded on both sides by
# equalities which can be shifted sideways to eliminate an equality.
# e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
bool changes = false
pointer = 1
# Intentionally ignore the first and last element (don't need checking).
while (pointer < (diffs.len - 1))
if (diffs[pointer - 1].operation == equalityOp and
diffs[pointer + 1].operation == equalityOp)
# This is a single edit surrounded by equalities.
if (diffs[pointer].text.EndsWith(diffs[pointer - 1].text,
StringComparison.Ordinal))
# Shift the edit over the previous equality.
diffs[pointer].text = diffs[pointer - 1].text +
diffs[pointer].text.Substring(0, diffs[pointer].text.len -
diffs[pointer - 1].text.len)
diffs[pointer + 1].text = diffs[pointer - 1].text
+ diffs[pointer + 1].text
diffs.Splice(pointer - 1, 1)
changes = true
elif (diffs[pointer].text.StartsWith(diffs[pointer + 1].text,
StringComparison.Ordinal))
# Shift the edit over the next equality.
diffs[pointer - 1].text += diffs[pointer + 1].text
diffs[pointer].text =
diffs[pointer].text.Substring(diffs[pointer + 1].text.len)
+ diffs[pointer + 1].text
diffs.Splice(pointer + 1, 1)
changes = true
pointer += 1
# If shifts were made, the diff needs reordering and another shift sweep.
if (changes)
this.diff_cleanupMerge(diffs)
##
## loc is a location in textOne, comAdde and return the equivalent location in
## textTwo.
## e.g. "The cat" vs "The big cat", 1->1, 5->8
## @param diffs List of Diff objects.
## @param loc Location within textOne.
## @return Location within textTwo.
##
public int diff_xIndex(seq[Diff] diffs, int loc)
int charsOne = 0
int charsTwo = 0
int lastCharsOne = 0
int lastCharsTwo = 0
Diff lastDiff = null
for (Diff aDiff in diffs)
if (aDiff.operation != insertionOp)
# Equality or deletion.
charsOne += aDiff.text.len
if (aDiff.operation != deletionOp)
# Equality or insertion.
charsTwo += aDiff.text.len
if (charsOne > loc)
# Overshot the location.
lastDiff = aDiff
break
lastCharsOne = charsOne
lastCharsTwo = charsTwo
if (lastDiff != null and lastDiff.operation == deletionOp)
# The location was deleted.
return lastCharsTwo
# Add the remaining character length.
return lastCharsTwo + (loc - lastCharsOne)
##
## Convert a Diff list into a pretty HTML report.
## @param diffs List of Diff objects.
## @return HTML representation.
##
public string diff_prettyHtml(seq[Diff] diffs)
rope html = new rope()
for (Diff aDiff in diffs)
string text = aDiff.text.Replace("&", "&amp;").Replace("<", "&lt;")
.Replace(">", "&gt;").Replace("\n", "&para;<br>")
switch (aDiff.operation)
case insertionOp:
html.add("<ins style=\"background:#e6ffe6;\">").add(text)
.add("</ins>")
break
case deletionOp:
html.add("<del style=\"background:#ffe6e6;\">").add(text)
.add("</del>")
break
case equalityOp:
html.add("<span>").add(text).add("</span>")
break
return html.ToString()
##
## Compute and return the source text (all equalities and deletions).
## @param diffs List of Diff objects.
## @return Source text.
##
public string diffTextOne(seq[Diff] diffs)
rope text = new rope()
for (Diff aDiff in diffs)
if (aDiff.operation != insertionOp)
text.add(aDiff.text)
return text.ToString()
##
## Compute and return the destination text (all equalities and insertions).
## @param diffs List of Diff objects.
## @return Destination text.
##
public string diffTextTwo(seq[Diff] diffs)
rope text = new rope()
for (Diff aDiff in diffs)
if (aDiff.operation != deletionOp)
text.add(aDiff.text)
return text.ToString()
##
## Compute the Levenshtein distance; the number of inserted, deleted or
## substituted characters.
## @param diffs List of Diff objects.
## @return Number of changes.
##
public int diff_levenshtein(seq[Diff] diffs)
int levenshtein = 0
int insertions = 0
int deletions = 0
for (Diff aDiff in diffs)
switch (aDiff.operation)
case insertionOp:
insertions += aDiff.text.len
break
case deletionOp:
deletions += aDiff.text.len
break
case equalityOp:
# A deletion and an insertion is one substitution.
levenshtein += max(insertions, deletions)
insertions = 0
deletions = 0
break
levenshtein += max(insertions, deletions)
return levenshtein
##
## Crush the diff into an encoded string which describes the operations
## required to transform textOne into textTwo.
## E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
## Operations are tab-separated. Inserted text is escaped using %xx
## notation.
## @param diffs Array of Diff objects.
## @return Delta text.
##
public string diffToDelta(seq[Diff] diffs)
rope text = new rope()
for (Diff aDiff in diffs)
switch (aDiff.operation)
case insertionOp:
text.add("+").add(HttpUtility.UrlEncode(aDiff.text,
new UTF8Encoding()).Replace('+', ' ')).add("\t")
break
case deletionOp:
text.add("-").add(aDiff.text.len).add("\t")
break
case equalityOp:
text.add("=").add(aDiff.text.len).add("\t")
break
string delta = text.ToString()
if (delta.len != 0)
# Strip off trailing tab character.
delta = delta.Substring(0, delta.len - 1)
delta = unescapeForEncodeUriCompatability(delta)
return delta
##
## Given the original textOne, and an encoded string which describes the
## operations required to transform textOne into textTwo, comAdde the full diff.
## @param textOne Source string for the diff.
## @param delta Delta text.
## @return Array of Diff objects or null if invalid.
## @throws ArgumentException If invalid input.
##
public seq[Diff] diff_fromDelta(string textOne, string delta)
seq[Diff] diffs = newSeq[Diff]()
int pointer = 0; // Cursor in textOne
string[] tokens = delta.Split(new string[] "\t" ,
StringSplitOptions.None)
for (string token in tokens)
if (token.len == 0)
# Blank tokens are ok (from a trailing \t).
continue
# Each token begins with a one character parameter which specifies the
# operation of this token (delete, insert, equality).
string param = token[1 .. len(token)]
switch (token[0])
case '+':
# decode would change all "+" to " "
param = param.Replace("+", "%2b")
param = HttpUtility.UrlDecode(param, new UTF8Encoding(false, true))
# catch (UnsupportedEncodingException e)
# // Not likely on modern system.
# throw new Error("This system does not support UTF-8.", e)
# catch (IllegalArgumentException e)
# // Malformed URI sequence.
# throw new IllegalArgumentException(
# "Illegal escape in diff_fromDelta: " + param, e)
#
diffs.add(newDiff(insertionOp, param))
break
case '-':
# Fall through.
case '=':
int n
try
n = Convert.ToInt32(param)
catch (FormatException e)
throw new ArgumentException(
"Invalid number in diff_fromDelta: " + param, e)
if (n < 0)
throw new ArgumentException(
"Negative number in diff_fromDelta: " + param)
string text
try
text = textOne[pointer .. n]
pointer += n
catch (ArgumentOutOfRangeException e)
throw new ArgumentException("Delta length (" + pointer
+ ") larger than source text length (" + textOne.len
+ ").", e)
if (token[0] == '=')
diffs.add(newDiff(equalityOp, text))
else
diffs.add(newDiff(deletionOp, text))
break
default:
# Anything else is an error.
throw new ArgumentException(
"Invalid diff operation in diff_fromDelta: " + token[0])
if (pointer != textOne.len)
throw new ArgumentException("Delta length (" + pointer
+ ") smaller than source text length (" + textOne.len + ").")
return diffs
# MATCH FUNCTIONS
##
## Locate the best instance of 'pattern' in 'text' near 'loc'.
## Returns -1 if no match found.
## @param text The text to search.
## @param pattern The pattern to search for.
## @param loc The location to search around.
## @return Best match index or -1.
##
public int match_main(string text, string pattern, int loc)
# Check for null inputs not needed since null can't be passed in C#.
loc = max(0, min(loc, text.len))
if (text == pattern)
# Shortcut (potentially not guaranteed by the algorithm)
return 0
elif (text.len == 0)
# Nothing to match.
return -1
elif (loc + pattern.len <= text.len
and text.Substring(loc, pattern.len) == pattern)
# Perfect match at the perfect spot! (Includes case of null pattern)
return loc
else
# Do a fuzzy compare.
return match_bitap(text, pattern, loc)
##
## Locate the best instance of 'pattern' in 'text' near 'loc' using the
## Bitap algorithm. Returns -1 if no match found.
## @param text The text to search.
## @param pattern The pattern to search for.
## @param loc The location to search around.
## @return Best match index or -1.
##
protected int match_bitap(string text, string pattern, int loc)
# assert (Match_MaxBits == 0 or pattern.len <= Match_MaxBits)
# : "Pattern too long for this application."
# Initialise the alphabet.
Dictionary<char, int> s = match_alphabet(pattern)
# Highest score beyond which we give up.
double score_threshold = MatchThreshold
# Is there a nearby exact match? (speedup)
int bestLoc = text.find(pattern, loc, StringComparison.Ordinal)
if (bestLoc != -1)
score_threshold = min(match_bitapScore(0, bestLoc, loc,
pattern), score_threshold)
# What about in the other direction? (speedup)
bestLoc = text.rfind(pattern,
min(loc + pattern.len, text.len),
StringComparison.Ordinal)
if (bestLoc != -1)
score_threshold = min(match_bitapScore(0, bestLoc, loc,
pattern), score_threshold)
# Initialise the bit arrays.
int matchmask = 1 << (pattern.len - 1)
bestLoc = -1
int binMin, binMid
int binMax = pattern.len + text.len
# Empty initialization added to appease C# compiler.
int[] last_rd = new int[0]
for (int d = 0; d < pattern.len; d += 1)
# Scan for the best match; each iteration allows for one more error.
# Run a binary search to determine how far from 'loc' we can stray at
# this error level.
binMin = 0
binMid = binMax
while (binMin < binMid)
if (match_bitapScore(d, loc + binMid, loc, pattern)
<= score_threshold)
binMin = binMid
else
binMax = binMid
binMid = (binMax - binMin) / 2 + binMin
# Use the result from this iteration as the maximum for the next.
binMax = binMid
int start = max(1, loc - binMid + 1)
int finish = min(loc + binMid, text.len) + pattern.len
int[] rd = new int[finish + 2]
rd[finish + 1] = (1 << d) - 1
for (int j = finish; j >= start; j--)
int charMatch
if (text.len <= j - 1 or !s.ContainsKey(text[j - 1]))
# Out of range.
charMatch = 0
else
charMatch = s[text[j - 1]]
if (d == 0)
# First pass: exact match.
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
else
# Subsequent passes: fuzzy match.
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
| (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
if ((rd[j] & matchmask) != 0)
double score = match_bitapScore(d, j - 1, loc, pattern)
# This match will almost certainly be better than any existing
# match. But check anyway.
if (score <= score_threshold)
# Told you so.
score_threshold = score
bestLoc = j - 1
if (bestLoc > loc)
# When passing loc, don't exceed our current distance from loc.
start = max(1, 2 * loc - bestLoc)
else
# Already passed loc, downhill from here on in.
break
if (match_bitapScore(d + 1, loc, loc, pattern) > score_threshold)
# No hope for a (better) match at greater error levels.
break
last_rd = rd
return bestLoc
##
## Compute and return the score for a match with e errors and x location.
## @param e Number of errors in match.
## @param x Location of match.
## @param loc Expected location of match.
## @param pattern Pattern being sought.
## @return Overall score for match (0.0 = good, 1.0 = bad).
##
private double match_bitapScore(int e, int x, int loc, string pattern)
float accuracy = (float)e / pattern.len
int proximity = abs(loc - x)
if (MatchDistance == 0)
# Dodge divide by zero error.
return proximity == 0 ? accuracy : 1.0
return accuracy + (proximity / (float) MatchDistance)
##
## Initialise the alphabet for the Bitap algorithm.
## @param pattern The text to encode.
## @return Hash of character locations.
##
protected Dictionary<char, int> match_alphabet(string pattern)
Dictionary<char, int> s = new Dictionary<char, int>()
char[] charPattern = pattern.ToCharArray()
for (char c in charPattern)
if (!s.ContainsKey(c))
s.add(c, 0)
int i = 0
for (char c in charPattern)
int value = s[c] | (1 << (pattern.len - i - 1))
s[c] = value
i += 1
return s
# PATCH FUNCTIONS
##
## Increase the context until it is unique,
## but don't let the pattern expand beyond Match_MaxBits.
## @param patch The patch to grow.
## @param text Source text.
##
protected void patch_addContext(Patch patch, string text)
if (text.len == 0)
return
string pattern = text.Substring(patch.startTwo, patch.lenOne)
int padding = 0
# Look for the first and last matches of pattern in text. If two
# different matches are found, increase the pattern length.
while (text.find(pattern, StringComparison.Ordinal)
!= text.rfind(pattern, StringComparison.Ordinal)
and pattern.len < Match_MaxBits - Patch_Margin - Patch_Margin)
padding += Patch_Margin
pattern = text.JavaSubstring(max(0, patch.startTwo - padding),
min(text.len, patch.startTwo + patch.lenOne + padding))
# Add one chunk for good luck.
padding += Patch_Margin
# Add the prefix.
string prefix = text.JavaSubstring(max(0, patch.startTwo - padding),
patch.startTwo)
if (prefix.len != 0)
patch.diffs.Insert(0, newDiff(equalityOp, prefix))
# Add the suffix.
string suffix = text.JavaSubstring(patch.startTwo + patch.lenOne,
min(text.len, patch.startTwo + patch.lenOne + padding))
if (suffix.len != 0)
patch.diffs.add(newDiff(equalityOp, suffix))
# Roll back the start points.
patch.startOne -= prefix.len
patch.startTwo -= prefix.len
# Extend the lengths.
patch.lenOne += prefix.len + suffix.len
patch.lenTwo += prefix.len + suffix.len
##
## Compute a list of patches to turn textOne into textTwo.
## A set of diffs will be computed.
## @param textOne Old text.
## @param textTwo New text.
## @return List of Patch objects.
##
public seq[Patch] patch_make(string textOne, string textTwo)
# Check for null inputs not needed since null can't be passed in C#.
# No diffs provided, comAdde our own.
seq[Diff] diffs = diff_main(textOne, textTwo, true)
if (diffs.len > 2)
diff_cleanupSemantic(diffs)
diff_cleanupEfficiency(diffs)
return patch_make(textOne, diffs)
##
## Compute a list of patches to turn textOne into textTwo.
## textOne will be derived from the provided diffs.
## @param diffs Array of Diff objects for textOne to textTwo.
## @return List of Patch objects.
##
public seq[Patch] patch_make(seq[Diff] diffs)
# Check for null inputs not needed since null can't be passed in C#.
# No origin string provided, comAdde our own.
string textOne = diffTextOne(diffs)
return patch_make(textOne, diffs)
##
## Compute a list of patches to turn textOne into textTwo.
## textTwo is ignored, diffs are the delta between textOne and textTwo.
## @param textOne Old text
## @param textTwo Ignored.
## @param diffs Array of Diff objects for textOne to textTwo.
## @return List of Patch objects.
## @deprecated Prefer patch_make(string textOne, seq[Diff] diffs).
##
public seq[Patch] patch_make(string textOne, string textTwo,
seq[Diff] diffs)
return patch_make(textOne, diffs)
##
## Compute a list of patches to turn textOne into textTwo.
## textTwo is not provided, diffs are the delta between textOne and textTwo.
## @param textOne Old text.
## @param diffs Array of Diff objects for textOne to textTwo.
## @return List of Patch objects.
##
public seq[Patch] patch_make(string textOne, seq[Diff] diffs)
# Check for null inputs not needed since null can't be passed in C#.
seq[Patch] patches = newSeq[Patch]()
if (diffs.len == 0)
return patches; // Get rid of the null case.
Patch patch = new Patch()
int charCountOne = 0; // Number of characters into the textOne string.
int charCountTwo = 0; // Number of characters into the textTwo string.
# Start with textOne (prepatchText) and apply the diffs until we arrive at
# textTwo (postpatchText). We recreate the patches one by one to determine
# context info.
string prepatchText = textOne
string postpatchText = textOne
for (Diff aDiff in diffs)
if (patch.diffs.len == 0 and aDiff.operation != equalityOp)
# A new patch starts here.
patch.startOne = charCountOne
patch.startTwo = charCountTwo
switch (aDiff.operation)
case insertionOp:
patch.diffs.add(aDiff)
patch.lenTwo += aDiff.text.len
postpatchText = postpatchText.Insert(charCountTwo, aDiff.text)
break
case deletionOp:
patch.lenOne += aDiff.text.len
patch.diffs.add(aDiff)
postpatchText = postpatchText.Remove(charCountTwo,
aDiff.text.len)
break
case equalityOp:
if (aDiff.text.len <= 2 * Patch_Margin
and patch.diffs.len() != 0 and aDiff != diffs.Last())
# Small equality inside a patch.
patch.diffs.add(aDiff)
patch.lenOne += aDiff.text.len
patch.lenTwo += aDiff.text.len
if (aDiff.text.len >= 2 * Patch_Margin)
# Time for a new patch.
if (patch.diffs.len != 0)
patch_addContext(patch, prepatchText)
patches.add(patch)
patch = new Patch()
# Unlike Unidiff, our patch lists have a rolling context.
# http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
# Update prepatch text & pos to reflect the application of the
# just completed patch.
prepatchText = postpatchText
charCountOne = charCountTwo
break
# Update the current character count.
if (aDiff.operation != insertionOp)
charCountOne += aDiff.text.len
if (aDiff.operation != deletionOp)
charCountTwo += aDiff.text.len
# Pick up the leftover patch if not empty.
if (patch.diffs.len != 0)
patch_addContext(patch, prepatchText)
patches.add(patch)
return patches
##
## Given an array of patches, return another array that is identical.
## @param patches Array of Patch objects.
## @return Array of Patch objects.
##
public seq[Patch] patchDeepCopy(seq[Patch] patches)
seq[Patch] patchesCopy = newSeq[Patch]()
for (Patch aPatch in patches)
Patch patchCopy = new Patch()
for (Diff aDiff in aPatch.diffs)
Diff diffCopy = newDiff(aDiff.operation, aDiff.text)
patchCopy.diffs.add(diffCopy)
patchCopy.startOne = aPatch.startOne
patchCopy.startTwo = aPatch.startTwo
patchCopy.lenOne = aPatch.lenOne
patchCopy.lenTwo = aPatch.lenTwo
patchesCopy.add(patchCopy)
return patchesCopy
##
## Merge a set of patches onto the text. Return a patched text, as well
## as an array of true/false values indicating which patches were applied.
## @param patches Array of Patch objects
## @param text Old text.
## @return Two element Object array, containing the new text and an array of
## bool values.
##
public Object[] patch_apply(seq[Patch] patches, string text)
if (patches.len == 0)
return new Object[] text, new bool[0]
# Deep copy the patches so that no changes are made to originals.
patches = patchDeepCopy(patches)
string nullPadding = this.patch_addPadding(patches)
text = nullPadding + text + nullPadding
patchSplitMax(patches)
int x = 0
# delta keeps track of the offset between the expected and actual
# location of the previous patch. If there are patches expected at
# positions 10 and 20, but the first patch was found at 12, delta is 2
# and the second patch has an effective expected position of 22.
int delta = 0
bool[] results = new bool[patches.len]
for (Patch aPatch in patches)
int expectedLoc = aPatch.startTwo + delta
string textOne = diffTextOne(aPatch.diffs)
int startLoc
int endLoc = -1
if (textOne.len > this.Match_MaxBits)
# patchSplitMax will only provide an oversized pattern
# in the case of a monster delete.
startLoc = match_main(text,
textOne.Substring(0, this.Match_MaxBits), expectedLoc)
if (startLoc != -1)
endLoc = match_main(text,
textOne.Substring(textOne.len - this.Match_MaxBits),
expectedLoc + textOne.len - this.Match_MaxBits)
if (endLoc == -1 or startLoc >= endLoc)
# Can't find valid trailing context. Drop this patch.
startLoc = -1
else
startLoc = this.match_main(text, textOne, expectedLoc)
if (startLoc == -1)
# No match found. :(
results[x] = false
# Subtract the delta for this failed patch from subsequent patches.
delta -= aPatch.lenTwo - aPatch.lenOne
else
# Found a match. :)
results[x] = true
delta = startLoc - expectedLoc
string textTwo
if (endLoc == -1)
textTwo = text.JavaSubstring(startLoc,
min(startLoc + textOne.len, text.len))
else
textTwo = text.JavaSubstring(startLoc,
min(endLoc + this.Match_MaxBits, text.len))
if (textOne == textTwo)
# Perfect match, just shove the Replacement text in.
text = text[0 .. startLoc] + diffTextTwo(aPatch.diffs)
+ text.Substring(startLoc + textOne.len)
else
# Imperfect match. Run a diff to get a framework of equivalent
# indices.
seq[Diff] diffs = diff_main(textOne, textTwo, false)
if (textOne.len > this.Match_MaxBits
and this.diff_levenshtein(diffs) / (float) textOne.len
> this.PatchDeleteThreshold)
# The end points match, but the content is unacceptably bad.
results[x] = false
else
diff_cleanupSemanticLossless(diffs)
int indexOne = 0
for (Diff aDiff in aPatch.diffs)
if (aDiff.operation != equalityOp)
int indexTwo = diff_xIndex(diffs, indexOne)
if (aDiff.operation == insertionOp)
# Insertion
text = text.Insert(startLoc + indexTwo, aDiff.text)
elif (aDiff.operation == deletionOp)
# Deletion
text = text.Remove(startLoc + indexTwo, diff_xIndex(diffs,
indexOne + aDiff.text.len) - indexTwo)
if (aDiff.operation != deletionOp)
indexOne += aDiff.text.len
x += 1
# Strip the padding off.
text = text.Substring(nullPadding.len, text.len
- 2 * nullPadding.len)
return new Object[] text, results
##
## Add some padding on text start and end so that edges can match something.
## Intended to be called only from within patch_apply.
## @param patches Array of Patch objects.
## @return The padding string added to each side.
##
public string patch_addPadding(seq[Patch] patches)
short paddingLength = this.Patch_Margin
string nullPadding = ""
for (short x = 1; x <= paddingLength; x += 1)
nullPadding += (char)x
# Bump all the patches forward.
for (Patch aPatch in patches)
aPatch.startOne += paddingLength
aPatch.startTwo += paddingLength
# Add some padding on start of first diff.
Patch patch = patches.First()
seq[Diff] diffs = patch.diffs
if (diffs.len == 0 or diffs.First().operation != equalityOp)
# Add nullPadding equality.
diffs.Insert(0, newDiff(equalityOp, nullPadding))
patch.startOne -= paddingLength; // Should be 0.
patch.startTwo -= paddingLength; // Should be 0.
patch.lenOne += paddingLength
patch.lenTwo += paddingLength
elif (paddingLength > diffs.First().text.len)
# Grow first equality.
Diff firstDiff = diffs.First()
int extraLength = paddingLength - firstDiff.text.len
firstDiff.text = nullPadding.Substring(firstDiff.text.len)
+ firstDiff.text
patch.startOne -= extraLength
patch.startTwo -= extraLength
patch.lenOne += extraLength
patch.lenTwo += extraLength
# Add some padding on end of last diff.
patch = patches.Last()
diffs = patch.diffs
if (diffs.len == 0 or diffs.Last().operation != equalityOp)
# Add nullPadding equality.
diffs.add(newDiff(equalityOp, nullPadding))
patch.lenOne += paddingLength
patch.lenTwo += paddingLength
elif (paddingLength > diffs.Last().text.len)
# Grow last equality.
Diff lastDiff = diffs.Last()
int extraLength = paddingLength - lastDiff.text.len
lastDiff.text += nullPadding[0 .. extraLength]
patch.lenOne += extraLength
patch.lenTwo += extraLength
return nullPadding
##
## Look through the patches and break up any which are longer than the
## maximum limit of the match algorithm.
## Intended to be called only from within patch_apply.
## @param patches List of Patch objects.
##
public void patchSplitMax(seq[Patch] patches)
short patchSize = this.Match_MaxBits
for (int x = 0; x < patches.len; x += 1)
if (patches[x].lenOne <= patchSize)
continue
Patch bigPatch = patches[x]
# Remove the big old patch.
patches.Splice(x--, 1)
int startOne = bigPatch.startOne
int startTwo = bigPatch.startTwo
string preContext = ""
while (bigPatch.diffs.len != 0)
# Create one of several smaller patches.
Patch patch = new Patch()
bool empty = true
patch.startOne = startOne - preContext.len
patch.startTwo = startTwo - preContext.len
if (preContext.len != 0)
patch.lenOne = patch.lenTwo = preContext.len
patch.diffs.add(newDiff(equalityOp, preContext))
while (bigPatch.diffs.len != 0
and patch.lenOne < patchSize - this.Patch_Margin)
Operation diffType = bigPatch.diffs[0].operation
string diffText = bigPatch.diffs[0].text
if (diffType == insertionOp)
# Insertions are harmless.
patch.lenTwo += diffText.len
startTwo += diffText.len
patch.diffs.add(bigPatch.diffs.First())
bigPatch.diffs.delete(0)
empty = false
elif (diffType == deletionOp and patch.diffs.len == 1
and patch.diffs.First().operation == equalityOp
and diffText.len > 2 * patchSize)
# This is a large deletion. Let it pass in one chunk.
patch.lenOne += diffText.len
startOne += diffText.len
empty = false
patch.diffs.add(newDiff(diffType, diffText))
bigPatch.diffs.delete(0)
else
# Deletion or equality. Only take as much as we can stomach.
diffText = diffText.Substring(0, min(diffText.len,
patchSize - patch.lenOne - Patch_Margin))
patch.lenOne += diffText.len
startOne += diffText.len
if (diffType == equalityOp)
patch.lenTwo += diffText.len
startTwo += diffText.len
else
empty = false
patch.diffs.add(newDiff(diffType, diffText))
if (diffText == bigPatch.diffs[0].text)
bigPatch.diffs.delete(0)
else
bigPatch.diffs[0].text =
bigPatch.diffs[0].text.Substring(diffText.len)
# Compute the head context for the next patch.
preContext = this.diffTextTwo(patch.diffs)
preContext = preContext.Substring(max(0,
preContext.len - this.Patch_Margin))
string postContext = null
# add the end context for this patch.
if (diffTextOne(bigPatch.diffs).len > Patch_Margin)
postContext = diffTextOne(bigPatch.diffs)
.Substring(0, Patch_Margin)
else
postContext = diffTextOne(bigPatch.diffs)
if (postContext.len != 0)
patch.lenOne += postContext.len
patch.lenTwo += postContext.len
if (patch.diffs.len != 0
and patch.diffs[patch.diffs.len - 1].operation
== equalityOp)
patch.diffs[patch.diffs.len - 1].text += postContext
else
patch.diffs.add(newDiff(equalityOp, postContext))
if (!empty)
patches.Splice(++x, 0, patch)
##
## Take a list of patches and return a textual representation.
## @param patches List of Patch objects.
## @return Text representation of patches.
##
public string patchToText(seq[Patch] patches)
rope text = new rope()
for (Patch aPatch in patches)
text.add(aPatch)
return text.ToString()
##
## Parse a textual representation of patches and return a List of Patch
## objects.
## @param textline Text representation of patches.
## @return List of Patch objects.
## @throws ArgumentException If invalid input.
##
public seq[Patch] patch_fromText(string textline)
seq[Patch] patches = newSeq[Patch]()
if (textline.len == 0)
return patches
string[] text = textline.Split('\n')
int textPointer = 0
Patch patch
Regex patchHeader
= new Regex("^@@ -(\\d+),?(\\d*) \\+(\\d+),?(\\d*) @@$")
Match m
char sign
string line
while (textPointer < text.len)
m = patchHeader.Match(text[textPointer])
if (!m.Success)
throw new ArgumentException("Invalid patch string: "
+ text[textPointer])
patch = new Patch()
patches.add(patch)
patch.startOne = Convert.ToInt32(m.Groups[1].Value)
if (m.Groups[2].len == 0)
patch.startOne--
patch.lenOne = 1
elif (m.Groups[2].Value == "0")
patch.lenOne = 0
else
patch.startOne--
patch.lenOne = Convert.ToInt32(m.Groups[2].Value)
patch.startTwo = Convert.ToInt32(m.Groups[3].Value)
if (m.Groups[4].len == 0)
patch.startTwo--
patch.lenTwo = 1
elif (m.Groups[4].Value == "0")
patch.lenTwo = 0
else
patch.startTwo--
patch.lenTwo = Convert.ToInt32(m.Groups[4].Value)
textPointer += 1
while (textPointer < text.len)
try
sign = text[textPointer][0]
catch (IndexOutOfRangeException)
# Blank line? Whatever.
textPointer += 1
continue
line = text[textPointer][1 .. len()]
line = line.Replace("+", "%2b")
line = HttpUtility.UrlDecode(line, new UTF8Encoding(false, true))
if (sign == '-')
# Deletion.
patch.diffs.add(newDiff(deletionOp, line))
elif (sign == '+')
# Insertion.
patch.diffs.add(newDiff(insertionOp, line))
elif (sign == ' ')
# Minor equality.
patch.diffs.add(newDiff(equalityOp, line))
elif (sign == '@')
# Start of next patch.
break
else
# WTF?
throw new ArgumentException(
"Invalid patch mode '" + sign + "' in: " + line)
textPointer += 1
return patches
##
## Unescape selected chars for compatability with JavaScript's encodeURI.
## In speed critical applications this could be dropped since the
## receiving application will certainly decode these fine.
## Note that this function is case-sensitive. Thus "%3F" would not be
## unescaped. But this is ok because it is only called with the output of
## HttpUtility.UrlEncode which returns lowercase hex.
##
## Example: "%3f" -> "?", "%24" -> "$", etc.
##
## @param str The string to escape.
## @return The escaped string.
##
public static string unescapeForEncodeUriCompatability(string str)
return str.Replace("%21", "!").Replace("%7e", "~")
.Replace("%27", "'").Replace("%28", "(").Replace("%29", ")")
.Replace("%3b", ";").Replace("%2f", "/").Replace("%3f", "?")
.Replace("%3a", ":").Replace("%40", "@").Replace("%26", "&")
.Replace("%3d", "=").Replace("%2b", "+").Replace("%24", "$")
.Replace("%2c", ",").Replace("%23", "#")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment