Created
December 7, 2021 19:33
-
-
Save memononen/2c83d183c2749e5f4a493ce7ddb73f4d to your computer and use it in GitHub Desktop.
3-way merge based on O(NP) Myers diff and diff3, merging per item
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <vector> | |
#include <span> | |
#include <algorithm> | |
// Based on | |
// "An O(NP) Sequence Comparison Algorithm" by Sun Wu, Udi Manber and Gene Myers | |
// - https://publications.mpi-cbg.de/Wu_1990_6334.pdf | |
// - Good article visualizing Myer's older algorithm: https://epxx.co/artigos/diff_en.html | |
// | |
// "A Formal Investigation of Diff3" | |
// - https://www.cis.upenn.edu/~bcpierce/papers/diff3-short.pdf | |
enum class EditType { Insert, Delete, Substitute, Match }; | |
// Describes how one sequence is changed to another. | |
struct Edit { | |
Edit() = default; | |
Edit(EditType _type, int _base, int _change) : type(_type), base(_base), change(_change) {} | |
EditType type = EditType::Match; // type of edit | |
int base = INT32_MAX; // index in string a | |
int change = INT32_MAX; // index in string b | |
}; | |
struct Diag { | |
Diag() = default; | |
Diag(int _x, int _y, int _len, int _next) : x(_x), y(_y), length(_len), next(_next) {} | |
int x = 0; // start position | |
int y = 0; | |
int length = 0; // diagonal length | |
int next = -1; // next diagonal towards the start | |
}; | |
struct Point { | |
Point() = default; | |
Point(int _y, int _score, int _diag) : y(_y), score(_score), diag(_diag) {} | |
int y = 0; // furthest y | |
int score = 0; | |
int diag = -1; // nearest diagonal | |
}; | |
enum class MergeType { Conflict, Base, Left, Right }; | |
// Describes a change, type defines from which sequence should be used for merged. | |
struct Merge { | |
Merge() = default; | |
Merge(MergeType _type, EditType _edit, const int _base, const int _left, const int _right) | |
: type(_type), edit(_edit), base(_base), left(_left), right(_right) {} | |
MergeType type = MergeType::Base; | |
EditType edit = EditType::Match; | |
int base = 0; | |
int left = 0; | |
int right = 0; | |
}; | |
static void snake(const int k, std::span<const char> left, std::span<const char> right, | |
std::span<Point> fp, const int fp0, std::vector<Diag>& diags) | |
{ | |
constexpr int insRemScore = 1; | |
constexpr int subsScore = 4; // Making substitute cheaper than match favors longer diagonals and reduces small edits (chaffs). | |
constexpr int matchScore = 2; | |
const Point& belowPt = fp[fp0 + k-1]; | |
const Point& rightPt = fp[fp0 + k+1]; | |
const Point& prevPt = fp[fp0 + k]; | |
int prevDiag = prevPt.diag; | |
int y = prevPt.y+1; | |
int score = prevPt.score + subsScore; | |
if ((rightPt.score + insRemScore) > score) | |
{ | |
prevDiag = rightPt.diag; | |
y = rightPt.y; | |
score = rightPt.score + insRemScore; | |
} | |
if ((belowPt.score + insRemScore) > score) | |
{ | |
prevDiag = belowPt.diag; | |
y = belowPt.y+1; | |
score = belowPt.score + insRemScore; | |
} | |
int x = y - k; | |
int length = 0; | |
const int N = left.size(); | |
const int M = right.size(); | |
while (x < N && y < M && left[x] == right[y]) | |
{ | |
x++; y++; length++; score += matchScore; | |
} | |
if (length > 0) | |
{ | |
diags.emplace_back(x - length, y - length, length, prevDiag); | |
fp[fp0 + k] = Point(y, score, diags.size() - 1); | |
} | |
else | |
{ | |
fp[fp0 + k] = Point(y, score, prevDiag); | |
} | |
} | |
// Returns shortest(ish) edit script between left and right. | |
std::vector<Edit> diff(std::span<const char> left, std::span<const char> right) | |
{ | |
bool reverse = false; | |
if (left.size() > right.size()) | |
{ | |
std::swap(left, right); | |
reverse = true; | |
} | |
const int N = left.size(); | |
const int M = right.size(); | |
std::vector<Point> fp; | |
std::vector<Diag> diags; | |
const int delta = M - N; | |
const int fp0 = N + 1; // zero offset for furthest point indexing, indexing can go negative. | |
fp.resize((N+1) + (M+1) + 1); | |
// All paths will lead to empty diagonal at zero. | |
diags.push_back(Diag(0, 0, 0, -1)); | |
std::fill(fp.begin(), fp.end(), Point(-1,0,0)); | |
// Calculate common diagonal sequences | |
for (int p = 0; fp[fp0 + delta].y != M; p++) | |
{ | |
for (int k = -p; k <= delta-1; k++) | |
snake(k, left, right, fp, fp0, diags); | |
for (int k = delta+p; k >= delta+1; k--) | |
snake(k, left, right, fp, fp0, diags); | |
snake(delta, left, right, fp, fp0, diags); | |
} | |
// Backtrace shortest edit script | |
std::vector<Edit> diff; | |
Diag prevDiag(N, M, 0, -1); | |
for (int i = fp[fp0 + delta].diag; i != -1; i = diags[i].next) | |
{ | |
const Diag& diag = diags[i]; | |
// The path between the diagonals is handled with substitutes, inserts, and deletes. | |
int numDel = prevDiag.x - (diag.x + diag.length); | |
int numIns = prevDiag.y - (diag.y + diag.length); | |
int numSubs = std::min(numDel, numIns); | |
numDel -= numSubs; | |
numIns -= numSubs; | |
int numMatch = diag.length; | |
int x = prevDiag.x, y = prevDiag.y; | |
if (reverse) | |
{ | |
std::swap(numDel, numIns); | |
std::swap(x, y); | |
} | |
// Store edit for each item in sequence, walking backwards. | |
for (int j = 0; j < numIns; j++) | |
{ | |
y--; | |
diff.emplace_back(EditType::Insert, x, y); | |
} | |
for (int j = 0; j < numDel; j++) | |
{ | |
x--; | |
diff.emplace_back(EditType::Delete, x, y); | |
} | |
for (int j = 0; j < numSubs; j++) | |
{ | |
x--; y--; | |
diff.emplace_back(EditType::Substitute, x, y); | |
} | |
for (int j = 0; j < numMatch; j++) | |
{ | |
x--; y--; | |
diff.emplace_back(EditType::Match, x, y); | |
} | |
prevDiag = diag; | |
} | |
// Backtrace left the sequence in reverse, flip it. | |
std::reverse(diff.begin(), diff.end()); | |
return diff; | |
} | |
// Returns true if index is valid for specific diff. | |
static bool isValidIndex(const int index, std::span<const Edit> diff) | |
{ | |
return index < (int)diff.size(); | |
} | |
// Returns index in base seq, or "infinity" if the index is out of bounds (i.e. iterator has finished). | |
static int getBase(const int index, std::span<const Edit> diff) | |
{ | |
return index < (int)diff.size() ? diff[index].base : INT32_MAX; | |
} | |
// Merges diffs into operations that can be used to alter each of the sequences to get the combined result. | |
std::vector<Merge> merge3(std::span<const char> base, std::span<const char> left, std::span<const char> right, | |
std::span<const Edit> leftDiff, std::span<const Edit> rightDiff) | |
{ | |
std::vector<Merge> res; | |
int indexLeft = 0; | |
int indexRight = 0; | |
int lastLeftChange = 0; | |
int lastRightChange = 0; | |
while (isValidIndex(indexLeft, leftDiff) || isValidIndex(indexRight, rightDiff)) | |
{ | |
const int leftBasePos = getBase(indexLeft, leftDiff); | |
const int rightBasePos = getBase(indexRight, rightDiff); | |
const int basePos = std::min(leftBasePos, rightBasePos); | |
Edit leftEdit(EditType::Match, basePos, lastLeftChange); | |
if (isValidIndex(indexLeft, leftDiff) && leftDiff[indexLeft].base == basePos) | |
{ | |
leftEdit = leftDiff[indexLeft]; | |
indexLeft++; | |
} | |
Edit rightEdit(EditType::Match, basePos, lastRightChange); | |
if (isValidIndex(indexRight, rightDiff) && rightDiff[indexRight].base == basePos) | |
{ | |
rightEdit = rightDiff[indexRight]; | |
indexRight++; | |
} | |
// Merge thruth table. There likely is a simple logic that would handle it without the convoluted ifs. | |
// | |
// L / R | ins del subs match | |
// ------+----------------------------- | |
// ins | X SL LR IL | |
// del | SR D IR DR | |
// subs | RL IL X SL | |
// match | IR DL SR MB | |
if (leftEdit.type == EditType::Insert && rightEdit.type == EditType::Insert) | |
{ | |
// X - Conflict | |
res.emplace_back(MergeType::Conflict, EditType::Insert, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if (leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Substitute) | |
{ | |
// X - Conflict | |
res.emplace_back(MergeType::Conflict, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if ((leftEdit.type == EditType::Insert && rightEdit.type == EditType::Delete) || | |
(leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Match)) | |
{ | |
// SL - substitute left | |
res.emplace_back(MergeType::Left, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if ((leftEdit.type == EditType::Delete && rightEdit.type == EditType::Insert) || | |
(leftEdit.type == EditType::Match && rightEdit.type == EditType::Substitute)) | |
{ | |
// SR - substitute right | |
res.emplace_back(MergeType::Right, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if ((leftEdit.type == EditType::Insert && rightEdit.type == EditType::Match) || | |
(leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Delete)) | |
{ | |
// IL - insert left | |
res.emplace_back(MergeType::Left, EditType::Insert, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if ((leftEdit.type == EditType::Match && rightEdit.type == EditType::Insert) || | |
(leftEdit.type == EditType::Delete && rightEdit.type == EditType::Substitute)) | |
{ | |
// IR - insert right | |
res.emplace_back(MergeType::Right, EditType::Insert, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if (leftEdit.type == EditType::Match && rightEdit.type == EditType::Match) | |
{ | |
// MB - match base | |
res.emplace_back(MergeType::Base, EditType::Match, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if (leftEdit.type == EditType::Match && rightEdit.type == EditType::Delete) | |
{ | |
// DL - delete left | |
res.emplace_back(MergeType::Left, EditType::Delete, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if (leftEdit.type == EditType::Delete && rightEdit.type == EditType::Match) | |
{ | |
// DR - delete right | |
res.emplace_back(MergeType::Right, EditType::Delete, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if (leftEdit.type == EditType::Insert && rightEdit.type == EditType::Substitute) | |
{ | |
// LR - left and right | |
res.emplace_back(MergeType::Left, EditType::Insert, basePos, leftEdit.change, rightEdit.change); | |
res.emplace_back(MergeType::Right, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if (leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Insert) | |
{ | |
// RL - right and left | |
res.emplace_back(MergeType::Right, EditType::Insert, basePos, leftEdit.change, rightEdit.change); | |
res.emplace_back(MergeType::Left, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); | |
} | |
else if (leftEdit.type == EditType::Delete && rightEdit.type == EditType::Delete) | |
{ | |
// D - delete | |
res.emplace_back(MergeType::Left, EditType::Delete, basePos, leftEdit.change, rightEdit.change); | |
res.emplace_back(MergeType::Right, EditType::Delete, basePos, leftEdit.change, rightEdit.change); | |
} | |
lastLeftChange = leftEdit.change; | |
lastRightChange = rightEdit.change; | |
} | |
return res; | |
} | |
// Merge changes to right in-situ | |
void resolveIntoRight(std::span<Merge> merge, std::span<char> base, std::span<char> left, std::vector<char>& right) | |
{ | |
int offset = 0; | |
for (const Merge& m : merge) | |
{ | |
if (m.type == MergeType::Conflict) | |
{ | |
// Conflicting insert or replaces, arbitrarily keep right (could be any) | |
} | |
else if (m.type == MergeType::Left && m.edit == EditType::Substitute) | |
{ | |
// Substitute from left | |
right[m.right + offset] = left[m.left]; | |
} | |
else if (m.type == MergeType::Left && m.edit == EditType::Insert) | |
{ | |
// Insert from left | |
right.insert(right.begin() + m.right + offset, left[m.left]); | |
offset++; | |
} | |
else if (m.type == MergeType::Right && m.edit == EditType::Delete) | |
{ | |
// Right delete. | |
right.erase(right.begin() + m.right + offset); | |
offset--; | |
} | |
else if (m.type == MergeType::Right && (m.edit == EditType::Substitute || m.edit == EditType::Insert)) | |
{ | |
// Right, keep. | |
} | |
else if (m.type == MergeType::Base && m.edit == EditType::Match) | |
{ | |
// Match with base, keep | |
} | |
} | |
} | |
void printDiff(std::span<char> left, std::span<char> right, std::span<Edit> diff) | |
{ | |
/* | |
for (const Edit& ed : diff) { | |
switch (ed.type) | |
{ | |
case EditType::Match: | |
printf("\u001b[0m"); | |
printf("%c %c", left[ed.base], right[ed.change]); | |
printf("\u001b[0m %d, %d\n", ed.base, ed.change); | |
break; | |
case EditType::Substitute: | |
printf("\u001b[46m%c %c", left[ed.base], right[ed.change]); | |
printf("\u001b[0m %d, %d\n", ed.base, ed.change); | |
break; | |
case EditType::Insert: | |
printf(" \u001b[42m%c", right[ed.change]); | |
printf("\u001b[0m %d, %d\n", ed.base, ed.change); | |
break; | |
case EditType::Delete: | |
printf("\u001b[0m%c", left[ed.base]); | |
printf("\u001b[0m %d, %d\n", ed.base, ed.change); | |
break; | |
} | |
} | |
printf("\u001b[0m\n");*/ | |
printf(" Base: "); | |
for (const Edit& ed : diff) { | |
switch (ed.type) | |
{ | |
case EditType::Match: | |
printf("\u001b[0m"); | |
printf("%c", left[ed.base]); | |
break; | |
case EditType::Substitute: | |
printf("\u001b[0m"); | |
printf("%c", left[ed.base]); | |
break; | |
case EditType::Insert: | |
printf("\u001b[0m"); | |
printf(" "); | |
break; | |
case EditType::Delete: | |
printf("\u001b[41m"); | |
printf("%c", left[ed.base]); | |
break; | |
default: | |
break; | |
} | |
} | |
printf("\u001b[0m\n"); | |
// Change | |
printf(" Change: "); | |
for (const Edit& ed : diff) { | |
switch (ed.type) | |
{ | |
case EditType::Match: | |
printf("\u001b[0m"); | |
printf("%c", right[ed.change]); | |
break; | |
case EditType::Substitute: | |
printf("\u001b[43m"); | |
printf("%c", right[ed.change]); | |
break; | |
case EditType::Insert: | |
printf("\u001b[42m"); | |
printf("%c", right[ed.change]); | |
break; | |
case EditType::Delete: | |
printf("\u001b[0m"); | |
printf(" "); | |
break; | |
default: | |
break; | |
} | |
} | |
printf("\u001b[0m\n"); | |
} | |
void printMerge(std::span<char> base, std::span<char> left, std::span<char> right, std::span<Merge> merge) | |
{ | |
/* char mergeMarker[] = "!BLR"; | |
char editMarker[] = "?+-x="; | |
printf("Diff\n"); | |
for (const Merge& m : merge) | |
{ | |
printf("%c%c %d, %d, %d %c %c %c\n", | |
mergeMarker[(int)m.type], editMarker[(int)m.edit], | |
m.base, m.left, m.right, | |
base[m.base], left[m.left], right[m.right]); | |
} | |
printf("-\n");*/ | |
printf(" "); | |
for (const Merge& m : merge) | |
{ | |
if (m.type == MergeType::Conflict) | |
{ | |
// Conflict, arbitrarily choose right | |
printf("\u001b[41m%c", right[m.right]); | |
} | |
else if (m.type == MergeType::Left && (m.edit == EditType::Substitute || m.edit == EditType::Insert)) | |
{ | |
// Left | |
printf("\u001b[44m%c", left[m.left]); | |
} | |
else if (m.type == MergeType::Right && (m.edit == EditType::Substitute || m.edit == EditType::Insert)) | |
{ | |
// Right | |
printf("\u001b[46m%c", right[m.right]); | |
} | |
else if (m.type == MergeType::Base && m.edit == EditType::Match) | |
{ | |
// Match | |
printf("\u001b[0m%c", base[m.base]); | |
} | |
} | |
printf("\u001b[0m\n"); | |
} | |
void printv(std::span<char> arr) | |
{ | |
printf(" "); | |
for (const char c : arr) | |
printf("%c", c); | |
printf("\n"); | |
} | |
std::vector<char> makev(const char* s) | |
{ | |
std::vector<char> str; | |
while (*s) | |
{ | |
str.push_back(*s); | |
s++; | |
} | |
return str; | |
} | |
int main() | |
{ | |
std::vector<char> base = makev("the quick fox jumps ovre some lazy dog"); | |
std::vector<char> left = makev("the abcck brown fox jumped ovre a dog"); | |
std::vector<char> right = makev("the qzick brown fox jumps over some record dog"); | |
std::vector<Edit> leftDiff = diff(base, left); | |
std::vector<Edit> rightDiff = diff(base, right); | |
std::vector<Merge> merged = merge3(base, left, right, leftDiff, rightDiff); | |
printf("\nDiff Base > \u001b[44mLeft\u001b[0m\n"); | |
printDiff(base, left, leftDiff); | |
printf("\nDiff Base > \u001b[46mRight\u001b[0m\n"); | |
printDiff(base, right, rightDiff); | |
printf("\nMerged\n"); | |
printMerge(base, left, right, merged); | |
resolveIntoRight(merged, base, left, right); | |
printv(right); | |
printf("\n"); | |
return 0; | |
} |
Author
memononen
commented
Dec 7, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment