Created
May 8, 2020 01:02
-
-
Save icedraco/8417d28604a41b948af6cf2de9ff9726 to your computer and use it in GitHub Desktop.
ByteCompare: A means to find different blocks of bytes inside two otherwise very similar files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <exception> | |
#include <fstream> | |
#include <iostream> | |
#include <string> | |
#include <vector> | |
#include "DiffBlock.hpp" | |
using namespace std; | |
const static int buffer_size = 8192; | |
class BinaryFileNotGood : public exception { | |
public: | |
const string& path; | |
BinaryFileNotGood(const string& name) : path(name) {} | |
~BinaryFileNotGood() {} | |
const char* what() const throw () { | |
return (string("cannot open ") + path).c_str(); | |
} | |
}; | |
class BinaryFile { | |
private: | |
ifstream _ifs; | |
public: | |
const string& path; | |
const ios::pos_type size; | |
BinaryFile(const string& path) : | |
_ifs(path, ios::binary | ios::ate), | |
path(path), | |
size(_ifs.tellg()) | |
{ | |
if (!_ifs.good()) { | |
throw BinaryFileNotGood(path); | |
} | |
rewind(); | |
} | |
BinaryFile(const BinaryFile& other) = delete; | |
~BinaryFile() { | |
_ifs.close(); | |
} | |
friend ostream& operator<<(ostream& os, const BinaryFile& bf) { | |
os << bf.path; | |
return os; | |
} | |
inline void rewind() { | |
_ifs.seekg(0); | |
} | |
vector<DiffBlock> compare(BinaryFile& other); | |
private: | |
inline ifstream& stream() { | |
return _ifs; | |
} | |
}; | |
vector<DiffBlock> | |
BinaryFile::compare(BinaryFile& other) { | |
vector<DiffBlock> blocks; | |
auto it_this = istreambuf_iterator<char>(stream()); | |
auto it_other = istreambuf_iterator<char>(other.stream()); | |
auto eof = istreambuf_iterator<char>(); | |
// get a ref to the shortest iterator between the two | |
istreambuf_iterator<char>& min_iter(size < other.size ? it_this : it_other); | |
size_t offset_counter = 0; | |
size_t largest_block_size = 0; | |
while (min_iter != eof) { | |
// find the first differing byte | |
while (min_iter != eof && *it_this == *it_other) { | |
++it_this; | |
++it_other; | |
++offset_counter; | |
} | |
// if we found one (i.e., we're not at end of file) | |
if (min_iter != eof) { | |
size_t offset = offset_counter; | |
size_t size = 0; | |
// find the next matching byte | |
while (min_iter != eof && *it_this != *it_other) { | |
++it_this; | |
++it_other; | |
++size; | |
} | |
offset_counter += size; | |
blocks.push_back(DiffBlock { offset, size }); | |
if (size > largest_block_size) { | |
largest_block_size = size; | |
} | |
} | |
} | |
return blocks; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
struct DiffBlock { | |
size_t offset; | |
size_t size; | |
// Returns the offset after this block | |
size_t next() const { return offset + size; } | |
// Returns a DiffBlock that consists of the merger between this block, | |
// the next block, and all that's in between | |
DiffBlock operator+(const DiffBlock& other) const { | |
return DiffBlock { offset, other.next() - offset }; | |
} | |
// Returns a distance (in bytes) between this block and the next. | |
// A distance is amount of bytes between the last byte of this block and | |
// the first byte of the next block. | |
size_t distance(const DiffBlock& next) const { | |
return next.offset - this->next(); | |
} | |
}; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <iostream> | |
#include <fstream> | |
#include <vector> | |
#include "BinaryFile.hpp" | |
#include "DiffBlock.hpp" | |
using namespace std; | |
const static size_t max_gap = 8; // for block consolidation | |
static void print_blocks(const vector<DiffBlock>& blocks) { | |
cout << "--- Differing Blocks --------------------------" << endl; | |
for (auto it = blocks.cbegin(); it != blocks.cend(); ++it) { | |
cout << " > 0x" << hex << it->offset << ": " << it->size << " bytes" << endl; | |
} | |
} | |
static void consolidate(vector<DiffBlock>& blocks, size_t max_distance) { | |
for (size_t i = 0; i + 1 < blocks.size();) { | |
DiffBlock& current = blocks[i]; | |
// merge next blocks into this one as long as they're close | |
while (i + 1 < blocks.size()) { | |
DiffBlock& next = blocks[i+1]; | |
if (current.distance(next) <= max_distance) { | |
current.size = next.next() - current.offset; | |
blocks.erase(blocks.begin() + i + 1); | |
} else { | |
i++; | |
break; | |
} | |
} | |
} | |
} | |
static int compare(string& left, string& right) { | |
BinaryFile left_file(left); | |
BinaryFile right_file(right); | |
vector<DiffBlock> diff_blocks = left_file.compare(right_file); | |
consolidate(diff_blocks, max_gap); | |
print_blocks(diff_blocks); | |
return 0; | |
} | |
static void print_syntax(char **argv) { | |
cout << "Syntax: " << argv[0] << " <left_file> <right_file>" << endl; | |
} | |
int main(int argc, char **argv) { | |
if (argc < 3) { | |
print_syntax(argv); | |
return 1; | |
} | |
string left(argv[1]); | |
string right(argv[2]); | |
return compare(left, right); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment