Last active
December 4, 2023 18:10
-
-
Save BusinessDuck/6d3712d1dcf9658fff12cbe2fa150ce0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* eslint-disable no-console */ | |
const crypto = require('crypto'); | |
/** | |
* Modulus operation with a twist. It takes two parameters, a and b, and returns the remainder | |
* of the division of a by b. However, there's a condition to handle the case when a is negative. | |
*/ | |
function mod(a, b) { | |
return ((a % b) + b) % b; | |
} | |
/** | |
* Winnowing algorythm class implementation for document fingerprinting | |
*/ | |
class Winnowing { | |
constructor(k = 40, salt = "", windowSize = 4, removeDuplicates = true) { | |
this.k = k; | |
this.salt = salt; | |
this.windowSize = windowSize; | |
this.removeDuplicates = removeDuplicates; | |
} | |
/** | |
* Given a document, computes all k-gram hashes and uses the | |
* winnowing algorithm to reduce their number. Optionally takes a | |
* list of boilerplate hashes to remove from the winnowed list. | |
* Returns the selected hashes and their indexes in the original list | |
*/ | |
getHash(text) { | |
const [hashes, idx] = this._winnow(this._hashed_kgrams(text), this.windowSize); | |
return [hashes, idx]; | |
} | |
/** | |
* Main method for this class to create all data from text document | |
*/ | |
getFingerprint(text) { | |
const [hashes, idx] = this.getHash(text); | |
const token_len = text.length; | |
const token_coverage = this._getTokenCoverage(idx, this.k, token_len); | |
return [hashes, idx, token_coverage]; | |
} | |
/** | |
* Determines the number of tokens in the original document which are included in the winnowed indices | |
*/ | |
_getTokenCoverage(idx, k, token_len) { | |
const coverage = Array(token_len).fill(0); | |
for (let i = 0; i < idx.length; i++) { | |
for (let offset = 0; offset < k; offset++) { | |
if (idx[i] + offset < token_len) { | |
coverage[idx[i] + offset] = 1; | |
} | |
} | |
} | |
return coverage.reduce((acc, val) => acc + val, 0); | |
} | |
/** | |
* JavaScript implementation of the winnowing algorithm. | |
* Input and output are identical to the https://github.com/blingenf/copydetect | |
*/ | |
_winnowing(hashes, windowSize) { | |
const selectedIdx = []; | |
const buffer = Array(windowSize).fill(Infinity); | |
let r = 0; | |
let minIdx = 0; | |
for (let hashIdx = 0; hashIdx < hashes.length; hashIdx++) { | |
const hashVal = hashes[hashIdx]; | |
r = (r + 1) % windowSize; | |
buffer[r] = hashVal; | |
if (minIdx === r) { | |
let i = (r - 1 + windowSize) % windowSize; | |
while (i !== r) { | |
if (buffer[i] < buffer[minIdx]) { | |
minIdx = i; | |
} | |
i = (i - 1 + windowSize) % windowSize; | |
} | |
selectedIdx.push(hashIdx - (mod((r - minIdx), this.windowSize))); | |
} else if (buffer[r] < buffer[minIdx]) { | |
minIdx = r; | |
selectedIdx.push(hashIdx); | |
} | |
} | |
return selectedIdx; | |
} | |
/** | |
* implementation of the robust winnowing algorithm decribed | |
* in https://theory.stanford.edu/~aiken/publications/papers/sigmod03.pdf | |
* Returns a list of selected hashes and the indexes of those hashes. | |
*/ | |
_winnow(hashes, window_size) { | |
if (window_size < 1) { | |
throw new Error("window_size must be greater than 0"); | |
} | |
let selected_hashes; | |
let selected_idx; | |
if (window_size === 1) { | |
selected_hashes = hashes; | |
selected_idx = Array.from({ length: hashes.length }, (_, i) => i); | |
} else { | |
selected_idx = this._winnowing(hashes, window_size); | |
selected_hashes = selected_idx.map(idx => hashes[idx]); | |
} | |
if (this.removeDuplicates) { | |
const uniqueMap = new Map(); | |
selected_hashes.forEach((hash, idx) => { | |
if (!uniqueMap.has(hash)) { | |
uniqueMap.set(hash, selected_idx[idx]); | |
} | |
}); | |
selected_hashes = Array.from(uniqueMap.keys()); | |
selected_idx = Array.from(uniqueMap.values()); | |
} | |
return [selected_hashes, selected_idx]; | |
} | |
_hash(x) { | |
const sha256_hash = crypto.createHash('sha256'); | |
sha256_hash.update(String(x)); | |
const hash_bytes = sha256_hash.digest(); | |
// Convert hash bytes to BigInt | |
const bigIntHash = BigInt('0x' + hash_bytes.toString('hex')); | |
// Truncate | |
const truncated_hash = bigIntHash % BigInt(2 ** 64) - BigInt(2 ** 63); | |
return truncated_hash; // Keep as BigInt | |
} | |
/** | |
* Return hashes of all k-grams in a strin | |
*/ | |
_hashed_kgrams(string) { | |
const hashes = []; | |
const length = string.length; | |
if (length < this.k) { | |
return [this._hash(string) + this.salt]; | |
} | |
for (let offset = 0; offset <= length - this.k; offset++) { | |
hashes.push(this._hash(string.substring(offset, offset + this.k) + this.salt)); | |
} | |
return hashes; | |
} | |
} | |
console.log("Starting the winnowing process..."); | |
// Instantiate the Winnowing class | |
const winnowing = new Winnowing(); | |
const text = `V="S"print("S","S","S")forVinrange(V,V+1):ifV>1:forVinrange(2,V)`; | |
// Get hashes and indices for the provided text | |
console.log("Computing hashes and indices..."); | |
const [hashes, idx, token_coverage] = winnowing.getFingerprint(text); | |
console.log("Hashes and indices computed."); | |
console.log(hashes, idx, token_coverage); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment