Created
August 10, 2017 12:02
-
-
Save tammoippen/3a3148649950720157b9655b2868cb8b to your computer and use it in GitHub Desktop.
Some algorithms from the paper "Bed-Tree: An All-Purpose Index Structure for String Similarity Search Based on Edit Distance", Zhang et al. (doi 10.1145/1807167.1807266)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from functools import lru_cache | |
from typing import Dict, Iterable, List, Tuple | |
import mmh3 | |
import numpy as np | |
from slugify import slugify | |
def verify_edit_distance(s1: str, s2: str, th: int) -> bool: | |
'''Test, whether the edit distance between `s1` and `s2` is below `th` in | |
O(max(len(s1), len(s2))) | |
Parameter: | |
s1: str First query string. | |
s2: str Second query string. | |
th: int Edit distance threshold. | |
Returns: | |
bool: True if edit distance <= th, False otherwise. | |
''' | |
ls1 = len(s1) | |
ls2 = len(s2) | |
if abs(ls1 - ls2) > th: | |
return False | |
t = np.zeros((2, ls2+1)) | |
for j in range(min(ls2+1, 1+th)): | |
t[0][j] = j | |
for i in range(1, ls1+1): | |
m = th + 1 | |
for j in range(max(0, i-th), min(ls2, i+th)+1): | |
d1, d2, d3 = th+1, th+1, th+1 | |
if j < i + th: | |
d1 = t[0][j] + 1 | |
if j > 0: | |
d2 = t[1][j-1] + 1 | |
d3 = t[0][j-1] + int(s1[i-1] != s2[j-1]) | |
t[1][j] = min(d1, d2, d3) | |
m = min(m, t[1][j]) | |
if m > th: | |
return False | |
t[0] = t[1] | |
return True | |
def lower_bound_edit_distance(v_i: List[int], v_j: List[int], n: int=2) -> float: | |
'''Computes the lower bound edit distance on bucket vectors | |
Computes the lower bound edit distance on n-gram in `bucket space` between bucket | |
vectors `v_i` and `v_j`. | |
Parameter: | |
v_i: List[int] Bucket n-gram vector. | |
v_j: List[int] Bucket n-gram vector. | |
n: int n-gram size. | |
Returns: | |
float: lower bound (absolute) edit distance | |
''' | |
assert len(v_i) == len(v_j) | |
a, b = 0, 0 | |
for l in range(len(v_i)): | |
if v_i[l] > v_j[l]: | |
a += (v_i[l] - v_j[l]) / n | |
if v_i[l] < v_j[l]: | |
b += (v_i[l] - v_j[l]) / n | |
return max(a, b) | |
@lru_cache(maxsize=4096) # stores 64 x 2-gram hashes | |
def str_hash(t: str, seed=42) -> int: | |
'''Use of Murmur3 after this blog: | |
https://research.neustar.biz/2012/02/02/choosing-a-good-hash-function-part-3/ | |
''' | |
return mmh3.hash(t, seed=42) | |
def ngrams(t: str, n: int=2) -> Dict[str, int]: | |
'''Compute frequency of n-grams in text t. | |
Parameter: | |
t: str Input string. | |
n: int Size of the n-grams. | |
Returns: | |
Dict[str, int]: The n-gram frequency: n-gram -> frequency | |
''' | |
res = defaultdict(int) | |
if t: | |
for j in range(n-1, 0, -1): | |
res['^'*j + t[:n-j]] += 1 | |
for j in range(1, n): | |
res[t[-(n-j):] + '$'*j] += 1 | |
for i in range(len(t) - (n-1)): | |
res[t[i:i+n]] += 1 | |
return res | |
def ngrams_pos(t: str, n: int=2) -> List[Tuple[str, int]]: | |
'''Compute positional n-grams of text t. | |
Parameter: | |
t: str Input string. | |
n: int Size of the n-grams. | |
Returns: | |
List[Tuple[str, int]]: The positional n-grams. | |
''' | |
res = list() | |
if t: | |
i = 0 | |
for j in range(n-1, 0, -1): | |
res += [('^'*j + t[:n-j], i)] | |
i += 1 | |
for j in range(len(t) - (n-1)): | |
res += [(t[j:j+n], i)] | |
i += 1 | |
for j in range(1, n): | |
res += [(t[-(n-j):] + '$'*j, i)] | |
i += 1 | |
return res | |
def ngram2bucket(ngrams: Dict[str, int], n_buckets: int=10) -> Iterable[int]: | |
'''Assign n-gram frequency onto n buckets by hashing the n-gram and counting | |
the frequencies for each bucket. | |
Parameter: | |
ngrams: Dict[str, int] The n-gram frequency: n-gram -> frequency | |
n_buckets: int Number of target buckets. | |
Returns: | |
narray dtype=int: Frequency of bucket occurences. | |
''' | |
res = [0] * n_buckets | |
for ng, n in ngrams.items(): | |
res[str_hash(ng) % n_buckets] += n | |
return res | |
def zorder(xs: Iterable[int], sig_bits: int=None) -> int: | |
'''Compute the z-order of a list of ints, i.e. most significant bits of all | |
ints come first and least significant bits come last. For example the list | |
[3, 2, 1, 3] has bits [0b11, 0b10, 0b01, 0b11]. The z-order of these ints | |
is: 0b11011011 | |
bin(zorder([3, 2, 1, 3])) -> '0b11011011' = 219 | |
Parameter: | |
xs: Iterable[int] Some list / narray / ... of (positive) integer values. | |
sig_bits: int The num. of significant bits for counts xs. Assigning | |
an appropriat value, speeds up computation. | |
Returns: | |
int: z-order of given list | |
''' | |
res = 0 | |
if sig_bits is None: | |
sig_bits = max(xs).bit_length() | |
max_count = 1 << sig_bits | |
idx = len(xs)*sig_bits-1 | |
for i in range(sig_bits-1, -1, -1): | |
for x in xs: | |
if x >= max_count: | |
return zorder(xs, sig_bits=None) # computes the sig_bits from input | |
bit = (x & (1 << i)) >> i | |
res |= bit << idx | |
idx -= 1 | |
return res | |
def gram_counting_order(t: str, n: int=2, n_buckets: int=10, sig_bits: int=None): | |
'''Compute the Gram Counting Order from paper (doi: 10.1145/1807167.1807266) | |
Is the same as: | |
zorder(ngram2bucket(ngrams(t, n=n), n_buckets=n_buckets), sig_bits=sig_bits) | |
Parameter: | |
t: str The input string. | |
n: int The size of the n-grams. | |
n_buckets: int The number of buckets to count the n-grams. | |
sig_bits: int The num. of significant bits for counts in bucket vectors. | |
Returns: | |
int: Gram Counting Order of t. | |
''' | |
def ngram2bucket_fast(t: str, n: int=2, n_buckets: int=10) -> Iterable[int]: | |
res = [0] * n_buckets | |
if t: | |
for j in range(n-1, 0, -1): | |
res[str_hash('^'*j + t[:n-j]) % n_buckets] += 1 | |
for j in range(1, n): | |
res[str_hash(t[-(n-j):] + '$'*j) % n_buckets] += 1 | |
for i in range(len(t) - (n-1)): | |
res[str_hash(t[i:i+n]) % n_buckets] += 1 | |
return res | |
return zorder( | |
ngram2bucket_fast( | |
slugify(t), | |
n=n, | |
n_buckets=n_buckets | |
), | |
sig_bits=sig_bits | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment