tammoippen · August 10, 2017 12:02
diff --git a/bed-tree.py b/bed-tree.py
 from collections import defaultdict
 from functools import lru_cache
 from typing import Dict, Iterable, List, Tuple

 import mmh3
 import numpy as np
 from slugify import slugify


 def verify_edit_distance(s1: str, s2: str, th: int) -> bool:
    '''Test, whether the edit distance between `s1` and `s2` is below `th` in
    O(max(len(s1), len(s2)))

    Parameter:
        s1: str  First query string.
        s2: str  Second query string.
        th: int  Edit distance threshold.

    Returns:
        bool: True if edit distance <= th, False otherwise.
    '''
    ls1 = len(s1)
    ls2 = len(s2)

    if abs(ls1 - ls2) > th:
        return False

    t = np.zeros((2, ls2+1))

    for j in range(min(ls2+1, 1+th)):
        t[0][j] = j

    for i in range(1, ls1+1):
        m = th + 1

        for j in range(max(0, i-th), min(ls2, i+th)+1):
            d1, d2, d3 = th+1, th+1, th+1

            if j < i + th:
                d1 = t[0][j] + 1

            if j > 0:
                d2 = t[1][j-1] + 1
                d3 = t[0][j-1] + int(s1[i-1] != s2[j-1])

            t[1][j] = min(d1, d2, d3)
            m = min(m, t[1][j])

        if m > th:
            return False

        t[0] = t[1]
    return True


 def lower_bound_edit_distance(v_i: List[int], v_j: List[int], n: int=2) -> float:
    '''Computes the lower bound edit distance on bucket vectors

    Computes the lower bound edit distance on n-gram in `bucket space` between bucket
    vectors `v_i` and `v_j`.

    Parameter:
        v_i: List[int]  Bucket n-gram vector.
        v_j: List[int]  Bucket n-gram vector.
        n: int          n-gram size.

    Returns:
        float: lower bound (absolute) edit distance
    '''
    assert len(v_i) == len(v_j)
    a, b = 0, 0
    for l in range(len(v_i)):
        if v_i[l] > v_j[l]:
            a += (v_i[l] - v_j[l]) / n
        if v_i[l] < v_j[l]:
            b += (v_i[l] - v_j[l]) / n

    return max(a, b)


 @lru_cache(maxsize=4096)  # stores 64 x 2-gram hashes
 def str_hash(t: str, seed=42) -> int:
    '''Use of Murmur3 after this blog:

    https://research.neustar.biz/2012/02/02/choosing-a-good-hash-function-part-3/
    '''
    return mmh3.hash(t, seed=42)


 def ngrams(t: str, n: int=2) -> Dict[str, int]:
    '''Compute frequency of n-grams in text t.

    Parameter:
        t: str  Input string.
        n: int  Size of the n-grams.

    Returns:
        Dict[str, int]:  The n-gram frequency: n-gram -> frequency
    '''
    res = defaultdict(int)
    if t:
        for j in range(n-1, 0, -1):
            res['^'*j + t[:n-j]] += 1
        for j in range(1, n):
            res[t[-(n-j):] + '$'*j] += 1
        for i in range(len(t) - (n-1)):
            res[t[i:i+n]] += 1
    return res


 def ngrams_pos(t: str, n: int=2) -> List[Tuple[str, int]]:
    '''Compute positional n-grams of text t.

    Parameter:
        t: str  Input string.
        n: int  Size of the n-grams.

    Returns:
        List[Tuple[str, int]]:  The positional n-grams.
    '''
    res = list()
    if t:
        i = 0
        for j in range(n-1, 0, -1):
            res += [('^'*j + t[:n-j], i)]
            i += 1
        for j in range(len(t) - (n-1)):
            res += [(t[j:j+n], i)]
            i += 1
        for j in range(1, n):
            res += [(t[-(n-j):] + '$'*j, i)]
            i += 1
    return res


 def ngram2bucket(ngrams: Dict[str, int], n_buckets: int=10) -> Iterable[int]:
    '''Assign n-gram frequency onto n buckets by hashing the n-gram and counting
    the frequencies for each bucket.

    Parameter:
        ngrams: Dict[str, int]  The n-gram frequency: n-gram -> frequency
        n_buckets: int          Number of target buckets.

    Returns:
        narray dtype=int:  Frequency of bucket occurences.
    '''
    res = [0] * n_buckets
    for ng, n in ngrams.items():
        res[str_hash(ng) % n_buckets] += n
    return res


 def zorder(xs: Iterable[int], sig_bits: int=None) -> int:
    '''Compute the z-order of a list of ints, i.e. most significant bits of all
    ints come first and least significant bits come last. For example the list
    [3, 2, 1, 3] has bits [0b11, 0b10, 0b01, 0b11]. The z-order of these ints
    is: 0b11011011

    bin(zorder([3, 2, 1, 3])) -> '0b11011011' = 219

    Parameter:
        xs: Iterable[int]  Some list / narray / ... of (positive) integer values.
        sig_bits: int      The num. of significant bits for counts xs. Assigning
                           an appropriat value, speeds up computation.

    Returns:
        int: z-order of given list
    '''
    res = 0

    if sig_bits is None:
        sig_bits = max(xs).bit_length()

    max_count = 1 << sig_bits
    idx = len(xs)*sig_bits-1
    for i in range(sig_bits-1, -1, -1):
        for x in xs:
            if x >= max_count:
                return zorder(xs, sig_bits=None)  # computes the sig_bits from input

            bit = (x & (1 << i)) >> i
            res |= bit << idx
            idx -= 1
    return res


 def gram_counting_order(t: str, n: int=2, n_buckets: int=10, sig_bits: int=None):
    '''Compute the Gram Counting Order from paper (doi: 10.1145/1807167.1807266)

    Is the same as:
        zorder(ngram2bucket(ngrams(t, n=n), n_buckets=n_buckets), sig_bits=sig_bits)

    Parameter:
        t: str          The input string.
        n: int          The size of the n-grams.
        n_buckets: int  The number of buckets to count the n-grams.
        sig_bits: int   The num. of significant bits for counts in bucket vectors.

    Returns:
        int: Gram Counting Order of t.
    '''
    def ngram2bucket_fast(t: str, n: int=2, n_buckets: int=10) -> Iterable[int]:
        res = [0] * n_buckets
        if t:
            for j in range(n-1, 0, -1):
                res[str_hash('^'*j + t[:n-j]) % n_buckets] += 1
            for j in range(1, n):
                res[str_hash(t[-(n-j):] + '$'*j) % n_buckets] += 1
            for i in range(len(t) - (n-1)):
                res[str_hash(t[i:i+n]) % n_buckets] += 1
        return res

    return zorder(
        ngram2bucket_fast(
            slugify(t),
            n=n,
            n_buckets=n_buckets
        ),
        sig_bits=sig_bits
    )
	from collections import defaultdict
	from functools import lru_cache
	from typing import Dict, Iterable, List, Tuple

	import mmh3
	import numpy as np
	from slugify import slugify


	def verify_edit_distance(s1: str, s2: str, th: int) -> bool:
	'''Test, whether the edit distance between `s1` and `s2` is below `th` in
	O(max(len(s1), len(s2)))

	Parameter:
	s1: str First query string.
	s2: str Second query string.
	th: int Edit distance threshold.

	Returns:
	bool: True if edit distance <= th, False otherwise.
	'''
	ls1 = len(s1)
	ls2 = len(s2)

	if abs(ls1 - ls2) > th:
	return False

	t = np.zeros((2, ls2+1))

	for j in range(min(ls2+1, 1+th)):
	t[0][j] = j

	for i in range(1, ls1+1):
	m = th + 1

	for j in range(max(0, i-th), min(ls2, i+th)+1):
	d1, d2, d3 = th+1, th+1, th+1

	if j < i + th:
	d1 = t[0][j] + 1

	if j > 0:
	d2 = t[1][j-1] + 1
	d3 = t[0][j-1] + int(s1[i-1] != s2[j-1])

	t[1][j] = min(d1, d2, d3)
	m = min(m, t[1][j])

	if m > th:
	return False

	t[0] = t[1]
	return True


	def lower_bound_edit_distance(v_i: List[int], v_j: List[int], n: int=2) -> float:
	'''Computes the lower bound edit distance on bucket vectors

	Computes the lower bound edit distance on n-gram in `bucket space` between bucket
	vectors `v_i` and `v_j`.

	Parameter:
	v_i: List[int] Bucket n-gram vector.
	v_j: List[int] Bucket n-gram vector.
	n: int n-gram size.

	Returns:
	float: lower bound (absolute) edit distance
	'''
	assert len(v_i) == len(v_j)
	a, b = 0, 0
	for l in range(len(v_i)):
	if v_i[l] > v_j[l]:
	a += (v_i[l] - v_j[l]) / n
	if v_i[l] < v_j[l]:
	b += (v_i[l] - v_j[l]) / n

	return max(a, b)


	@lru_cache(maxsize=4096) # stores 64 x 2-gram hashes
	def str_hash(t: str, seed=42) -> int:
	'''Use of Murmur3 after this blog:

	https://research.neustar.biz/2012/02/02/choosing-a-good-hash-function-part-3/
	'''
	return mmh3.hash(t, seed=42)


	def ngrams(t: str, n: int=2) -> Dict[str, int]:
	'''Compute frequency of n-grams in text t.

	Parameter:
	t: str Input string.
	n: int Size of the n-grams.

	Returns:
	Dict[str, int]: The n-gram frequency: n-gram -> frequency
	'''
	res = defaultdict(int)
	if t:
	for j in range(n-1, 0, -1):
	res['^'*j + t[:n-j]] += 1
	for j in range(1, n):
	res[t[-(n-j):] + '$'*j] += 1
	for i in range(len(t) - (n-1)):
	res[t[i:i+n]] += 1
	return res


	def ngrams_pos(t: str, n: int=2) -> List[Tuple[str, int]]:
	'''Compute positional n-grams of text t.

	Parameter:
	t: str Input string.
	n: int Size of the n-grams.

	Returns:
	List[Tuple[str, int]]: The positional n-grams.
	'''
	res = list()
	if t:
	i = 0
	for j in range(n-1, 0, -1):
	res += [('^'*j + t[:n-j], i)]
	i += 1
	for j in range(len(t) - (n-1)):
	res += [(t[j:j+n], i)]
	i += 1
	for j in range(1, n):
	res += [(t[-(n-j):] + '$'*j, i)]
	i += 1
	return res


	def ngram2bucket(ngrams: Dict[str, int], n_buckets: int=10) -> Iterable[int]:
	'''Assign n-gram frequency onto n buckets by hashing the n-gram and counting
	the frequencies for each bucket.

	Parameter:
	ngrams: Dict[str, int] The n-gram frequency: n-gram -> frequency
	n_buckets: int Number of target buckets.

	Returns:
	narray dtype=int: Frequency of bucket occurences.
	'''
	res = [0] * n_buckets
	for ng, n in ngrams.items():
	res[str_hash(ng) % n_buckets] += n
	return res


	def zorder(xs: Iterable[int], sig_bits: int=None) -> int:
	'''Compute the z-order of a list of ints, i.e. most significant bits of all
	ints come first and least significant bits come last. For example the list
	[3, 2, 1, 3] has bits [0b11, 0b10, 0b01, 0b11]. The z-order of these ints
	is: 0b11011011

	bin(zorder([3, 2, 1, 3])) -> '0b11011011' = 219

	Parameter:
	xs: Iterable[int] Some list / narray / ... of (positive) integer values.
	sig_bits: int The num. of significant bits for counts xs. Assigning
	an appropriat value, speeds up computation.

	Returns:
	int: z-order of given list
	'''
	res = 0

	if sig_bits is None:
	sig_bits = max(xs).bit_length()

	max_count = 1 << sig_bits
	idx = len(xs)*sig_bits-1
	for i in range(sig_bits-1, -1, -1):
	for x in xs:
	if x >= max_count:
	return zorder(xs, sig_bits=None) # computes the sig_bits from input

	bit = (x & (1 << i)) >> i
	res \|= bit << idx
	idx -= 1
	return res


	def gram_counting_order(t: str, n: int=2, n_buckets: int=10, sig_bits: int=None):
	'''Compute the Gram Counting Order from paper (doi: 10.1145/1807167.1807266)

	Is the same as:
	zorder(ngram2bucket(ngrams(t, n=n), n_buckets=n_buckets), sig_bits=sig_bits)

	Parameter:
	t: str The input string.
	n: int The size of the n-grams.
	n_buckets: int The number of buckets to count the n-grams.
	sig_bits: int The num. of significant bits for counts in bucket vectors.

	Returns:
	int: Gram Counting Order of t.
	'''
	def ngram2bucket_fast(t: str, n: int=2, n_buckets: int=10) -> Iterable[int]:
	res = [0] * n_buckets
	if t:
	for j in range(n-1, 0, -1):
	res[str_hash('^'*j + t[:n-j]) % n_buckets] += 1
	for j in range(1, n):
	res[str_hash(t[-(n-j):] + '$'*j) % n_buckets] += 1
	for i in range(len(t) - (n-1)):
	res[str_hash(t[i:i+n]) % n_buckets] += 1
	return res

	return zorder(
	ngram2bucket_fast(
	slugify(t),
	n=n,
	n_buckets=n_buckets
	),
	sig_bits=sig_bits
	)