Last active
December 26, 2022 15:50
-
-
Save rchikhi/ef2a6371c1a6dc2ab53c572ff6890cea to your computer and use it in GitHub Desktop.
NTHash rolling nucleotide (kmer) hashing. NThash1 version, pure Python implementation of https://academic.oup.com/bioinformatics/article/32/22/3492/2525588
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# nthash1, pure python implementation | |
# | |
# (not nthash2, so only for k values < 64) | |
# ported from https://github.com/luizirber/nthash/ | |
h = {'A': 0x3c8b_fbb3_95c6_0474, | |
'C': 0x3193_c185_62a0_2b4c, | |
'G': 0x2032_3ed0_8257_2324, | |
'T': 0x2955_49f5_4be2_4456, | |
'N': 0} | |
rc = {'A': 0x2955_49f5_4be2_4456, | |
'C': 0x2032_3ed0_8257_2324, | |
'G': 0x3193_c185_62a0_2b4c, | |
'T': 0x3c8b_fbb3_95c6_0474, | |
'N': 0} | |
#https://gist.github.com/williballenthin/05aadb969e43788593d3 | |
def ROR(x, n, bits = 64): | |
mask = (2**n) - 1 | |
mask_bits = x & mask | |
return (x >> (n % bits)) | (mask_bits << ((bits - n) % bits)) | |
def ROL(x, n, bits = 64): | |
return ROR(x, (bits - n) % bits, bits) | |
class NTHash: | |
def __init__(self, seq, k): | |
assert(k <= len(seq)) | |
self.seq = seq | |
self.k = k | |
self.current_idx = 0 | |
self.max_idx = len(seq) - k + 1 | |
self.fh = 0 | |
for (i,v) in enumerate(seq[:k]): | |
self.fh ^= ROL(h[v], k - i - 1) | |
self.rh = 0 | |
for (i,v) in enumerate(seq[:k][::-1]): | |
self.rh ^= ROL(rc[v], k - i - 1) | |
def __iter__(self): | |
return self | |
def __next__(self): | |
if self.current_idx == self.max_idx: | |
raise StopIteration | |
if self.current_idx != 0: | |
i = self.current_idx - 1 | |
seqi = self.seq[i] | |
seqk = self.seq[i + self.k] | |
self.fh = ROL(self.fh,1) ^ ROL(h[seqi],self.k) ^ h[seqk] | |
self.rh = ROR(self.rh,1) ^ \ | |
ROR(rc[seqi],1) ^ \ | |
ROL(rc[seqk],self.k - 1) | |
self.current_idx += 1 | |
return min(self.rh, self.fh) | |
if __name__ == '__main__': | |
# https://github.com/luizirber/nthash/blob/latest/tests/nthash.rs#L14 | |
seq = "ACGTCGTCAGTCGATGCAGT" | |
print("hashing",seq,":",[hex(h) for h in NTHash(seq,5)]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment