Skip to content

Instantly share code, notes, and snippets.

@dcdanko
Created January 30, 2019 16:13
Show Gist options
  • Save dcdanko/439fe09db442d607f98fcee4f9059abd to your computer and use it in GitHub Desktop.
Save dcdanko/439fe09db442d607f98fcee4f9059abd to your computer and use it in GitHub Desktop.
import click
def parse_fasta(fasta):
cur_line, cur_header = '', ''
for line in fasta:
line = line.strip().upper()
if line[0] == '>':
if cur_line:
click.echo(cur_header, err=True)
yield cur_line
cur_header = line
else:
cur_line += line
click.echo(cur_header, err=True)
yield cur_line
BASE_TO_VAL = {
'A': 0, 'T': 1, 'C': 2, 'G': 3, 'N': 4
}
def rabin_karp(seq, k=7):
initial_seq, rk_hash = seq[:k], 0
for i, base in enumerate(initial_seq):
rk_hash += (2 ** (k - i - 1)) * BASE_TO_VAL[base]
yield rk_hash, 0
for i in range(k, len(seq)):
rk_hash -= (2 ** (k - 1)) * BASE_TO_VAL[seq[i - k]]
rk_hash *= 2
rk_hash += BASE_TO_VAL[seq[i]]
yield rk_hash, i
@click.command()
@click.argument('fasta', type=click.File('r'))
def main(fasta):
tbl = {}
for seq in parse_fasta(fasta):
count = 0
for rkh, pos in rabin_karp(seq):
try:
tbl[rkh].append(pos)
except KeyError:
tbl[rkh] = [pos]
count += 1
if count % 100000 == 0:
click.echo(f'{count} hashes', err=True)
click.echo(f'{count} hashes', err=True)
break
for rkh, poss in tbl.items():
poss = ','.join([str(el) for el in poss])
print(f'{rkh}\t{poss}')
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment