Created
November 2, 2020 03:05
-
-
Save ctb/763fa60e4357461d1417ccb915cdcef4 to your computer and use it in GitHub Desktop.
a simple prefetch script that searches large sourmash databases for all possible matches, and then saves them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import sys | |
import argparse | |
import copy | |
import sourmash | |
from sourmash import sourmash_args | |
from sourmash.logging import notify, error | |
def main(): | |
p = argparse.ArgumentParser() | |
p.add_argument('--db', nargs='+', action='append', | |
help='one or more LCA databases to use') | |
p.add_argument('--query', nargs='*', default=[], action='append', | |
help='one or more signature files to use as queries') | |
p.add_argument('--save-matches', required=True) | |
p.add_argument('--output-unassigned') | |
p.add_argument('--threshold-bp', type=float, default=1e5) | |
args = p.parse_args() | |
# flatten --db and --query lists | |
args.db = [item for sublist in args.db for item in sublist] | |
args.query = [item for sublist in args.query for item in sublist] | |
# build one big query: | |
query_sigs = [] | |
for query_file in args.query: | |
query_sigs.extend(sourmash_args.load_file_as_signatures(query_file, | |
ksize=31)) | |
mh = query_sigs[0].minhash | |
for query_sig in query_sigs[1:]: | |
mh += query_sig.minhash | |
unident_mh = copy.copy(mh) | |
notify(f'Loaded {len(mh.hashes)} hashes from {len(query_sigs)} query signatures.') | |
# iterate over signatures in one at a time | |
keep = [] | |
n = 0 | |
for db in args.db: | |
for sig in sourmash_args.load_file_as_signatures(db, ksize=31): | |
n += 1 | |
common = mh.count_common(sig.minhash) | |
if common: | |
# check scaled... | |
if common * mh.scaled >= args.threshold_bp: | |
keep.append(sig) | |
unident_mh.remove_many(sig.minhash.hashes) | |
if n % 10 == 0: | |
notify(f'{n} searched, {len(keep)} matches.', end='\r') | |
notify(f'{n} searched, {len(keep)} matches.') | |
if keep and args.save_matches: | |
notify('saving all matches to "{}"', args.save_matches) | |
with sourmash_args.FileOutput(args.save_matches, 'wt') as fp: | |
sourmash.save_signatures(keep, fp) | |
if args.output_unassigned: | |
notify('saving unidentified hashes to "{}"', args.output_unassigned) | |
ss = sourmash.SourmashSignature(unident_mh) | |
with open(args.output_unassigned, 'wt') as fp: | |
sourmash.save_signatures([ ss ], fp) | |
return 0 | |
if __name__ == '__main__': | |
sys.exit(main()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for this! I have been meaning to get involved on this open-source project again, but have been pulled away by exciting but demanding developments at work. This is really inspiring me to try to make some time to return to learning github and using this little gist to solve a problem of my own. :)