Implement contig contamination analysis as in (blog post)
Last active
April 10, 2020 21:01
-
-
Save ctb/fa1c11b5e2f9614128685600911c842a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
""" | |
Walk through a file of long contigs and analyze each one for contamination | |
against an SBT database. | |
Author: C. Titus Brown, [email protected] | |
Requires sourmash 2.0.0a6. | |
""" | |
import argparse | |
import screed | |
import sourmash | |
from sourmash import sourmash_args, search | |
from sourmash.sbtmh import SearchMinHashesFindBestIgnoreMaxHash | |
import csv | |
def main(): | |
p = argparse.ArgumentParser() | |
p.add_argument('input_seqs') | |
p.add_argument('sbt_database') | |
p.add_argument('--threshold', type=float, default=0.05) | |
p.add_argument('--output-nomatch', type=argparse.FileType('wt')) | |
p.add_argument('--output-match', type=argparse.FileType('wt')) | |
p.add_argument('--csv', type=argparse.FileType('wt')) | |
args = p.parse_args() | |
tree = sourmash.load_sbt_index(args.sbt_database) | |
print(f'found SBT database {args.sbt_database}') | |
leaf = next(iter(tree.leaves())) | |
mh = leaf.data.minhash.copy_and_clear() | |
print(f'using ksize={mh.ksize}, scaled={mh.scaled}') | |
print(f'loading sequences from {args.input_seqs}') | |
if args.output_match: | |
print(f'saving match sequences to {args.output_match.name}') | |
if args.output_nomatch: | |
print(f'saving nomatch sequences to {args.output_nomatch.name}') | |
if args.csv: | |
print(f'outputting CSV summary to {args.csv.name}') | |
found_list = [] | |
total = 0 | |
matches = 0 | |
for record in screed.open(args.input_seqs): | |
total += 1 | |
found = False | |
search_fn = SearchMinHashesFindBestIgnoreMaxHash().search | |
query_mh = mh.copy_and_clear() | |
query_mh.add_sequence(record.sequence) | |
query = sourmash.SourmashSignature(query_mh) | |
# too small a sequence/not enough hashes? notify | |
if not query_mh.get_mins(): | |
print(f'note: skipping {query.name()[:20]}, no hashes in sketch') | |
continue | |
for leaf in tree.find(search_fn, query, args.threshold): | |
found = True | |
matches += 1 | |
similarity = query.similarity(leaf.data) | |
found_list.append((record.name, leaf.data.name(), similarity)) | |
break | |
if not found: | |
found_list.append((record.name, '', 0.0)) | |
if found and args.output_match: | |
args.output_match.write(f'>{record.name}\n{record.sequence}\n') | |
if not found and args.output_nomatch: | |
args.output_nomatch.write(f'>{record.name}\n{record.sequence}\n') | |
print(f'searched {total}, found {matches}', end='\r') | |
print('') | |
if args.csv: | |
w = csv.DictWriter(args.csv, fieldnames=['query', 'match', 'score'], | |
delimiter='\t') | |
w.writeheader() | |
for (query, match, score) in found_list: | |
w.writerow(dict(query=query, match=match, score=score)) | |
if __name__ == '__main__': | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sourmash==2.0.0a6 |
Since December 2019 GatherMinHashesFindBestIgnoreMaxHash
became GatherMinHashes
see sourmash-bio/sourmash#556
Please see sourmash-bio/sourmash#941 - adding it into sourmash/ utils subdirectory now, so that it can be part of our tests!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I was trying to use this and noticed that the class
SearchMinHashesFindBestIgnoreMaxHash
fromsourmash.sbtmh
was actuallyGatherMinHashesFindBestIgnoreMaxHash
.