Skip to content

Instantly share code, notes, and snippets.

@ctb
Last active April 10, 2020 21:01
Show Gist options
  • Save ctb/fa1c11b5e2f9614128685600911c842a to your computer and use it in GitHub Desktop.
Save ctb/fa1c11b5e2f9614128685600911c842a to your computer and use it in GitHub Desktop.

Implement contig contamination analysis as in (blog post)

#! /usr/bin/env python
"""
Walk through a file of long contigs and analyze each one for contamination
against an SBT database.
Author: C. Titus Brown, [email protected]
Requires sourmash 2.0.0a6.
"""
import argparse
import screed
import sourmash
from sourmash import sourmash_args, search
from sourmash.sbtmh import SearchMinHashesFindBestIgnoreMaxHash
import csv
def main():
p = argparse.ArgumentParser()
p.add_argument('input_seqs')
p.add_argument('sbt_database')
p.add_argument('--threshold', type=float, default=0.05)
p.add_argument('--output-nomatch', type=argparse.FileType('wt'))
p.add_argument('--output-match', type=argparse.FileType('wt'))
p.add_argument('--csv', type=argparse.FileType('wt'))
args = p.parse_args()
tree = sourmash.load_sbt_index(args.sbt_database)
print(f'found SBT database {args.sbt_database}')
leaf = next(iter(tree.leaves()))
mh = leaf.data.minhash.copy_and_clear()
print(f'using ksize={mh.ksize}, scaled={mh.scaled}')
print(f'loading sequences from {args.input_seqs}')
if args.output_match:
print(f'saving match sequences to {args.output_match.name}')
if args.output_nomatch:
print(f'saving nomatch sequences to {args.output_nomatch.name}')
if args.csv:
print(f'outputting CSV summary to {args.csv.name}')
found_list = []
total = 0
matches = 0
for record in screed.open(args.input_seqs):
total += 1
found = False
search_fn = SearchMinHashesFindBestIgnoreMaxHash().search
query_mh = mh.copy_and_clear()
query_mh.add_sequence(record.sequence)
query = sourmash.SourmashSignature(query_mh)
# too small a sequence/not enough hashes? notify
if not query_mh.get_mins():
print(f'note: skipping {query.name()[:20]}, no hashes in sketch')
continue
for leaf in tree.find(search_fn, query, args.threshold):
found = True
matches += 1
similarity = query.similarity(leaf.data)
found_list.append((record.name, leaf.data.name(), similarity))
break
if not found:
found_list.append((record.name, '', 0.0))
if found and args.output_match:
args.output_match.write(f'>{record.name}\n{record.sequence}\n')
if not found and args.output_nomatch:
args.output_nomatch.write(f'>{record.name}\n{record.sequence}\n')
print(f'searched {total}, found {matches}', end='\r')
print('')
if args.csv:
w = csv.DictWriter(args.csv, fieldnames=['query', 'match', 'score'],
delimiter='\t')
w.writeheader()
for (query, match, score) in found_list:
w.writerow(dict(query=query, match=match, score=score))
if __name__ == '__main__':
main()
sourmash==2.0.0a6
@marcopessoa
Copy link

I was trying to use this and noticed that the class SearchMinHashesFindBestIgnoreMaxHash from sourmash.sbtmh was actually GatherMinHashesFindBestIgnoreMaxHash.

@Juke34
Copy link

Juke34 commented Jan 22, 2020

Since December 2019 GatherMinHashesFindBestIgnoreMaxHash became GatherMinHashes see sourmash-bio/sourmash#556

@ctb
Copy link
Author

ctb commented Apr 10, 2020

Please see sourmash-bio/sourmash#941 - adding it into sourmash/ utils subdirectory now, so that it can be part of our tests!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment