Sam Minot sminot

Microbiome scientist at the Fred Hutchinson Cancer Research Center

sminot / parse_uniref_xml.py

Created March 15, 2018 18:12

Parse UniRef XML -> CSV

	#!/usr/bin/python

	import os
	import sys
	import xml
	import gzip
	import json
	import time
	from collections import defaultdict
	import pandas as pd

sminot / collect_s3_results.py

Last active April 4, 2018 21:38

Collect a set of results from S3 into a single feather file

sminot / interleave_all_fastq.py

Last active May 14, 2018 17:30

Interleave all of the Illumina files in a folder

	#!/usr/bin/python
	# encoding:utf8
	# authors: Erik Garrison, Sébastien Boisvert, Sam Minot
	"""This script takes a folder of fastq(.gz) files and interleaves them

	Usage:
	interleave-fasta folder
	"""

	import os

sminot / read_fasta_from_s3.py

Created May 25, 2018 22:16

Read FASTA from S3 with boto3

	def read_fasta_from_s3(bucket_name, key_name, sep="\t"):
	    """Yield (header, sequence) pairs from a FASTA object stored in S3.

	    The object at `bucket_name`/`key_name` is fetched with boto3, read
	    fully into memory, decoded as UTF-8 and parsed with SimpleFastaParser.
	    `sep` is accepted but is not used by this function.
	    """
	    client = boto3.client('s3')
	    response = client.get_object(Bucket=bucket_name, Key=key_name)

	    # Read the entire object body and decode it before parsing
	    fasta_text = response['Body'].read().decode('utf-8')

	    for record in SimpleFastaParser(io.StringIO(fasta_text)):
	        header, seq = record
	        yield header, seq

sminot / parse_sra_xml.py

Created June 12, 2018 16:32

Parse XML output from SRA

	#!/usr/bin/env python3

	import os
	import sys
	import xmltodict
	from collections import OrderedDict

	# Require at least one command-line argument
	assert len(sys.argv) > 1
	# The first argument must be a path that exists
	assert os.path.exists(sys.argv[1])
	# The first argument must end with ".xml"
	assert sys.argv[1].endswith(".xml")

sminot / read_gzip_fasta_from_s3.py

Created July 13, 2018 19:37

Read a gzipped FASTA from S3

	def read_fasta_from_s3(bucket_name, key_name, sep="\t"):
	    """Yield (header, sequence) pairs from a gzipped FASTA object in S3.

	    The object at `bucket_name`/`key_name` is fetched with boto3, read
	    fully into memory, gunzipped, decoded as UTF-8 and parsed with
	    SimpleFastaParser. `sep` is accepted but is not used by this function.
	    """
	    client = boto3.client('s3')
	    response = client.get_object(Bucket=bucket_name, Key=key_name)

	    # Wrap the downloaded bytes so GzipFile can read them like a file
	    compressed = io.BytesIO(response['Body'].read())
	    gz_reader = gzip.GzipFile(None, 'rb', fileobj=compressed)
	    fasta_text = gz_reader.read().decode('utf-8')

	    for record in SimpleFastaParser(io.StringIO(fasta_text)):
	        header, seq = record
	        yield header, seq

sminot / get_assembly_name_from_ncbi.py

Created August 1, 2018 21:48

Get the name of an organism, given an assembly ID

	from Bio import Entrez
	import xmltodict

	def get_name_from_assembly_id(assembly_name):
	handle = Entrez.esearch("assembly", term=assembly_name)
	search_result = xmltodict.parse("".join(handle.readlines()))
	handle.close()
	try:
	assembly_id = search_result["eSearchResult"]["IdList"]["Id"]
	except:

sminot / merge_paired_end_fastq.sh

Created August 22, 2018 20:23

Merge paired-end FASTQ reads with PANDAseq

	#!/bin/bash

	set -e

	module load PANDAseq/2.11-foss-2016b

	find . -name "*_1.fq.gz" \| sort -R \| while read fwd; do
	rev="${fwd/_1.fq.gz/_2.fq.gz}"
	[[ -s "$rev" ]]

sminot / kegg_api.py

Last active September 6, 2018 16:43

Get name from KEGG API

	# KEGG names
	@lru_cache(maxsize=None)
	def get_kegg_name(ko):
	    """Query the KEGG REST `list` endpoint for `ko` and return its name.

	    The returned value is whatever follows the last tab character in the
	    response text, with trailing newlines removed. Results are memoized
	    per `ko` by `lru_cache`, so repeated lookups do not repeat the request.
	    """
	    url = "http://rest.kegg.jp/list/{}".format(ko)
	    response = requests.get(url)
	    # Keep only the text after the final tab in the response
	    last_field = response.text.rsplit("\t", 1)[-1]
	    return last_field.rstrip("\n")

	# Example lookup; calling this issues an HTTP request to rest.kegg.jp
	get_kegg_name("K00975")

sminot / read_feather_from_s3.py

Created September 2, 2018 16:36

Read feather file directly from AWS S3

	import io
	import boto3
	import pandas as pd

	def read_feather_file_from_s3(s3_url):
	assert s3_url.startswith("s3://")
	bucket_name, key_name = s3_url[5:].split("/", 1)

	s3 = boto3.client('s3')
	retr = s3.get_object(Bucket=bucket_name, Key=key_name)