Skip to content

Instantly share code, notes, and snippets.

View sminot's full-sized avatar

Sam Minot sminot

View GitHub Profile
@sminot
sminot / parse_uniref_xml.py
Created March 15, 2018 18:12
Parse UniRef XML -> CSV
#!/usr/bin/python
import os
import sys
import xml
import gzip
import json
import time
from collections import defaultdict
import pandas as pd
@sminot
sminot / collect_s3_results.py
Last active April 4, 2018 21:38
Collect a set of results from S3 into a single feather file
#!/usr/local/bin/python
import os
import io
import json
import gzip
import boto3
import argparse
import pandas as pd
@sminot
sminot / interleave_all_fastq.py
Last active May 14, 2018 17:30
Interleave all of the Illumina files in a folder
#!/usr/bin/python
# encoding:utf8
# authors: Erik Garrison, Sébastien Boisvert, Sam Minot
"""This script takes a folder of fastq(.gz) files and interleaves them
Usage:
interleave-fasta folder
"""
import os
@sminot
sminot / read_fasta_from_s3.py
Created May 25, 2018 22:16
Read FASTA from S3 with boto3
def read_fasta_from_s3(bucket_name, key_name, sep="\t"):
    """Yield (header, sequence) pairs from a FASTA object stored in S3.

    The object at `key_name` in `bucket_name` is fetched with boto3, read
    fully into memory, decoded as UTF-8, and parsed with SimpleFastaParser.

    `sep` is accepted but is not used by this function.
    """
    client = boto3.client('s3')
    response = client.get_object(Bucket=bucket_name, Key=key_name)
    # The entire body is read and decoded before any record is parsed.
    fasta_text = response['Body'].read().decode('utf-8')
    records = SimpleFastaParser(io.StringIO(fasta_text))
    for header, seq in records:
        yield header, seq
@sminot
sminot / parse_sra_xml.py
Created June 12, 2018 16:32
Parse XML output from SRA
#!/usr/bin/env python3
import os
import sys
import xmltodict
from collections import OrderedDict
assert len(sys.argv) > 1
assert os.path.exists(sys.argv[1])
assert sys.argv[1].endswith(".xml")
@sminot
sminot / read_gzip_fasta_from_s3.py
Created July 13, 2018 19:37
Read a gzipped fasta from s3
def read_fasta_from_s3(bucket_name, key_name, sep="\t"):
    """Yield (header, sequence) pairs from a gzipped FASTA object in S3.

    The object at `key_name` in `bucket_name` is fetched with boto3, read
    fully into memory, gunzipped, decoded as UTF-8, and parsed with
    SimpleFastaParser.

    `sep` is accepted but is not used by this function.
    """
    client = boto3.client('s3')
    response = client.get_object(Bucket=bucket_name, Key=key_name)
    # Wrap the downloaded bytes so GzipFile can read them like a file.
    compressed = io.BytesIO(response['Body'].read())
    unzipped = gzip.GzipFile(filename=None, mode='rb', fileobj=compressed)
    fasta_text = unzipped.read().decode('utf-8')
    records = SimpleFastaParser(io.StringIO(fasta_text))
    for header, seq in records:
        yield header, seq
@sminot
sminot / get_assembly_name_from_ncbi.py
Created August 1, 2018 21:48
Get the name of an organism, given an assembly ID
from Bio import Entrez
import xmltodict
def get_name_from_assembly_id(assembly_name):
handle = Entrez.esearch("assembly", term=assembly_name)
search_result = xmltodict.parse("".join(handle.readlines()))
handle.close()
try:
assembly_id = search_result["eSearchResult"]["IdList"]["Id"]
except:
@sminot
sminot / merge_paired_end_fastq.sh
Created August 22, 2018 20:23
Merge paired-end FASTQ reads with PANDAseq
#!/bin/bash
set -e
module load PANDAseq/2.11-foss-2016b
find . -name "*_1.fq.gz" | sort -R | while read fwd; do
rev="${fwd/_1.fq.gz/_2.fq.gz}"
[[ -s "$rev" ]]
@sminot
sminot / kegg_api.py
Last active September 6, 2018 16:43
Get name from KEGG API
# KEGG names
@lru_cache(maxsize=None)
def get_kegg_name(ko):
    """Return the text KEGG's REST `list` endpoint reports for `ko`.

    The response body is split on tabs and the last field is returned with
    any trailing newlines removed. Results are memoized per `ko` by the
    unbounded `lru_cache`, so each identifier is requested at most once.
    """
    url = "http://rest.kegg.jp/list/{}".format(ko)
    response = requests.get(url)
    fields = response.text.split("\t")
    # The last tab-separated field is the one returned.
    last_field = fields[-1]
    return last_field.rstrip("\n")
get_kegg_name("K00975")
@sminot
sminot / read_feather_from_s3.py
Created September 2, 2018 16:36
Read feather file directly from AWS S3
import io
import boto3
import pandas as pd
def read_feather_file_from_s3(s3_url):
assert s3_url.startswith("s3://")
bucket_name, key_name = s3_url[5:].split("/", 1)
s3 = boto3.client('s3')
retr = s3.get_object(Bucket=bucket_name, Key=key_name)