Skip to content

Instantly share code, notes, and snippets.

View sminot's full-sized avatar

Sam Minot sminot

View GitHub Profile
@sminot
sminot / boto2_aws_s3_ls.py
Last active March 28, 2019 21:44
List contents of 'folder' in S3
import boto3
def aws_s3_ls(bucket, prefix):
conn = boto3.client('s3')
fps = []
r = conn.list_objects_v2(
Bucket=bucket,
Prefix=prefix
@sminot
sminot / glimmer_gff_to_fasta.py
Last active February 25, 2019 19:00
GFF to Protein FASTA
#!/usr/bin/env python
"""Convert GlimmerHMM GFF3 gene predictions into protein sequences.
This works with the GlimmerHMM GFF3 output format:
##gff-version 3
##sequence-region Contig5.15 1 47390
Contig5.15 GlimmerHMM mRNA 323 325 . + . ID=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1
Contig5.15 GlimmerHMM CDS 323 325 . + 0 ID=Contig5.15.cds1.1;Parent=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1;Note=final-exon
@sminot
sminot / delete_versioned_s3_folder.sh
Created January 29, 2019 17:51
Delete all files and versioned file history from an S3 folder
#!/bin/bash
# Stop at the first command that returns a non-zero status; the checks below rely on this.
set -e
# Positional arguments: the first is stored as the bucket, the second as the prefix.
bucket=$1
prefix=$2
# (( expr )) returns non-zero when expr evaluates to 0, so together with
# `set -e` the script exits here if either argument is missing or empty.
(( ${#bucket} > 0 ))
(( ${#prefix} > 0 ))
@sminot
sminot / split_10X_barcodes.py
Created January 18, 2019 00:16
Split reads by 10X barcode (in header)
#!/usr/bin/env python3
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from collections import defaultdict
import gzip
from functools import lru_cache
import os
import sys
# Require exactly two command-line arguments (sys.argv[0] is the script itself).
# NOTE(review): assert statements are removed when Python runs with -O.
assert len(sys.argv) == 3, "Please specify input file and output prefix"
@sminot
sminot / read_feather_from_s3.py
Created September 2, 2018 16:36
Read feather file directly from AWS S3
import io
import boto3
import pandas as pd
def read_feather_file_from_s3(s3_url):
assert s3_url.startswith("s3://")
bucket_name, key_name = s3_url[5:].split("/", 1)
s3 = boto3.client('s3')
retr = s3.get_object(Bucket=bucket_name, Key=key_name)
@sminot
sminot / kegg_api.py
Last active September 6, 2018 16:43
Get name from KEGG API
# KEGG names
@lru_cache(maxsize=None)
def get_kegg_name(ko, timeout=30):
    """Return the name that the KEGG REST `list` endpoint reports for `ko`.

    The response body is tab-delimited; the text after the last tab, with
    trailing newlines removed, is returned. Results are memoized without
    bound, so each distinct argument combination is fetched once per process.

    Args:
        ko: Identifier appended to the `list/` endpoint (e.g. "K00975").
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests` waits indefinitely on an unresponsive host.

    Returns:
        The last tab-separated field of the response body, as a string.

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
        requests.exceptions.HTTPError: The server answered with a 5xx status.
    """
    response = requests.get(
        "http://rest.kegg.jp/list/{}".format(ko),
        timeout=timeout,
    )
    # A raised exception is never stored by lru_cache, so failing loudly on a
    # server-side error keeps a transient outage from being memoized as if
    # its error page were the entry's name. Client errors (4xx) keep the
    # previous behaviour of returning whatever body was sent.
    if response.status_code >= 500:
        response.raise_for_status()
    return response.text.split("\t")[-1].rstrip("\n")
# Example call with the identifier K00975; the return value is not stored.
get_kegg_name("K00975")
@sminot
sminot / merge_paired_end_fastq.sh
Created August 22, 2018 20:23
Merge paired-end FASTQ reads with PANDAseq
#!/bin/bash
set -e
module load PANDAseq/2.11-foss-2016b
find . -name "*_1.fq.gz" | sort -R | while read fwd; do
rev="${fwd/_1.fq.gz/_2.fq.gz}"
[[ -s "$rev" ]]
@sminot
sminot / get_assembly_name_from_ncbi.py
Created August 1, 2018 21:48
Get the name of an organism, given an assembly ID
from Bio import Entrez
import xmltodict
def get_name_from_assembly_id(assembly_name):
handle = Entrez.esearch("assembly", term=assembly_name)
search_result = xmltodict.parse("".join(handle.readlines()))
handle.close()
try:
assembly_id = search_result["eSearchResult"]["IdList"]["Id"]
except:
@sminot
sminot / read_gzip_fasta_from_s3.py
Created July 13, 2018 19:37
Read a gzipped FASTA file from S3
def read_fasta_from_s3(bucket_name, key_name, sep="\t"):
    """Yield (header, sequence) pairs from a gzip-compressed FASTA object in S3.

    The object is downloaded in full, decompressed in memory, decoded as
    UTF-8 and then handed to SimpleFastaParser.

    Args:
        bucket_name: Bucket passed to `get_object`.
        key_name: Key passed to `get_object`.
        sep: Accepted for compatibility; not used by this function.

    Yields:
        The (header, sequence) tuples produced by SimpleFastaParser.
    """
    response = boto3.client('s3').get_object(Bucket=bucket_name, Key=key_name)
    compressed = io.BytesIO(response['Body'].read())
    with gzip.GzipFile(fileobj=compressed, mode='rb') as unzipped:
        fasta_text = unzipped.read().decode('utf-8')
    yield from SimpleFastaParser(io.StringIO(fasta_text))
@sminot
sminot / parse_sra_xml.py
Created June 12, 2018 16:32
Parse XML output from SRA
#!/usr/bin/env python3
import os
import sys
import xmltodict
from collections import OrderedDict
# Validate the first command-line argument before doing any work:
# it must be present, ...
assert len(sys.argv) > 1
# ... name a path that exists, ...
assert os.path.exists(sys.argv[1])
# ... and end in ".xml".
# NOTE(review): assert statements are removed when Python runs with -O.
assert sys.argv[1].endswith(".xml")