Skip to content

Instantly share code, notes, and snippets.

View sminot's full-sized avatar

Sam Minot sminot

View GitHub Profile
@sminot
sminot / plot_geneshot_cag_summary.py
Created June 26, 2020 15:29
Plot CAG summary figures from geneshot results HDF5
# Plot the distribution of CAG sizes
def plot_cag_size(hdf_fp, pdf=None, min_size=5, alpha=0.25):
cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")
# Calculate the log10 size (number of genes per CAG)
cag_annot = cag_annot.assign(
size_log10 = cag_annot["size"].apply(np.log10)
)
# Filter by CAG size
@sminot
sminot / plot_geneshot_specimen_summary.py
Created June 26, 2020 15:00
Plot specimen summary from geneshot results HDF5
# Plot the number of genes detected and the proportion of reads aligned
def plot_specimen_summary(hdf_fp, pdf=None, alpha = 0.85):
specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")
specimen_summary = specimen_summary.assign(
prop_reads = specimen_summary["aligned_reads"] / specimen_summary["n_reads"]
)
for col_name, axis_title in [
@sminot
sminot / import_geneshot_hdf5.py
Last active June 26, 2020 14:55
Import data from Geneshot HDF5
# Read in data from the HDF store
assert os.path.exists(hdf_fp)
cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")
cag_abund = pd.read_hdf(hdf_fp, "/abund/cag/wide").set_index("CAG")
corncob_df = pd.read_hdf(hdf_fp, "/stats/cag/corncob")
betta_df = pd.read_hdf(hdf_fp, "/stats/enrichment/betta")
manifest = pd.read_hdf(hdf_fp, "/manifest").set_index("specimen")
specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")
@sminot
sminot / basic_python_imports.py
Last active June 26, 2020 16:30
Typical Python import block
from functools import lru_cache
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from scipy import stats
import seaborn as sns
import os
@sminot
sminot / jupyter_nbconvert.sh
Created October 25, 2019 14:37
Execute Jupyter notebook in place
jupyter nbconvert --ExecutePreprocessor.timeout=-1 --to notebook --inplace --execute NOTEBBOOK
@sminot
sminot / demultiplex_separate_reads.sh
Last active September 10, 2019 16:37
Demultiplex FASTQ files in which barcode was sequenced as a separate read
paste -d '' <(bunzip2 -c Raw_Read2_Barcodes.fq.bz2 | awk '{if(NR % 4 == 2 || NR % 4 == 4){print}else{print ""}}') <(bunzip2 -c Raw_Read1.fq.bz2) | fastx_barcode_splitter.pl --bcfile barcodes.tsv --prefix PREFIX --suffix _R1.fastq --bol
@sminot
sminot / read_csv_from_s3.py
Created June 19, 2019 16:58
Read a CSV table from AWS S3
import io
import boto3
import pandas as pd
def read_csv_from_s3(s3_url, s3=None, sep=","):
assert s3_url.startswith("s3://")
bucket_name, key_name = s3_url[5:].split("/", 1)
if s3 is None:
s3 = boto3.client('s3')
@sminot
sminot / boto2_aws_s3_ls.py
Last active March 28, 2019 21:44
List contents of 'folder' in S3
import boto3
def aws_s3_ls(bucket, prefix):
conn = boto3.client('s3')
fps = []
r = conn.list_objects_v2(
Bucket=bucket,
Prefix=prefix
@sminot
sminot / glimmer_gff_to_fasta.py
Last active February 25, 2019 19:00
GFF to Protein FASTA
#!/usr/bin/env python
"""Convert GlimmerHMM GFF3 gene predictions into protein sequences.
This works with the GlimmerHMM GFF3 output format:
##gff-version 3
##sequence-region Contig5.15 1 47390
Contig5.15 GlimmerHMM mRNA 323 325 . + . ID=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1
Contig5.15 GlimmerHMM CDS 323 325 . + 0 ID=Contig5.15.cds1.1;Parent=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1;Note=final-exon
@sminot
sminot / delete_versioned_s3_folder.sh
Created January 29, 2019 17:51
Delete all files and versioned file history from an S3 folder
#!/bin/bash
set -e
bucket=$1
prefix=$2
(( ${#bucket} > 0 ))
(( ${#prefix} > 0 ))