Sam Minot (sminot): public gists
@sminot
sminot / parse_fasta.py
Created August 13, 2020 16:35
Parse FASTA
from functools import lru_cache
import os
import pandas as pd

@lru_cache(maxsize=16)
def read_fasta(fasta_fp):
    assert os.path.exists(fasta_fp)
    fasta = {}
    header = None

from functools import lru_cache
import pandas as pd

@lru_cache(maxsize=128)
def parse_gff(gff_fp):
    df = pd.read_csv(
        gff_fp,
        sep="\t",
        comment="#",
        header=None,
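The preview cuts off before the parsing loop. A minimal sketch of how read_fasta might be completed, assuming the goal is a plain {header: sequence} dict; the loop body below is illustrative, not the gist's actual code:

from functools import lru_cache
import os

@lru_cache(maxsize=16)
def read_fasta(fasta_fp):
    # Read a FASTA file into a {header: sequence} dict
    assert os.path.exists(fasta_fp)
    fasta = {}
    header = None
    with open(fasta_fp) as handle:
        for line in handle:
            line = line.rstrip("\n")
            if line.startswith(">"):
                # New record: keep only the sequence ID from the header line
                header = line[1:].split(" ")[0]
                fasta[header] = ""
            elif header is not None:
                # Append wrapped sequence lines to the current record
                fasta[header] += line
    return fasta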
@sminot
sminot / print_cloudwatch_logs_aws_batch.py
Last active June 15, 2023 16:46
Print CloudWatch logs for an AWS Batch job
#!/usr/bin/env python3
import boto3
import argparse
from datetime import datetime
parser = argparse.ArgumentParser()

# Add the arguments
parser.add_argument("job_id")
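The preview stops after the argument parser. A sketch of how the log printing itself could work, using boto3's Batch describe_jobs call to find the job's CloudWatch log stream and get_log_events to read it; the function name and output format are assumptions, not necessarily what the gist does:

import boto3
from datetime import datetime

def print_job_logs(job_id):
    # Look up the CloudWatch log stream attached to this AWS Batch job
    batch = boto3.client("batch")
    job = batch.describe_jobs(jobs=[job_id])["jobs"][0]
    log_stream = job["container"]["logStreamName"]

    # AWS Batch writes container output to the /aws/batch/job log group
    logs = boto3.client("logs")
    response = logs.get_log_events(
        logGroupName="/aws/batch/job",
        logStreamName=log_stream,
        startFromHead=True,
    )
    for event in response["events"]:
        # Event timestamps are reported in milliseconds since the epoch
        ts = datetime.fromtimestamp(event["timestamp"] / 1000)
        print(f"{ts}\t{event['message']}")

For long-running jobs, get_log_events returns results in pages, so a production version would keep calling it with the nextForwardToken it returns until no new events come back.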
@sminot
sminot / plot_geneshot_cag_summary.py
Created June 26, 2020 15:29
Plot CAG summary figures from geneshot results HDF5
# Plot the distribution of CAG sizes
def plot_cag_size(hdf_fp, pdf=None, min_size=5, alpha=0.25):
    cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")

    # Calculate the log10 size (number of genes per CAG)
    cag_annot = cag_annot.assign(
        size_log10 = cag_annot["size"].apply(np.log10)
    )

    # Filter by CAG size
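The preview ends at the size filter. One way the figure itself might be drawn, assuming a simple matplotlib histogram written to the optional PdfPages handle; a sketch under those assumptions, not the gist's actual plotting code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_cag_size(hdf_fp, pdf=None, min_size=5, alpha=0.25):
    cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")

    # Calculate the log10 size (number of genes per CAG)
    cag_annot = cag_annot.assign(
        size_log10 = cag_annot["size"].apply(np.log10)
    )

    # Filter by CAG size and plot the distribution of log10 sizes
    plot_df = cag_annot.loc[cag_annot["size"] >= min_size]
    fig, ax = plt.subplots()
    ax.hist(plot_df["size_log10"], bins=50, alpha=alpha)
    ax.set_xlabel("CAG size (log10 number of genes)")
    ax.set_ylabel("Number of CAGs")

    # Write into the multi-page PDF if one was provided
    if pdf is not None:
        pdf.savefig(fig, bbox_inches="tight")
    plt.show()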
@sminot
sminot / plot_geneshot_specimen_summary.py
Created June 26, 2020 15:00
Plot specimen summary from geneshot results HDF5
# Plot the number of genes detected and the proportion of reads aligned
def plot_specimen_summary(hdf_fp, pdf=None, alpha=0.85):
    specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")
    specimen_summary = specimen_summary.assign(
        prop_reads = specimen_summary["aligned_reads"] / specimen_summary["n_reads"]
    )
    for col_name, axis_title in [
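The preview is cut off inside the loop header. A sketch of how the loop might continue, drawing one histogram per summary metric; "n_genes" is a guessed column name and the axis titles are assumptions:

import pandas as pd
import matplotlib.pyplot as plt

def plot_specimen_summary(hdf_fp, pdf=None, alpha=0.85):
    specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")
    specimen_summary = specimen_summary.assign(
        prop_reads = specimen_summary["aligned_reads"] / specimen_summary["n_reads"]
    )

    # One histogram per summary metric
    # ("n_genes" is a guessed column name; "prop_reads" is computed above)
    for col_name, axis_title in [
        ("n_genes", "Number of genes detected"),
        ("prop_reads", "Proportion of reads aligned"),
    ]:
        fig, ax = plt.subplots()
        ax.hist(specimen_summary[col_name], bins=30, alpha=alpha)
        ax.set_xlabel(axis_title)
        ax.set_ylabel("Number of specimens")
        if pdf is not None:
            pdf.savefig(fig, bbox_inches="tight")
        plt.show()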
@sminot
sminot / import_geneshot_hdf5.py
Last active June 26, 2020 14:55
Import data from Geneshot HDF5
import os
import pandas as pd

# Read in data from the HDF store; hdf_fp is the path to the geneshot results HDF5 file
assert os.path.exists(hdf_fp)
cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")
cag_abund = pd.read_hdf(hdf_fp, "/abund/cag/wide").set_index("CAG")
corncob_df = pd.read_hdf(hdf_fp, "/stats/cag/corncob")
betta_df = pd.read_hdf(hdf_fp, "/stats/enrichment/betta")
manifest = pd.read_hdf(hdf_fp, "/manifest").set_index("specimen")
specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")
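If it's unclear which tables a particular results file contains, the store's keys can be listed before reading any of them; a small example, with the file path as a placeholder:

import pandas as pd

hdf_fp = "geneshot.results.hdf5"  # placeholder path to a geneshot output file

# Print every table key stored in the HDF5 file
with pd.HDFStore(hdf_fp, "r") as store:
    for key in store.keys():
        print(key)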
@sminot
sminot / basic_python_imports.py
Last active June 26, 2020 16:30
Typical Python import block
from functools import lru_cache
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from scipy import stats
import seaborn as sns
import os
@sminot
sminot / jupyter_nbconvert.sh
Created October 25, 2019 14:37
Execute Jupyter notebook in place
jupyter nbconvert --ExecutePreprocessor.timeout=-1 --to notebook --inplace --execute NOTEBOOK
@sminot
sminot / demultiplex_separate_reads.sh
Last active September 10, 2019 16:37
Demultiplex FASTQ files in which barcode was sequenced as a separate read
paste -d '' <(bunzip2 -c Raw_Read2_Barcodes.fq.bz2 | awk '{if(NR % 4 == 2 || NR % 4 == 0){print}else{print ""}}') <(bunzip2 -c Raw_Read1.fq.bz2) | fastx_barcode_splitter.pl --bcfile barcodes.tsv --prefix PREFIX --suffix _R1.fastq --bol
@sminot
sminot / read_csv_from_s3.py
Created June 19, 2019 16:58
Read a CSV table from AWS S3
import io
import boto3
import pandas as pd

def read_csv_from_s3(s3_url, s3=None, sep=","):
    assert s3_url.startswith("s3://")
    bucket_name, key_name = s3_url[5:].split("/", 1)
    if s3 is None:
        s3 = boto3.client('s3')
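The preview ends just after the client is created. A completed sketch using boto3's standard get_object call, plus a usage line; the bucket and key in the example URL are placeholders:

import io
import boto3
import pandas as pd

def read_csv_from_s3(s3_url, s3=None, sep=","):
    # Split the s3://bucket/key URL into its bucket and key components
    assert s3_url.startswith("s3://")
    bucket_name, key_name = s3_url[5:].split("/", 1)
    if s3 is None:
        s3 = boto3.client("s3")

    # Download the object and parse its body as a CSV table
    obj = s3.get_object(Bucket=bucket_name, Key=key_name)
    return pd.read_csv(io.BytesIO(obj["Body"].read()), sep=sep)

# Example (placeholder bucket and key):
# df = read_csv_from_s3("s3://my-bucket/path/to/table.csv")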