Sam Minot sminot

Microbiome scientist at the Fred Hutchinson Cancer Research Center

58 followers · 3 following

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

sminot / parse_fasta.py

Created August 13, 2020 16:35

Parse FASTA

	from functools import lru_cache
	import os
	import pandas as pd

	@lru_cache(maxsize=16)
	def read_fasta(fasta_fp):
	assert os.path.exists(fasta_fp)

	fasta = {}
	header = None

sminot / parse_gff.py

Created August 13, 2020 16:34

	from functools import lru_cache
	import pandas as pd

	@lru_cache(maxsize=128)
	def parse_gff(gff_fp):
	df = pd.read_csv(
	gff_fp,
	sep="\t",
	comment="#",
	header=None,

sminot / print_cloudwatch_logs_aws_batch.py

Last active June 15, 2023 16:46

Print CloudWatch logs for an AWS Batch job

	#!/usr/bin/env python3

	import boto3
	import argparse
	from datetime import datetime

	parser = argparse.ArgumentParser()
	parser.add_argument("job_id")

	# Add the arguments

sminot / plot_geneshot_cag_summary.py

Created June 26, 2020 15:29

Plot CAG summary figures from geneshot results HDF5

	# Plot the distribution of CAG sizes
	def plot_cag_size(hdf_fp, pdf=None, min_size=5, alpha=0.25):
	cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")

	# Calculate the log10 size (number of genes per CAG)
	cag_annot = cag_annot.assign(
	size_log10 = cag_annot["size"].apply(np.log10)
	)

	# Filter by CAG size

sminot / plot_geneshot_specimen_summary.py

Created June 26, 2020 15:00

Plot specimen summary from geneshot results HDF5

	# Plot the number of genes detected and the proportion of reads aligned
	def plot_specimen_summary(hdf_fp, pdf=None, alpha = 0.85):

	specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")

	specimen_summary = specimen_summary.assign(
	prop_reads = specimen_summary["aligned_reads"] / specimen_summary["n_reads"]
	)

	for col_name, axis_title in [

sminot / import_geneshot_hdf5.py

Last active June 26, 2020 14:55

Import data from Geneshot HDF5

	# Read in data from the HDF store
	assert os.path.exists(hdf_fp)
	cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")
	cag_abund = pd.read_hdf(hdf_fp, "/abund/cag/wide").set_index("CAG")
	corncob_df = pd.read_hdf(hdf_fp, "/stats/cag/corncob")
	betta_df = pd.read_hdf(hdf_fp, "/stats/enrichment/betta")
	manifest = pd.read_hdf(hdf_fp, "/manifest").set_index("specimen")
	specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")

sminot / basic_python_imports.py

Last active June 26, 2020 16:30

Typical Python import block

	from functools import lru_cache
	from collections import defaultdict
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from matplotlib.backends.backend_pdf import PdfPages
	import matplotlib.patches as mpatches
	from scipy import stats
	import seaborn as sns
	import os

sminot / jupyter_nbconvert.sh

Created October 25, 2019 14:37

Execute Jupyter notebook in place

jupyter nbconvert --ExecutePreprocessor.timeout=-1 --to notebook --inplace --execute NOTEBBOOK

sminot / demultiplex_separate_reads.sh

Last active September 10, 2019 16:37

Demultiplex FASTQ files in which barcode was sequenced as a separate read

paste -d '' <(bunzip2 -c Raw_Read2_Barcodes.fq.bz2 | awk '{if(NR % 4 == 2 || NR % 4 == 4){print}else{print ""}}') <(bunzip2 -c Raw_Read1.fq.bz2) | fastx_barcode_splitter.pl --bcfile barcodes.tsv --prefix PREFIX --suffix _R1.fastq --bol

sminot / read_csv_from_s3.py

Created June 19, 2019 16:58

Read a CSV table from AWS S3

	import io
	import boto3
	import pandas as pd

	def read_csv_from_s3(s3_url, s3=None, sep=","):
	assert s3_url.startswith("s3://")
	bucket_name, key_name = s3_url[5:].split("/", 1)

	if s3 is None:
	s3 = boto3.client('s3')

Newer Older