Sam Minot (sminot): public gists
@sminot
sminot / parse_fasta.py
Created August 13, 2020 16:35
Parse FASTA
from functools import lru_cache
import os
import pandas as pd

@lru_cache(maxsize=16)
def read_fasta(fasta_fp):
    assert os.path.exists(fasta_fp)
    fasta = {}
    header = None

from functools import lru_cache
import pandas as pd

@lru_cache(maxsize=128)
def parse_gff(gff_fp):
    df = pd.read_csv(
        gff_fp,
        sep="\t",
        comment="#",
        header=None,
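The preview cuts off before the parsing loop. A minimal sketch of how read_fasta might be completed, assuming the goal is a plain {header: sequence} dict; the loop body below is illustrative, not the gist's actual code:

from functools import lru_cache
import os

@lru_cache(maxsize=16)
def read_fasta(fasta_fp):
    # Read a FASTA file into a {header: sequence} dict
    assert os.path.exists(fasta_fp)
    fasta = {}
    header = None
    with open(fasta_fp) as handle:
        for line in handle:
            line = line.rstrip("\n")
            if line.startswith(">"):
                # New record: keep only the sequence ID from the header line
                header = line[1:].split(" ")[0]
                fasta[header] = ""
            elif header is not None:
                # Append wrapped sequence lines to the current record
                fasta[header] += line
    return fasta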
@sminot
sminot / print_cloudwatch_logs_aws_batch.py
Last active June 15, 2023 16:46
Print CloudWatch logs for an AWS Batch job
#!/usr/bin/env python3
import boto3
import argparse
from datetime import datetime
parser = argparse.ArgumentParser()

# Add the arguments
parser.add_argument("job_id")
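The preview stops after the argument parser. A sketch of how the log printing itself could work, using boto3's Batch describe_jobs call to find the job's CloudWatch log stream and get_log_events to read it; the function name and output format are assumptions, not necessarily what the gist does:

import boto3
from datetime import datetime

def print_job_logs(job_id):
    # Look up the CloudWatch log stream attached to this AWS Batch job
    batch = boto3.client("batch")
    job = batch.describe_jobs(jobs=[job_id])["jobs"][0]
    log_stream = job["container"]["logStreamName"]

    # AWS Batch writes container output to the /aws/batch/job log group
    logs = boto3.client("logs")
    response = logs.get_log_events(
        logGroupName="/aws/batch/job",
        logStreamName=log_stream,
        startFromHead=True,
    )
    for event in response["events"]:
        # Event timestamps are reported in milliseconds since the epoch
        ts = datetime.fromtimestamp(event["timestamp"] / 1000)
        print(f"{ts}\t{event['message']}")

For long-running jobs, get_log_events returns results in pages, so a production version would keep calling it with the nextForwardToken it returns until no new events come back.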
@sminot
sminot / plot_geneshot_cag_summary.py
Created June 26, 2020 15:29
Plot CAG summary figures from geneshot results HDF5
# Plot the distribution of CAG sizes
def plot_cag_size(hdf_fp, pdf=None, min_size=5, alpha=0.25):
    cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")

    # Calculate the log10 size (number of genes per CAG)
    cag_annot = cag_annot.assign(
        size_log10 = cag_annot["size"].apply(np.log10)
    )

    # Filter by CAG size
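The preview ends at the size filter. One way the figure itself might be drawn, assuming a simple matplotlib histogram written to the optional PdfPages handle; a sketch under those assumptions, not the gist's actual plotting code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_cag_size(hdf_fp, pdf=None, min_size=5, alpha=0.25):
    cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")

    # Calculate the log10 size (number of genes per CAG)
    cag_annot = cag_annot.assign(
        size_log10 = cag_annot["size"].apply(np.log10)
    )

    # Filter by CAG size and plot the distribution of log10 sizes
    plot_df = cag_annot.loc[cag_annot["size"] >= min_size]
    fig, ax = plt.subplots()
    ax.hist(plot_df["size_log10"], bins=50, alpha=alpha)
    ax.set_xlabel("CAG size (log10 number of genes)")
    ax.set_ylabel("Number of CAGs")

    # Write into the multi-page PDF if one was provided
    if pdf is not None:
        pdf.savefig(fig, bbox_inches="tight")
    plt.show()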
@sminot
sminot / plot_geneshot_specimen_summary.py
Created June 26, 2020 15:00
Plot specimen summary from geneshot results HDF5
# Plot the number of genes detected and the proportion of reads aligned
def plot_specimen_summary(hdf_fp, pdf=None, alpha=0.85):
    specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")
    specimen_summary = specimen_summary.assign(
        prop_reads = specimen_summary["aligned_reads"] / specimen_summary["n_reads"]
    )
    for col_name, axis_title in [
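The preview is cut off inside the loop header. A sketch of how the loop might continue, drawing one histogram per summary metric; "n_genes" is a guessed column name and the axis titles are assumptions:

import pandas as pd
import matplotlib.pyplot as plt

def plot_specimen_summary(hdf_fp, pdf=None, alpha=0.85):
    specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")
    specimen_summary = specimen_summary.assign(
        prop_reads = specimen_summary["aligned_reads"] / specimen_summary["n_reads"]
    )

    # One histogram per summary metric
    # ("n_genes" is a guessed column name; "prop_reads" is computed above)
    for col_name, axis_title in [
        ("n_genes", "Number of genes detected"),
        ("prop_reads", "Proportion of reads aligned"),
    ]:
        fig, ax = plt.subplots()
        ax.hist(specimen_summary[col_name], bins=30, alpha=alpha)
        ax.set_xlabel(axis_title)
        ax.set_ylabel("Number of specimens")
        if pdf is not None:
            pdf.savefig(fig, bbox_inches="tight")
        plt.show()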
@sminot
sminot / import_geneshot_hdf5.py
Last active June 26, 2020 14:55
Import data from Geneshot HDF5
import os
import pandas as pd

# Read in data from the HDF store; hdf_fp is the path to the geneshot results HDF5 file
assert os.path.exists(hdf_fp)
cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")
cag_abund = pd.read_hdf(hdf_fp, "/abund/cag/wide").set_index("CAG")
corncob_df = pd.read_hdf(hdf_fp, "/stats/cag/corncob")
betta_df = pd.read_hdf(hdf_fp, "/stats/enrichment/betta")
manifest = pd.read_hdf(hdf_fp, "/manifest").set_index("specimen")
specimen_summary = pd.read_hdf(hdf_fp, "/summary/all").set_index("specimen")
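If it's unclear which tables a particular results file contains, the store's keys can be listed before reading any of them; a small example, with the file path as a placeholder:

import pandas as pd

hdf_fp = "geneshot.results.hdf5"  # placeholder path to a geneshot output file

# Print every table key stored in the HDF5 file
with pd.HDFStore(hdf_fp, "r") as store:
    for key in store.keys():
        print(key)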
@sminot
sminot / basic_python_imports.py
Last active June 26, 2020 16:30
Typical Python import block
from functools import lru_cache
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from scipy import stats
import seaborn as sns
import os
@sminot
sminot / jupyter_nbconvert.sh
Created October 25, 2019 14:37
Execute Jupyter notebook in place
jupyter nbconvert --ExecutePreprocessor.timeout=-1 --to notebook --inplace --execute NOTEBOOK
@sminot
sminot / demultiplex_separate_reads.sh
Last active September 10, 2019 16:37
Demultiplex FASTQ files in which barcode was sequenced as a separate read
paste -d '' <(bunzip2 -c Raw_Read2_Barcodes.fq.bz2 | awk '{if(NR % 4 == 2 || NR % 4 == 0){print}else{print ""}}') <(bunzip2 -c Raw_Read1.fq.bz2) | fastx_barcode_splitter.pl --bcfile barcodes.tsv --prefix PREFIX --suffix _R1.fastq --bol
@sminot
sminot / read_csv_from_s3.py
Created June 19, 2019 16:58
Read a CSV table from AWS S3
import io
import boto3
import pandas as pd

def read_csv_from_s3(s3_url, s3=None, sep=","):
    assert s3_url.startswith("s3://")
    bucket_name, key_name = s3_url[5:].split("/", 1)
    if s3 is None:
        s3 = boto3.client('s3')
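The preview ends just after the client is created. A completed sketch using boto3's standard get_object call, plus a usage line; the bucket and key in the example URL are placeholders:

import io
import boto3
import pandas as pd

def read_csv_from_s3(s3_url, s3=None, sep=","):
    # Split the s3://bucket/key URL into its bucket and key components
    assert s3_url.startswith("s3://")
    bucket_name, key_name = s3_url[5:].split("/", 1)
    if s3 is None:
        s3 = boto3.client("s3")

    # Download the object and parse its body as a CSV table
    obj = s3.get_object(Bucket=bucket_name, Key=key_name)
    return pd.read_csv(io.BytesIO(obj["Body"].read()), sep=sep)

# Example (placeholder bucket and key):
# df = read_csv_from_s3("s3://my-bucket/path/to/table.csv")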