Josh Herr jrherr

Preface

This is my recommended pipeline for assembly and annotation of small eukaryotic genomes (50 - 500 Mb).

All small scripts are available at CGP-scripts. For each external program, a link is provided.

Please cite this pipeline if you find it useful!

	# Remove Xcode together with its per-user caches and data.
	# WARNING: destructive — every `rm -rf` below deletes recursively without prompting.

	# Stop any running Xcode process before touching its files.
	killall Xcode
	# NOTE(review): `-k` presumably resets xcrun's tool-lookup cache — confirm against `man xcrun`.
	xcrun -k
	# Run the `clean` action for all targets; this must happen before Xcode.app is deleted below.
	xcodebuild -alltargets clean
	# Delete the clang module caches under the Darwin per-user cache directory
	# (both the shared path and the one suffixed with the current user name).
	rm -rf "$(getconf DARWIN_USER_CACHE_DIR)/org.llvm.clang/ModuleCache"
	rm -rf "$(getconf DARWIN_USER_CACHE_DIR)/org.llvm.clang.$(whoami)/ModuleCache"
	# Delete the Xcode application bundle itself.
	rm -rf /Applications/Xcode.app
	# Delete Xcode's per-user cache, developer data, device data and preferences file.
	rm -rf ~/Library/Caches/com.apple.dt.Xcode
	rm -rf ~/Library/Developer
	rm -rf ~/Library/MobileDevice
	rm -rf ~/Library/Preferences/com.apple.dt.Xcode.plist

	metaphlanToPhyloseq <- function(
	metaphlandir,
	metadat=NULL,
	simplify=TRUE){
	## tax is a matrix or data.frame with the table of taxonomic abundances, rows are taxa, columns are samples
	## metadat is an optional data.frame of specimen metadata, rows are samples, columns are variables
	## if simplify=TRUE, use only the most detailed level of taxa names in the final object
	## metaphlanToPhyloseq("~/Downloads/metaphlan_bugs_list")
	.getMetaphlanTree <- function(removeGCF=TRUE, simplify=TRUE){
	if (!requireNamespace("ape")) {

	# Adapted from https://stackoverflow.com/a/7267364/1036500 by Andrie de Vries

	# This is it: theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

	library(ggplot2)

	td <- expand.grid(
	hjust=c(0, 0.5, 1),
	vjust=c(0, 0.5, 1),
	angle=c(0, 45, 90),

	# Path of the cue sheet to read.
	cue_file = 'file.cue'

	# Read the whole cue sheet as a list of lines. The context manager closes the
	# file handle as soon as the read finishes (the bare open().read() left it
	# open until garbage collection).
	with open(cue_file) as cue_handle:
	    d = cue_handle.read().splitlines()

	# Containers initialised empty here; presumably filled while parsing the
	# lines in `d` — the parsing code is not visible in this excerpt.
	general = {}

	tracks = []

	current_file = None

	# Make sure pacman is available: install it when it cannot be loaded.
	if (!require("pacman")) {
	  install.packages("pacman")
	}
	# Load dplyr and venneuler through pacman.
	pacman::p_load(dplyr, venneuler)
	# Load textshape and ggforce through pacman's `p_load_current_gh()`
	# (arguments are given in `user/repo` form).
	pacman::p_load_current_gh('trinker/textshape', 'thomasp85/ggforce')


	# Draw 80 values from 0:4 with replacement (weight .6 for 0, .1 for each of
	# 1-4) and arrange them as a 20 x 4 matrix.
	x <- matrix(
	  sample(x = 0:4, size = 80, replace = TRUE, prob = c(.6, .1, .1, .1, .1)),
	  ncol = 4
	)

	# Label the four columns "A" through "D".
	colnames(x) <- LETTERS[seq_len(4)]

	library(data.table)

	## Read the species table from 'species.csv'.
	species <- fread('species.csv')

	## Keep only the rows whose NativeStatus is not NA.
	usdaplants <- species[!is.na(NativeStatus)]

	## Rename columns with data.table's setnames(): first vector holds the
	## current names, second vector the replacements (matched by position).
	setnames(
	  usdaplants,
	  c('id', 'genus', 'species', 'scientificname', 'commonname'),
	  c('betydb.species.id', 'Genus', 'Species', 'ScientificName', 'CommonName')
	)

	# Example data: 12 uniform random values in `d`, and group labels 1-4 in `g`
	# (three consecutive rows per group).
	x <- data.frame(
	  d = runif(12),
	  g = rep(1:4, each = 3)
	)

	# Four colour names — presumably one per group in `g`.
	my.col <- c("deepskyblue3", "darkorange2", "darkgray", "gold")

	# Twelve spacing values: a 1 followed by two 0.1s, repeated four times.
	spacer <- rep(c(1, 0.1, 0.1), times = 4)

	# NOTE(review): `bw` looks like a bar width — confirm against the plotting code.
	bw <- 0.8

	# Total of all spacers plus one `bw` per row of `x`, each scaled by `bw`.
	xmax <- bw * sum(spacer) + bw * nrow(x)

	# Streaming read-processing pipeline: five commands chained with pipes, so each
	# stage consumes the previous stage's output. The pipes must be plain `|`; an
	# escaped `\|` is passed to the command as a literal argument and nothing is piped.
	# NOTE(review): the bare `-` arguments presumably select stdin/stdout for each
	# tool — confirm against each tool's usage text.
	# The last stage writes stream.1.fq, stream.2.fq and orph.fq.
	interleave-reads.py file.1.fq.gz file.2.fq.gz \
	| skewer -Q 2 -t 2 -x "$HOME/Trimmomatic-0.33/adapters/TruSeq3-PE.fa" - -1 \
	| normalize-by-median.py --max-memory-usage 2e9 -C 30 -o - - \
	| trim-low-abund.py -V -M 2e9 -o - --cutoff 2 - \
	| split-paired-reads.py --output-orphaned orph.fq -1 stream.1.fq -2 stream.2.fq -