Kamil Slowikowski slowkow

Let's start with a mapping of Ensembl gene identifiers to Entrez gene identifiers:

xs <- structure(
  c("7105", "64102", "9093", "9093"),
  .Names = c("ENSG00000000003", "ENSG00000000005", "ENSG00000103423", "ENSG00000276726")
)
xs

	#' Convert counts to transcripts per million (TPM).
	#'
	#' Convert a numeric matrix of features (rows) and conditions (columns) with
	#' raw feature counts to transcripts per million.
	#'
	#' Lior Pachter. Models for transcript quantification from RNA-Seq.
	#' arXiv:1104.3889v2
	#'
	#' Wagner, et al. Measurement of mRNA abundance using RNA-seq data:
	#' RPKM measure is inconsistent among samples. Theory Biosci. 24 July 2012.

	# RPKM versus TPM
	#
	# RPKM and TPM are both normalized for library size and gene length.
	#
	# RPKM is not comparable across different samples.
	#
	# For more details, see: http://blog.nextgenetics.net/?e=51

	rpkm <- function(counts, lengths) {
	rate <- counts / lengths

	library(ggplot2)
	library(FField)

	# You'll have to play with repulsion, cex.x, and cex.y to get satisfactory results.
	plot_text <- function(x, y, label, repulsion = 1, cex.x = 110, cex.y = 40) {
	dat <- data.frame(xpos = x, ypos = y, label = label)
	dat$label <- as.character(dat$label)

	# Use the FField package to repel the text labels away from each other.
	dat <- cbind(

	#!/usr/bin/env bash
	# bigWigRegions
	# Kamil Slowikowski
	#
	# Depends on bigWigToBedGraph: http://hgdownload.cse.ucsc.edu/admin/exe/

	if [[ $# -ne 2 ]]; then
	echo "bigWigRegions - Print bigWig data for each region in a bed file."
	echo -e "usage:\n bigWigRegions in.bigWig in.bed > out.bedGraph"
	echo " bigWigRegions in.bigWig <(zcat in.bed.gz) > out.bedGraph"

	#!/usr/bin/env bash
	# sra2srr.sh
	#
	# Example
	# -------
	# To download all of the read archive files for SRP012001:
	# sra2srr.sh SRP012001 \| while read srr; do prefetch $srr; done
	#
	# For 'esearch', 'efetch', 'xtract', you must install Entrez Direct:
	# http://www.ncbi.nlm.nih.gov/books/NBK179288/

	#!/usr/bin/env bash
	# reads_per_seq
	#
	# Run 8 jobs in parallel on many BAM files:
	#
	# parallel -j8 reads_per_seq ::: *.bam

	set -eou pipefail

	if [[ $# -ne 1 \|\| "$1" != *.bam ]]; then

	#' Make a matrix suitable for use with RUVSeq methods such as RUVs().
	#'
	#' Each row in the returned matrix corresponds to a set of replicate samples.
	#' The number of columns is the size of the largest set of replicates; rows for
	#' smaller sets are padded with -1 values.
	#'
	#' @param xs A vector indicating membership in a group.
	#' @seealso RUVSeq::RUVs
	#' @example
	#' makeGroups(c("A","B","B","C","C","D","D","D","A"))

	#!/usr/bin/env bash
	# drop_long_skipped_reads.sh
	# Kamil Slowikowski
	# July 19, 2015
	#
	# Drop reads from a BAM file if they have long skipped regions.
	#
	# Example:
	#
	# ./drop_long_skipped_reads.sh FILE.bam 1000000