Vince Buffalo vsbuffalo

Bioinformatics Data Skills Table of Contents

This may change due to length considerations. Parts in bold are available for early release from O'Reilly.

	from __future__ import division
	from collections import Counter
	from math import log

	def entropy(seq, unit="bit"):
	"""
	Returns entropy of DNA sequence.

	The entropy formula is:
	entropy = -sum_i (log(p_i) * p_i)

	library(GenomicRanges)

	summarizeByTile <-
	# given a GRanges (or some sort of ranged data) object `x`, and a
	# corresponding vector values to summarize `y` (these must
	# correspond), calculate the summary per tile with the function `fun`.
	# Note: this is still beta; wider tests coming, use with caution.
	function(x, y, tiles, fun, mcol_name="y") {
	stopifnot(length(x) == length(y))

	library(ggplot2)
	library(lubridate)
	library(dplyr)
	library(reshape2)
	myname <- "@vsbuffalo" # for removing later

	d <- read.csv("tweets.csv", header=TRUE, stringsAsFactors=FALSE)

	extractMentions <- function(x) {
	gsub("[^@](@[a-zA-Z0-9_]+).", "\\1", x, perl=TRUE)

	# Factors are more memory efficient (if labels > few bytes), since redundant multi-byte
	# labels are stored once in memory (as attributes), and integers keep the mapping. E.g.:

	a = sample(paste0("chrom", c(1:22, "X", "Y")), 1e8, replace=TRUE)
	object.size(a)
	# 800001192 bytes
	object.size(factor(a))
	# 400001744 bytes

	# For long character vectors of repeating values, this really pays off.

	# Rather trivial example of how dataframe row indexing could be used.
	# I would not use this method, but it's not terrible design — access
	# is fast compared to say genotype == 1L. Why limit these cases

	> genotypes <- sample(0:2, 10, replace=TRUE)
	> freq <- 0.3
	> df <- data.frame(name=c("aa", "Aa", "AA"), type=c("homozygote", "heterozygote", "homozygote"),
	freq=c((1-freq)^2, 2freq(1-freq), freq^2))
	> df
	name type freq

	# A quick function to save a PBM (http://en.wikipedia.org/wiki/Netpbm_format)
	# visualize a lot of missing data pretty quickly (outside of R).

	writeMissingPBM <- function(x, file) {
	dims <- dim(x)
	x[] <- as.integer(is.na(x))
	con <- file(file, open="wt")
	writeLines(sprintf("P1\n%d %d", ncol(x), nrow(x)), con)
	write.table(x, file=con, sep=" ", col.names=FALSE, row.names=FALSE, quote=FALSE)
	close(con)

	@HD VN:1.0 GO:none SO:coordinate
	@SQ SN:1 LN:249250621 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1b22b98cdeb4a9304cb5d48026a85128
	@SQ SN:2 LN:243199373 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:a0d9851da00400dec1098a9255ac712e
	@SQ SN:3 LN:198022430 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:fdfd811849cc2fadebc929bb925902e5
	@SQ SN:4 LN:191154276 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:23dccd106897542ad87d2765d28a19a1
	@SQ SN:5 LN:180915260 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:0740173db9ffd264d728f32784845cd7
	@SQ SN:6 LN:171115067 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1d3a93a248d92a729ee764823acbbc6b
	@SQ SN:7 LN:159138663 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:618366e95

	import sys
	import pysam
	from collections import Counter

	if len(sys.argv) < 2:
	sys.exit("usage: alnstat.py in.bam")

	fname = sys.argv[1]
	mode = "rb" if fname.endswith(".bam") else "r"
	bamfile = pysam.AlignmentFile(fname, mode)