This may change due to length considerations. Parts in bold are available for early release from O'Reilly.
- How to Learn Robust and Reproducible Bioinformatics
from __future__ import division | |
from collections import Counter | |
from math import log | |
def entropy(seq, unit="bit"): | |
""" | |
Returns entropy of DNA sequence. | |
The entropy formula is: | |
entropy = -sum_i (log(p_i) * p_i) |
library(GenomicRanges) | |
summarizeByTile <- | |
# given a GRanges (or some sort of ranged data) object `x`, and a | |
# *corresponding* vector values to summarize `y` (these *must* | |
# correspond), calculate the summary per tile with the function `fun`. | |
# Note: this is still beta; wider tests coming, use with caution. | |
function(x, y, tiles, fun, mcol_name="y") { | |
stopifnot(length(x) == length(y)) |
This may change due to length considerations. Parts in bold are available for early release from O'Reilly.
library(ggplot2) | |
library(lubridate) | |
library(dplyr) | |
library(reshape2) | |
myname <- "@vsbuffalo" # for removing later | |
d <- read.csv("tweets.csv", header=TRUE, stringsAsFactors=FALSE) | |
extractMentions <- function(x) { | |
gsub("[^@]*(@[a-zA-Z0-9_]+).*", "\\1", x, perl=TRUE) |
# Factors are more memory efficient (if labels > few bytes), since redundant multi-byte | |
# labels are stored once in memory (as attributes), and integers keep the mapping. E.g.: | |
a = sample(paste0("chrom", c(1:22, "X", "Y")), 1e8, replace=TRUE) | |
object.size(a) | |
# 800001192 bytes | |
object.size(factor(a)) | |
# 400001744 bytes | |
# For long character vectors of repeating values, this *really* pays off. |
# Rather trivial example of how dataframe row indexing could be used. | |
# I would not use this method, but it's not terrible design — access | |
# is fast compared to say genotype == 1L. Why limit these cases | |
> genotypes <- sample(0:2, 10, replace=TRUE) | |
> freq <- 0.3 | |
> df <- data.frame(name=c("aa", "Aa", "AA"), type=c("homozygote", "heterozygote", "homozygote"), | |
freq=c((1-freq)^2, 2*freq*(1-freq), freq^2)) | |
> df | |
name type freq |
# A quick function to save a PBM (http://en.wikipedia.org/wiki/Netpbm_format) | |
# visualize *a lot* of missing data pretty quickly (outside of R). | |
writeMissingPBM <- function(x, file) { | |
dims <- dim(x) | |
x[] <- as.integer(is.na(x)) | |
con <- file(file, open="wt") | |
writeLines(sprintf("P1\n%d %d", ncol(x), nrow(x)), con) | |
write.table(x, file=con, sep=" ", col.names=FALSE, row.names=FALSE, quote=FALSE) | |
close(con) |
@HD VN:1.0 GO:none SO:coordinate | |
@SQ SN:1 LN:249250621 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1b22b98cdeb4a9304cb5d48026a85128 | |
@SQ SN:2 LN:243199373 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:a0d9851da00400dec1098a9255ac712e | |
@SQ SN:3 LN:198022430 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:fdfd811849cc2fadebc929bb925902e5 | |
@SQ SN:4 LN:191154276 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:23dccd106897542ad87d2765d28a19a1 | |
@SQ SN:5 LN:180915260 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:0740173db9ffd264d728f32784845cd7 | |
@SQ SN:6 LN:171115067 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1d3a93a248d92a729ee764823acbbc6b | |
@SQ SN:7 LN:159138663 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:618366e95 |
@HD VN:1.0 GO:none SO:coordinate | |
@SQ SN:1 LN:249250621 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1b22b98cdeb4a9304cb5d48026a85128 | |
@SQ SN:2 LN:243199373 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:a0d9851da00400dec1098a9255ac712e | |
@SQ SN:3 LN:198022430 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:fdfd811849cc2fadebc929bb925902e5 | |
@SQ SN:4 LN:191154276 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:23dccd106897542ad87d2765d28a19a1 | |
@SQ SN:5 LN:180915260 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:0740173db9ffd264d728f32784845cd7 | |
@SQ SN:6 LN:171115067 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1d3a93a248d92a729ee764823acbbc6b | |
@SQ SN:7 LN:159138663 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:618366e95 |
import sys | |
import pysam | |
from collections import Counter | |
if len(sys.argv) < 2: | |
sys.exit("usage: alnstat.py in.bam") | |
fname = sys.argv[1] | |
mode = "rb" if fname.endswith(".bam") else "r" | |
bamfile = pysam.AlignmentFile(fname, mode) |