Skip to content

Instantly share code, notes, and snippets.

View brentp's full-sized avatar

Brent Pedersen brentp

View GitHub Profile
@brentp
brentp / af_lim.py
Last active December 7, 2015 23:04
calculate confidence limits of a proportion (an allele frequency)
from math import sqrt
def jeffreys_interval(af, n_samples, alpha=0.05):
# https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
"""
>>> jeffreys_interval(0.2, 5) #doctest: +ELLIPSIS
(0.02251..., 0.6286...)
>>> jeffreys_interval(0.2, 10) #doctest: +ELLIPSIS
(0.0440..., 0.5027...)
@brentp
brentp / inseq.py
Last active September 29, 2015 14:38
expand <DEL>, etc into the sequence field so we can do graph stuff.
from __future__ import print_function
from pyfaidx import Fasta
from cyvcf2 import VCF
import string
import re
import sys
try: # 2.x
maketrans = string.maketrans
except AttributeError: # 3.x
@brentp
brentp / crap2ped.py
Last active July 22, 2016 19:27
convert a crap CSV to a ped file with some crappy code
import csv
import re
import sys
import os.path
import string
sex_replace = {'m': 1, 'male': 1, 'female': 2, 'f': 2}
phenotype_replace = {'affected': 2, 'unaffected': 1}
if False:
with-bcolz time (seconds) bcolz-time md5sum num_lines filter
True 2.8 0.62 5281fecb8571cc7c8e832e5682e865bb 16778 (gt_phred_ll_homalt).(*).(==0).(all)
False 155.2 NA 5281fecb8571cc7c8e832e5682e865bb 16778 (gt_phred_ll_homalt).(*).(==0).(all)
True 3.1 0.74 b2b75c9a56e128410a0e19708f096e2f 16750 ((gt_phred_ll_homalt).().(==0).(all)) and (gt_types).().(==HOM_ALT).(all)
False 212.8 NA b2b75c9a56e128410a0e19708f096e2f 16750 ((gt_phred_ll_homalt).().(==0).(all)) and (gt_types).().(==HOM_ALT).(all)
True 1.4 0.42 c4885f35d03bed862d1e9d0ece71bc7c 2367 (gt_depths.{dad} > 20 and gt_depths.{mom} > 20 and gt_depths.{kid} > 20) and (gt_types.{dad} == HOM_REF and gt_types.{mom} == HOM_REF and gt_types.{kid} != HOM_REF)
False 203.0 NA c4885f35d03bed862d1e9d0ece71bc7c 2367 (gt_depths.{dad} > 20 and gt_depths.{mom} > 20 and gt_depths.{kid} > 20) and (gt_types.{dad} == HO
@brentp
brentp / format.md
Last active August 29, 2015 14:19
scratch

$ gemini de_novo --columns "chrom, start, end, ref, alt" test.de_novo.db --min-kindreds 1 --gt-pl-max 1000 | cols

chrom  start      end        ref  alt  gene    variant_id    family_id  family_members                                                     family_genotypes  affected_samples  family_count
chr10  48003991   48003992   C    T    ASAH2C  variant_id:2  2          2_dad(dad;unaffected),2_mom(mom;unaffected),2_kid(child;affected)  C/C,C/C,C/T       2_kid             2
chr10  48004991   48004992   C    T    ASAH2C  variant_id:3  3          3_dad(dad;unaffected),3_mom(mom;unaffected),3_kid(child;affected)  C/C,C/C,C/T       3_kid             2
chr10  135336655  135336656  G    A    SPRN    variant_id:4  2          2_dad(dad;unaffected),2_mom(mom;unaffected),2_kid(child;affected)  G/G,G/G,G/A       2_kid             2
chr10  135336655  135336656  G    A    SPRN    variant_id:4  1          1_dad(dad;unaffected),1_mom(mom;unaffected),1_kid(child;affected)  G/G,G/G,G/A       1_kid             2
chr10  135
from bx.bbi import bigwig_file
from bx.bbi import bigbed_file
print bigwig_file
import time
from hashlib import md5
prefix = '/data/gemini/data/hg19_fitcons_fc-i6-0_V1-01'
print "before bw: 8.6 seconds"
@brentp
brentp / preprocess.py
Last active August 29, 2015 14:18
preprocess a (non GATK/Freebayes) VCF to set the ref, alt and total depth tags so that gemini can use it.
"""
This will standardize input file genotype fields to have AD=REF,ALT1,ALT2 ... (a
la GTK).
For platypus: --depth NR --alt-depth NV
varscan: --ref-depth RD --alt-depth AD
samtools: --ref-depth DP4 (untested)
"""
import argparse
@brentp
brentp / compare.sh
Created March 11, 2015 21:00
compare different variant normalization methods
set -eo pipefail
REF=human_g1k_v37_decoy.fasta
VCF=Mills_and_1000G_gold_standard.indels.b37.vcf.gz
VCF=/usr/local/src/gemini/test/test.mult-alts.fb.vep.vcf.gz
java -Xmx4G -jar /usr/local/src/gatk/GenomeAnalysisTK.jar -T LeftAlignAndTrimVariants --splitMultiallelics --trimAlleles \
-R $REF \
--variant $VCF -o gatk.norm.vcf
@brentp
brentp / sorted_random.py
Last active October 10, 2020 01:41
generate sorted random numbers
"""
Generate sorted random numbers.
method from: http://repository.cmu.edu/cgi/viewcontent.cgi?article=3483&context=compsci
Has same interface as random module except it takes N as the first argument
for the number of numbers to generate.
"""
from random import random as rand, seed
def random(N):
@brentp
brentp / irelate.go
Last active August 29, 2015 14:10
Streaming relation (overlap, distance, KNN) testing of (any number of) sorted files of intervals.
package main
import (
"bufio"
"compress/gzip"
"container/heap"
"fmt"
"io"
"os"
"strconv"