Last active
March 14, 2018 20:05
-
-
Save arq5x/657a4c36bc6195b041b0 to your computer and use it in GitHub Desktop.
breast-cancer-evolution-cnv-segmentation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# bedtools --version | |
# bedtools v2.24.0-14-gaa11ef9 | |
######################################################## | |
# Create a BED file of 5kb windows with 2.5kb overlap | |
# tiling build 37 (hg19) of the human genome | |
######################################################## | |
bedtools makewindows -g hg19.txt -w 5000 -s 2500 > hg19.w5k.s2.5k.bedg | |
######################################################## | |
# Count the number of reads aligned to each overlapping | |
# 5kb genomic window for each sample. | |
######################################################## | |
bedtools intersect -a hg19.w5k.s2.5k.bedg \ | |
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B0.bam \ | |
-sorted -c -g hg19.txt \ | |
> B0.w5k.s2.5k.conc.bedg | |
bedtools intersect -a hg19.w5k.s2.5k.bedg \ | |
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B1.bam \ | |
-sorted -c -g hg19.txt \ | |
> B1.w5k.s2.5k.conc.bedg | |
bedtools intersect -a hg19.w5k.s2.5k.bedg \ | |
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B2.bam \ | |
-sorted -c -g hg19.txt \ | |
> B2.w5k.s2.5k.conc.bedg | |
bedtools intersect -a hg19.w5k.s2.5k.bedg \ | |
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B3.bam \ | |
-sorted -c -g hg19.txt \ | |
> B3.w5k.s2.5k.conc.bedg | |
bedtools intersect -a hg19.w5k.s2.5k.bedg \ | |
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B4.bam \ | |
-sorted -c -g hg19.txt \ | |
> B4.w5k.s2.5k.conc.bedg | |
######################################################## | |
# restrict depth calls to chrom 1 through X | |
######################################################## | |
awk 'length($1) <=2' B0.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B0.w5k.s2.5k.conc.chr1-chrX.bedg | |
awk 'length($1) <=2' B1.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B1.w5k.s2.5k.conc.chr1-chrX.bedg | |
awk 'length($1) <=2' B2.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B2.w5k.s2.5k.conc.chr1-chrX.bedg | |
awk 'length($1) <=2' B3.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B3.w5k.s2.5k.conc.chr1-chrX.bedg | |
awk 'length($1) <=2' B4.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B4.w5k.s2.5k.conc.chr1-chrX.bedg | |
######################################################## | |
# Prefix chromosomes with chr | |
######################################################## | |
awk '{print "chr"$0}' B3.w5k.s2.5k.conc.chr1-chrX.bedg > B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg | |
awk '{print "chr"$0}' B4.w5k.s2.5k.conc.chr1-chrX.bedg > B4.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg | |
awk '{print "chr"$0}' B2.w5k.s2.5k.conc.chr1-chrX.bedg > B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg | |
awk '{print "chr"$0}' B1.w5k.s2.5k.conc.chr1-chrX.bedg > B1.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg | |
awk '{print "chr"$0}' B0.w5k.s2.5k.conc.chr1-chrX.bedg > B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg | |
######################################################## | |
# Normalize each samples coverage by GC content | |
# Uses the codachrom package: | |
# https://github.com/arq5x/codachrom | |
######################################################## | |
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa | |
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B1.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa | |
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa | |
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa | |
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B4.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa | |
######################################################## | |
# Compute log2 ratios of one sample's (-1) normalized | |
# coverage versus another's (-2). | |
######################################################## | |
python codachrom/log_ratio.py -1 B1.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B1vB0.log2.bedg & | |
python codachrom/log_ratio.py -1 B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B2vB0.log2.bedg & | |
python codachrom/log_ratio.py -1 B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B3vB0.log2.bedg & | |
python codachrom/log_ratio.py -1 B4.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B4vB0.log2.bedg & | |
python codachrom/log_ratio.py -1 B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B1.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B2vB1.log2.bedg & | |
python codachrom/log_ratio.py -1 B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B3vB2.log2.bedg & | |
python codachrom/log_ratio.py -1 B4.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B4vB3.log2.bedg & | |
# segment the log2 ratios using DNAcopy | |
# see details in seg.R script | |
######################################################## | |
# convert DNAcopy output to bedgraph | |
######################################################## | |
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b1b0.txt > segs.b1b0.bedg | |
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b2b0.txt > segs.b2b0.bedg | |
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b3b0.txt > segs.b3b0.bedg | |
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b4b0.txt > segs.b4b0.bedg | |
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b2b1.txt > segs.b2b1.bedg | |
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b3b2.txt > segs.b3b2.bedg | |
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b4b3.txt > segs.b4b3.bedg | |
######################################################## | |
# Identify segments with a mean log2 ratio of > 0.4 | |
# or less than -0.4 and that are at least 15kb in size | |
######################################################## | |
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b1b0.bedg > segs.b1b0.cnvs.bedg | |
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b2b0.bedg > segs.b2b0.cnvs.bedg | |
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b3b0.bedg > segs.b3b0.cnvs.bedg | |
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b4b0.bedg > segs.b4b0.cnvs.bedg | |
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b2b1.bedg > segs.b2b1.cnvs.bedg | |
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b3b2.bedg > segs.b3b2.cnvs.bedg | |
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b4b3.bedg > segs.b4b3.cnvs.bedg | |
######################################################## | |
# merge segments that are within 250Kb of one another. | |
######################################################## | |
sort -k1,1 -k2,2n segs.b3b2.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b3b2.cnvs.merged.bedg | |
sort -k1,1 -k2,2n segs.b2b1.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b2b1.cnvs.merged.bedg | |
sort -k1,1 -k2,2n segs.b1b0.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b1b0.cnvs.merged.bedg | |
sort -k1,1 -k2,2n segs.b2b0.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b2b0.cnvs.merged.bedg | |
sort -k1,1 -k2,2n segs.b3b0.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b3b0.cnvs.merged.bedg | |
sort -k1,1 -k2,2n segs.b4b0.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b4b0.cnvs.merged.bedg | |
sort -k1,1 -k2,2n segs.b4b3.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b4b3.cnvs.merged.bedg | |
######################################################## | |
# get protein coding, known GENCODE genes in | |
# BED format (downloaded from UCSC) | |
######################################################## | |
grep protein_coding gencode.bed | grep KNOWN | sort -k1,1 -k2,2n > gencode.prot.known.sorted.bed | |
######################################################## | |
# collect a gene list for each putative CNV | |
######################################################## | |
bedtools intersect -a segs.b2b1.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b2b1.cnvs.merged.genelist.bedgraph | |
bedtools intersect -a segs.b1b0.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b1b0.cnvs.merged.genelist.bedgraph | |
bedtools intersect -a segs.b2b0.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b2b0.cnvs.merged.genelist.bedgraph | |
bedtools intersect -a segs.b3b0.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b3b0.cnvs.merged.genelist.bedgraph | |
bedtools intersect -a segs.b4b0.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b4b0.cnvs.merged.genelist.bedgraph | |
bedtools intersect -a segs.b3b2.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b3b2.cnvs.merged.genelist.bedgraph | |
bedtools intersect -a segs.b4b3.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b4b3.cnvs.merged.genelist.bedgraph | |
######################################################## | |
# bgzip and tabix for IGV | |
######################################################## | |
bgzip segs.b1b0.cnvs.merged.genelist.bedgraph | |
bgzip segs.b2b1.cnvs.merged.genelist.bedgraph | |
bgzip segs.b3b2.cnvs.merged.genelist.bedgraph | |
bgzip segs.b4b3.cnvs.merged.genelist.bedgraph | |
tabix -p bed segs.b1b0.cnvs.merged.genelist.bedgraph.gz | |
tabix -p bed segs.b2b1.cnvs.merged.genelist.bedgraph.gz | |
tabix -p bed segs.b3b2.cnvs.merged.genelist.bedgraph.gz | |
tabix -p bed segs.b4b3.cnvs.merged.genelist.bedgraph.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("DNAcopy") | |
b1b0 <- read.table('B1vB0.log2.bedg') | |
b2b0 <- read.table('B2vB0.log2.bedg') | |
b3b0 <- read.table('B3vB0.log2.bedg') | |
b4b0 <- read.table('B4vB0.log2.bedg') | |
b2b1 <- read.table('B2vB1.log2.bedg') | |
b3b2 <- read.table('B3vB2.log2.bedg') | |
b4b3 <- read.table('B4vB3.log2.bedg') | |
########################################### | |
# Load into DNAcopy objects and smooth | |
########################################### | |
CNA.object.b1b0 <- CNA( genomdat = b1b0[,4], chrom = b1b0[,1], maploc = b1b0[,2]+(b1b0[,3]-b1b0[,2])/2, data.type = 'logratio', sampleid="b1b0") | |
CNA.object.b2b0 <- CNA( genomdat = b2b0[,4], chrom = b2b0[,1], maploc = b2b0[,2]+(b2b0[,3]-b2b0[,2])/2, data.type = 'logratio', sampleid="b2b0") | |
CNA.object.b3b0 <- CNA( genomdat = b3b0[,4], chrom = b3b0[,1], maploc = b3b0[,2]+(b3b0[,3]-b3b0[,2])/2, data.type = 'logratio', sampleid="b3b0") | |
CNA.object.b4b0 <- CNA( genomdat = b4b0[,4], chrom = b4b0[,1], maploc = b4b0[,2]+(b4b0[,3]-b4b0[,2])/2, data.type = 'logratio', sampleid="b4b0") | |
CNA.object.b2b1 <- CNA( genomdat = b2b1[,4], chrom = b2b1[,1], maploc = b2b1[,2]+(b2b1[,3]-b2b1[,2])/2, data.type = 'logratio', sampleid="b2b1") | |
CNA.object.b3b2 <- CNA( genomdat = b3b2[,4], chrom = b3b2[,1], maploc = b3b2[,2]+(b3b2[,3]-b3b2[,2])/2, data.type = 'logratio', sampleid="b3b2") | |
CNA.object.b4b3 <- CNA( genomdat = b4b3[,4], chrom = b4b3[,1], maploc = b4b3[,2]+(b4b3[,3]-b4b3[,2])/2, data.type = 'logratio', sampleid="b4b3") | |
CNA.smoothed.b1b0 <- smooth.CNA(CNA.object.b1b0) | |
CNA.smoothed.b2b0 <- smooth.CNA(CNA.object.b2b0) | |
CNA.smoothed.b3b0 <- smooth.CNA(CNA.object.b3b0) | |
CNA.smoothed.b4b0 <- smooth.CNA(CNA.object.b4b0) | |
CNA.smoothed.b2b1 <- smooth.CNA(CNA.object.b2b1) | |
CNA.smoothed.b3b2 <- smooth.CNA(CNA.object.b3b2) | |
CNA.smoothed.b4b3 <- smooth.CNA(CNA.object.b4b3) | |
########################################### | |
# Segment | |
########################################### | |
segs.b1b0 <- segment(CNA.smoothed.b1b0, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5) | |
segs.b2b0 <- segment(CNA.smoothed.b2b0, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5) | |
segs.b3b0 <- segment(CNA.smoothed.b3b0, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5) | |
segs.b4b0 <- segment(CNA.smoothed.b4b0, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5) | |
segs.b2b1 <- segment(CNA.smoothed.b2b1, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5) | |
segs.b3b2 <- segment(CNA.smoothed.b3b2, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5) | |
segs.b4b3 <- segment(CNA.smoothed.b4b3, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5) | |
########################################### | |
# Make individual plots for each chrom and sample | |
########################################### | |
chroms <- c('chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', | |
'chr19', 'chr2', 'chr20', 'chr21', 'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', | |
'chr9', 'chrX') | |
for (chrom in chroms) | |
{ | |
title <- paste("b1b0", ".", chrom, ".pdf", sep="") | |
pdf(file=title) | |
plotSample(segs.b1b0, chromlist = chrom, ylim=c(-3,3), main=title) | |
dev.off() | |
title <- paste("b2b1", ".", chrom, ".pdf", sep="") | |
pdf(file=title) | |
plotSample(segs.b2b1, chromlist = chrom, ylim=c(-3,3), main=title) | |
dev.off() | |
title <- paste("b3b2", ".", chrom, ".pdf", sep="") | |
pdf(file=title) | |
plotSample(segs.b3b2, chromlist = chrom, ylim=c(-3,3), main=title) | |
dev.off() | |
title <- paste("b4b3", ".", chrom, ".pdf", sep="") | |
pdf(file=title) | |
plotSample(segs.b4b3, chromlist = chrom, ylim=c(-3,3), main=title) | |
dev.off() | |
} | |
########################################### | |
# Write the CN segments to a tab-delimited file | |
########################################### | |
write.table(segs.b1b0$output, file="segs.b1b0.txt", row.names=T, col.names=F, quote=F, sep="\t") | |
write.table(segs.b2b0$output, file="segs.b2b0.txt", row.names=T, col.names=F, quote=F, sep="\t") | |
write.table(segs.b3b0$output, file="segs.b3b0.txt", row.names=T, col.names=F, quote=F, sep="\t") | |
write.table(segs.b4b0$output, file="segs.b4b0.txt", row.names=T, col.names=F, quote=F, sep="\t") | |
write.table(segs.b2b1$output, file="segs.b2b1.txt", row.names=T, col.names=F, quote=F, sep="\t") | |
write.table(segs.b3b2$output, file="segs.b3b2.txt", row.names=T, col.names=F, quote=F, sep="\t") | |
write.table(segs.b4b3$output, file="segs.b4b3.txt", row.names=T, col.names=F, quote=F, sep="\t") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment