arq5x · March 30, 2017 09:09
diff --git a/cadd_compress.sh b/cadd_compress.sh
 # Download the raw CADD TSV and Tabix index (no annotations, just scores)
 wget http://krishna.gs.washington.edu/download/CADD/v1.0/whole_genome_SNVs.tsv.gz
 wget http://krishna.gs.washington.edu/download/CADD/v1.0/whole_genome_SNVs.tsv.gz.tbi

 # it is big. 79Gb
 ls -ltrh whole_genome_SNVs.tsv.gz
 -rw-r--r-- 1 arq5x users 79G Sep 26 01:44 whole_genome_SNVs.tsv.gz

 # for testing, let's play with the chr22 intervals
 tabix whole_genome_SNVs.tsv.gz 22 | bgzip > whole_genome_SNVs.tsv.22.gz

 # 987 Mb for chr22
 ls -ltrh whole_genome_SNVs.tsv.22.gz
 -rw-r--r-- 1 arq5x users 987M Mar  5 16:14 whole_genome_SNVs.tsv.22.gz

 # peek at the file
 zcat whole_genome_SNVs.tsv.22.gz | head
 22	16050001	G	A	-1.123523	0.123
 22	16050001	G	C	-1.244133	0.053
 22	16050001	G	T	-1.183845	0.081
 22	16050002	A	C	-0.974966	0.299
 22	16050002	A	G	-0.989565	0.276
 22	16050002	A	T	-0.874362	0.493
 22	16050003	T	A	-0.698084	0.978
 22	16050003	T	C	-0.820725	0.622
 22	16050003	T	G	-0.815978	0.634
 22	16050004	C	A	-1.019644	0.233


 # By glancing at the data, we see that each genomic position is repeated three time, 
 # once for each possible alternate allele. Also, the reported precision for both the
 # raw CADD SVM score (column 5) and the Phred-scaled score (column 6) is rather high.
 # Lastly, if we simply store the reference allele, we can infer which allele is which
 # if we enforce alphabetical sorting. Let's combine all three tricks and reduce the
 # precision to two decimal places. the bedtools groupby function collapses the data.
 zcat whole_genome_SNVs.tsv.22.gz \
   | awk '{printf("%d\t%d\t%s\t%s\t%0.2f\t%0.2f\n",$1,$2,$3,$4,$5,$6)}' \
   | bedtools groupby -g 1,2,3 -c 5,6 -o collapse,collapse \
   | bgzip \
   > whole_genome_SNVs.tsv.22.compressed.gz

 # let's peek at the result.
 # the format is now to use one line per genome position.
 # for each position, we store the reference allele and
 # report the raw and scaled scores as a list for each of
 # the alternate alleles. 
 # for example, at position 16050001, the reference
 # is a G. The three raw scores are -1.12, -1.24, 
 # and -1.18. We store them in alphabetical order, so
 # we know that -1.12 is for C, -1.24 for G, and -1.18
 # for T. Notice we have also truncated the precision of 
 # the scores as compared to above.
 zcat whole_genome_SNVs.tsv.22.compressed.gz | head
 22	16050001	G	-1.12,-1.24,-1.18	0.12,0.05,0.08
 22	16050002	A	-0.97,-0.99,-0.87	0.30,0.28,0.49
 22	16050003	T	-0.70,-0.82,-0.82	0.98,0.62,0.63
 22	16050004	C	-1.02,-1.10,-0.96	0.23,0.15,0.33
 22	16050005	T	-0.88,-0.97,-0.97	0.47,0.30,0.31
 22	16050006	G	-0.95,-1.09,-1.01	0.34,0.15,0.24
 22	16050007	A	-0.99,-1.00,-0.90	0.27,0.26,0.44
 22	16050008	T	-0.89,-0.99,-0.98	0.47,0.27,0.28
 22	16050009	A	-0.98,-0.98,-0.88	0.29,0.29,0.49
 22	16050010	A	-0.96,-0.98,-0.88	0.32,0.29,0.48

 # how big is it?
 ls -ltrh whole_genome_SNVs.tsv.22.compressed.gz
 -rw-r--r-- 1 arq5x users 430M Mar  5 17:04 whole_genome_SNVs.tsv.22.compressed.gz

 # so, 430 Mb versus 987Mb. Call it 50% reduction.

 # To do better, we need to further reduce the precision of the scores.
 # One might argue that this is too lossy, but I would argue that for
 # Phred scaled scores, differentiating between a score of 22.1 and 22.2 is
 # more than suffcient precision.
 zcat whole_genome_SNVs.tsv.22.gz \
   | awk '{printf("%d\t%d\t%s\t%s\t%0.1f\t%0.1f\n",$1,$2,$3,$4,$5,$6)}' \
   | bedtools groupby -g 1,2,3 -c 5,6 -o collapse,collapse \
   | bgzip \
   > whole_genome_SNVs.tsv.22.compressed.2.gz

 # let's peek at the result.
 zcat whole_genome_SNVs.tsv.22.compressed.2.gz | head
 22	16050001	G	-1.1,-1.2,-1.2	0.1,0.1,0.1
 22	16050002	A	-1.0,-1.0,-0.9	0.3,0.3,0.5
 22	16050003	T	-0.7,-0.8,-0.8	1.0,0.6,0.6
 22	16050004	C	-1.0,-1.1,-1.0	0.2,0.1,0.3
 22	16050005	T	-0.9,-1.0,-1.0	0.5,0.3,0.3
 22	16050006	G	-1.0,-1.1,-1.0	0.3,0.2,0.2
 22	16050007	A	-1.0,-1.0,-0.9	0.3,0.3,0.4
 22	16050008	T	-0.9,-1.0,-1.0	0.5,0.3,0.3
 22	16050009	A	-1.0,-1.0,-0.9	0.3,0.3,0.5
 22	16050010	A	-1.0,-1.0,-0.9	0.3,0.3,0.5

 # how big is it?
 ls -ltrh whole_genome_SNVs.tsv.22.quinlan.gz
 -rw-r--r-- 1 arq5x users 240M Mar  5 16:30 whole_genome_SNVs.tsv.22.compressed.2.gz

 # so, 240 Mb versus 987Mb. Call it 75% reduction. Not bad.
 # Extrapolating to the full file, we could expect to to drop from
 # 79 Gb to 19Gb. Still a large file, but substantially smaller. And it is still Tabix-able.



 # To do better, we need to further reduce the precision of the scores.
 # One might argue that this is too lossy, but I would argue that for
 # Phred scaled scores, differentiating between a score of 22.1 and 22.2 is
 # more than suffcient precision.
 zcat whole_genome_SNVs.tsv.22.gz \
   | awk '{printf("%d\t%d\t%s\t%s\t%0.1f\t%0.1f\n",$1,$2,$3,$4,$5,$6)}' \
   | bedtools groupby -g 1,2,3 -c 6 -o collapse,collapse \
   | bgzip \
   > whole_genome_SNVs.tsv.22.compressed.3.gz
   



 # I also played with creating 6 different BigWig files but that appears to only yield a 40-50% reduction in size
	# Download the raw CADD TSV and Tabix index (no annotations, just scores)
	wget http://krishna.gs.washington.edu/download/CADD/v1.0/whole_genome_SNVs.tsv.gz
	wget http://krishna.gs.washington.edu/download/CADD/v1.0/whole_genome_SNVs.tsv.gz.tbi

	# it is big. 79Gb
	ls -ltrh whole_genome_SNVs.tsv.gz
	-rw-r--r-- 1 arq5x users 79G Sep 26 01:44 whole_genome_SNVs.tsv.gz

	# for testing, let's play with the chr22 intervals
	tabix whole_genome_SNVs.tsv.gz 22 \| bgzip > whole_genome_SNVs.tsv.22.gz

	# 987 Mb for chr22
	ls -ltrh whole_genome_SNVs.tsv.22.gz
	-rw-r--r-- 1 arq5x users 987M Mar 5 16:14 whole_genome_SNVs.tsv.22.gz

	# peek at the file
	zcat whole_genome_SNVs.tsv.22.gz \| head
	22 16050001 G A -1.123523 0.123
	22 16050001 G C -1.244133 0.053
	22 16050001 G T -1.183845 0.081
	22 16050002 A C -0.974966 0.299
	22 16050002 A G -0.989565 0.276
	22 16050002 A T -0.874362 0.493
	22 16050003 T A -0.698084 0.978
	22 16050003 T C -0.820725 0.622
	22 16050003 T G -0.815978 0.634
	22 16050004 C A -1.019644 0.233


	# By glancing at the data, we see that each genomic position is repeated three time,
	# once for each possible alternate allele. Also, the reported precision for both the
	# raw CADD SVM score (column 5) and the Phred-scaled score (column 6) is rather high.
	# Lastly, if we simply store the reference allele, we can infer which allele is which
	# if we enforce alphabetical sorting. Let's combine all three tricks and reduce the
	# precision to two decimal places. the bedtools groupby function collapses the data.
	zcat whole_genome_SNVs.tsv.22.gz \
	\| awk '{printf("%d\t%d\t%s\t%s\t%0.2f\t%0.2f\n",$1,$2,$3,$4,$5,$6)}' \
	\| bedtools groupby -g 1,2,3 -c 5,6 -o collapse,collapse \
	\| bgzip \
	> whole_genome_SNVs.tsv.22.compressed.gz

	# let's peek at the result.
	# the format is now to use one line per genome position.
	# for each position, we store the reference allele and
	# report the raw and scaled scores as a list for each of
	# the alternate alleles.
	# for example, at position 16050001, the reference
	# is a G. The three raw scores are -1.12, -1.24,
	# and -1.18. We store them in alphabetical order, so
	# we know that -1.12 is for C, -1.24 for G, and -1.18
	# for T. Notice we have also truncated the precision of
	# the scores as compared to above.
	zcat whole_genome_SNVs.tsv.22.compressed.gz \| head
	22 16050001 G -1.12,-1.24,-1.18 0.12,0.05,0.08
	22 16050002 A -0.97,-0.99,-0.87 0.30,0.28,0.49
	22 16050003 T -0.70,-0.82,-0.82 0.98,0.62,0.63
	22 16050004 C -1.02,-1.10,-0.96 0.23,0.15,0.33
	22 16050005 T -0.88,-0.97,-0.97 0.47,0.30,0.31
	22 16050006 G -0.95,-1.09,-1.01 0.34,0.15,0.24
	22 16050007 A -0.99,-1.00,-0.90 0.27,0.26,0.44
	22 16050008 T -0.89,-0.99,-0.98 0.47,0.27,0.28
	22 16050009 A -0.98,-0.98,-0.88 0.29,0.29,0.49
	22 16050010 A -0.96,-0.98,-0.88 0.32,0.29,0.48

	# how big is it?
	ls -ltrh whole_genome_SNVs.tsv.22.compressed.gz
	-rw-r--r-- 1 arq5x users 430M Mar 5 17:04 whole_genome_SNVs.tsv.22.compressed.gz

	# so, 430 Mb versus 987Mb. Call it 50% reduction.

	# To do better, we need to further reduce the precision of the scores.
	# One might argue that this is too lossy, but I would argue that for
	# Phred scaled scores, differentiating between a score of 22.1 and 22.2 is
	# more than suffcient precision.
	zcat whole_genome_SNVs.tsv.22.gz \
	\| awk '{printf("%d\t%d\t%s\t%s\t%0.1f\t%0.1f\n",$1,$2,$3,$4,$5,$6)}' \
	\| bedtools groupby -g 1,2,3 -c 5,6 -o collapse,collapse \
	\| bgzip \
	> whole_genome_SNVs.tsv.22.compressed.2.gz

	# let's peek at the result.
	zcat whole_genome_SNVs.tsv.22.compressed.2.gz \| head
	22 16050001 G -1.1,-1.2,-1.2 0.1,0.1,0.1
	22 16050002 A -1.0,-1.0,-0.9 0.3,0.3,0.5
	22 16050003 T -0.7,-0.8,-0.8 1.0,0.6,0.6
	22 16050004 C -1.0,-1.1,-1.0 0.2,0.1,0.3
	22 16050005 T -0.9,-1.0,-1.0 0.5,0.3,0.3
	22 16050006 G -1.0,-1.1,-1.0 0.3,0.2,0.2
	22 16050007 A -1.0,-1.0,-0.9 0.3,0.3,0.4
	22 16050008 T -0.9,-1.0,-1.0 0.5,0.3,0.3
	22 16050009 A -1.0,-1.0,-0.9 0.3,0.3,0.5
	22 16050010 A -1.0,-1.0,-0.9 0.3,0.3,0.5

	# how big is it?
	ls -ltrh whole_genome_SNVs.tsv.22.quinlan.gz
	-rw-r--r-- 1 arq5x users 240M Mar 5 16:30 whole_genome_SNVs.tsv.22.compressed.2.gz

	# so, 240 Mb versus 987Mb. Call it 75% reduction. Not bad.
	# Extrapolating to the full file, we could expect to to drop from
	# 79 Gb to 19Gb. Still a large file, but substantially smaller. And it is still Tabix-able.



	# To do better, we need to further reduce the precision of the scores.
	# One might argue that this is too lossy, but I would argue that for
	# Phred scaled scores, differentiating between a score of 22.1 and 22.2 is
	# more than suffcient precision.
	zcat whole_genome_SNVs.tsv.22.gz \
	\| awk '{printf("%d\t%d\t%s\t%s\t%0.1f\t%0.1f\n",$1,$2,$3,$4,$5,$6)}' \
	\| bedtools groupby -g 1,2,3 -c 6 -o collapse,collapse \
	\| bgzip \
	> whole_genome_SNVs.tsv.22.compressed.3.gz




	# I also played with creating 6 different BigWig files but that appears to only yield a 40-50% reduction in size