Created
July 2, 2015 18:20
-
-
Save daler/efa5d69742466df931f4 to your computer and use it in GitHub Desktop.
prepare rRNA.lst files for CollectRnaSeqMetrics.jar, from any genome supported by UCSC
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# Ryan Dale, July 2015 | |
# [email protected] | |
# | |
# CollectRnaSeqMetrics.jar from Picard [1] needs an interval list corresponding | |
# to ribosomal RNA. The format is described at [2]. | |
# | |
# SAM header creation idea from [3]; idea for using rmsk tables to get rRNA is | |
# from [4]. | |
# | |
# [1] https://broadinstitute.github.io/picard/command-line-overview.html#CollectRnaSeqMetrics | |
# [2] http://samtools.github.io/htsjdk/javadoc/htsjdk/htsjdk/samtools/util/IntervalList.html | |
# [3] https://gist.github.com/slowkow/b11c28796508f03cdf4b | |
# [4] https://www.biostars.org/p/82399/#82423 | |
set -e | |
set -o pipefail | |
usage="Usage: $0 <ucsc assembly name> <output file>" | |
: ${1?"$usage"} | |
: ${2?"$usage"} | |
ASSEMBLY=$1 | |
rRNA=$2 | |
rRNAtmp=$(mktemp) | |
RMSK=$(mktemp) | |
chromsizes=$(mktemp) | |
# We'll be using this a few times to connect to UCSC's server | |
MYSQL="mysql -N --user=genome --host=genome-mysql.cse.ucsc.edu -A -e" | |
# Identify the rmsk tables for this assembly. dm3 has one per chrom, but many | |
# other assemblies just have a single table. | |
$MYSQL "use $ASSEMBLY; show tables like \"%rmsk\"" > $RMSK | |
# Save chromsizes in a tempfile | |
$MYSQL "select chrom, size from ${ASSEMBLY}.chromInfo ORDER BY size DESC" > $chromsizes | |
# Make a SAM-style header for the output | |
perl -lane 'print "\@SQ\tSN:$F[0]\tLN:$F[1]"' $chromsizes > $rRNA | |
# Get all relevant fields. For the case of multi-rmsk assemblies like dm3, we | |
# need to make sure we have everything in the file before sorting. | |
# | |
# Also, add an incrementing number to ensure unique interval names. | |
for table in $(cat $RMSK); do | |
query="select genoName, genoStart, genoEnd, strand, repName from ${ASSEMBLY}.$table where repClass = \"rRNA\"" | |
echo [`date`] $query | |
$MYSQL "$query" \ | |
| awk -F "\t" 'BEGIN{t=0} {print $0"."t; t+=1}' \ | |
>> $rRNAtmp | |
done | |
sort -k1V -k2n -k3n $rRNAtmp >> $rRNA | |
rm $RMSK $chromsizes $rRNAtmp |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment