Last active
January 25, 2016 11:28
-
-
Save CnrLwlss/51449ca55a0b4625546d to your computer and use it in GitHub Desktop.
Shell script for aligning reads from Smith & Whitehouse 2012.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data from Smith & Whitehouse (2012) http://dx.doi.org/10.1038/nature10895 | |
#SRR364781 -> wt_sample | |
#SRR364782 -> wt_replicate | |
#SRR364783 -> pol32_sample | |
#SRR364784 -> pol32_replicate | |
roots=( | |
SRR364781 | |
SRR364782 | |
SRR364783 | |
SRR364784 | |
) | |
let ncpu=$(nproc)-1 | |
# -p argument in mkdir skips command if directory already present | |
while getopts "pibdfiacq:" opt; do | |
case $opt in | |
p) | |
echo "Installing bioinformatics packages, assuming apt-get available (Ubuntu)..." | |
sudo apt-get install sra-toolkit samtools bedtools bowtie2 | |
;; | |
b) | |
echo "Downloading & uncompressing processed (aligned) read files..." | |
mkdir -p beds | |
(wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835650/suppl/GSM835650_wt_sample.bed.gz -P beds/ | |
gunzip -c beds/GSM835650_wt_sample.bed.gz > beds/GSM835650_wt_sample.bed)& | |
(wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835651/suppl/GSM835651_wt_replicate.bed.gz -P beds/ | |
gunzip -c beds/GSM835651_wt_replicate.bed.gz > beds/GSM835651_wt_replicate.bed)& | |
(wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835652/suppl/GSM835652_pol32_sample.bed.gz -P beds/ | |
gunzip -c beds/GSM835652_pol32_sample.bed.gz > beds/GSM835652_pol32_sample.bed)& | |
(wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835653/suppl/GSM835653_pol32_replicate.bed.gz -P beds/ | |
gunzip -c beds/GSM835653_pol32_replicate.bed.gz > beds/GSM835653_pol32_replicate.bed)& | |
;; | |
d) | |
echo "Downloading raw read files..." | |
for x in ${roots[@]}; do | |
wget ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/SRP/SRP009/SRP009379/$x/$x.sra& | |
done | |
;; | |
f) | |
# NOTE: If you omit the --origfmt flag here, paired reads will be labelled readname.1 and readname.2 | |
# whereas the convention that most tools (e.g. bedtools) expects is two rows labelled readname | |
echo "Converting .sra files to .fastq..." | |
for x in ${roots[@]}; do | |
fastq-dump --origfmt -I --split-files $x.sra& | |
done | |
;; | |
i) | |
echo "Download reference sequence and build indices..." | |
mkdir -p reference_SGD | |
mkdir -p indices | |
wget http://downloads.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-2-1_20150113.tgz -P reference_SGD/ | |
tar -xzf reference_SGD/S288C_reference_genome_R64-2-1_20150113.tgz --directory reference_SGD/ | |
bowtie2-build reference_SGD/S288C_reference_genome_R64-2-1_20150113/S288C_reference_sequence_R64-2-1_20150113.fsa indices/SGD_S288C | |
;; | |
a) | |
echo "Aligning reads..." | |
mkdir -p alignments | |
for x in ${roots[@]}; do | |
echo "Align reads from ${x} using $ncpu cores" | |
bowtie2-align -x indices/SGD_S288C -1 "$x"_1.fastq -2 "$x"_2.fastq -S alignments/"$x".sam -p $ncpu --end-to-end --no-mixed --no-discordant --minins 200 --maxins 1000 --fr -D 30 -R 4 | |
done | |
;; | |
c) | |
# NOTE: can view contents of binary .bam files by | |
# samtools view SRR364783_fixed.bam | less -S | |
echo "Converting alignment files..." | |
for x in ${roots[@]}; do | |
( | |
echo Convert .sam alignments to .bam, sort and index ${x} | |
samtools view -bS alignments/"$x".sam >alignments/"$x".bam | |
samtools sort -n alignments/"$x".bam alignments/"$x"_sorted | |
samtools fixmate alignments/"$x"_sorted.bam alignments/"$x"_fixed.bam | |
bedtools bamtobed -bedpe -i alignments/"$x"_fixed.bam > alignments/"$x".bedpe | |
)& | |
done | |
;; | |
q) | |
echo Filtering reads by MAPQ quality score $OPTARG | |
for x in ${roots[@]}; do | |
( | |
echo ${x} | |
samtools view -q "$OPTARG" -b alignments/"$x"_fixed.bam > alignments/"$x"_"$OPTARG"_fixed.bam | |
) | |
done | |
;; | |
\?) | |
echo "You must specify a valid argument (or arguments) to use this script. Valid arguments are: | |
-p Install bioinformatics packages required for analysis | |
-b Download & uncompress .bed alignment files from NCBI | |
-d Download raw read .sra files from NCBI | |
-f Convert downloaded .sra files to .fastq format | |
-i Make indices for reference sequence | |
-a Align reads from .fastq files to reference sequence | |
-c Convert .sam alignments to other file formats (.bam and .bedpe). | |
-q Filter reads by MAPQ score q. Must specify value of q, separated by space." | |
esac | |
done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Need to run this manually in sections, waiting for each section to complete (check that files have finished writing to HD, check that processes have finished running by executing "top").
Some documentation by executing:
The slowest steps are the alignment and the file format conversions. These can be run in serial (e.g. overnight) by: