CnrLwlss · January 25, 2016 11:28 · CnrLwlss · Nov 9, 2015
diff --git a/OkazakiAnalysis.sh b/OkazakiAnalysis.sh
 # Data from Smith & Whitehouse (2012) http://dx.doi.org/10.1038/nature10895
 #SRR364781 -> wt_sample
 #SRR364782 -> wt_replicate
 #SRR364783 -> pol32_sample
 #SRR364784 -> pol32_replicate

 roots=(
 SRR364781
 SRR364782
 SRR364783
 SRR364784
 )

 let ncpu=$(nproc)-1

 # -p argument in mkdir skips command if directory already present

 while getopts "pibdfiacq:" opt; do
    case $opt in
 	p)
 	  echo "Installing bioinformatics packages, assuming apt-get available (Ubuntu)..."
 	  sudo apt-get install sra-toolkit samtools bedtools bowtie2
 	  ;;
 	b)
 	  echo "Downloading & uncompressing processed (aligned) read files..."
 	  mkdir -p beds
 	 (wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835650/suppl/GSM835650_wt_sample.bed.gz -P beds/
 	  gunzip -c beds/GSM835650_wt_sample.bed.gz > beds/GSM835650_wt_sample.bed)&
 	 (wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835651/suppl/GSM835651_wt_replicate.bed.gz -P beds/
 	  gunzip -c beds/GSM835651_wt_replicate.bed.gz > beds/GSM835651_wt_replicate.bed)&
 	 (wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835652/suppl/GSM835652_pol32_sample.bed.gz -P beds/
 	  gunzip -c beds/GSM835652_pol32_sample.bed.gz > beds/GSM835652_pol32_sample.bed)&
 	 (wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835653/suppl/GSM835653_pol32_replicate.bed.gz -P beds/
 	  gunzip -c beds/GSM835653_pol32_replicate.bed.gz > beds/GSM835653_pol32_replicate.bed)&
 	  ;;
 	d)
 	  echo "Downloading raw read files..."
 	  for x in ${roots[@]}; do
 	     wget ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/SRP/SRP009/SRP009379/$x/$x.sra&
 	  done
 	  ;;
 	f)	
 	  # NOTE: If you omit the --origfmt flag here, paired reads will be labelled readname.1 and readname.2
 	  # whereas the convention that most tools (e.g. bedtools) expects is two rows labelled readname
 	  echo "Converting .sra files to .fastq..."
 	  for x in ${roots[@]}; do
 	     fastq-dump --origfmt -I --split-files $x.sra&
 	  done
 	  ;;
 	i)
 	  echo "Download reference sequence and build indices..."
 	  mkdir -p reference_SGD
 	  mkdir -p indices
 	  wget http://downloads.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-2-1_20150113.tgz -P reference_SGD/
          tar -xzf reference_SGD/S288C_reference_genome_R64-2-1_20150113.tgz --directory reference_SGD/
 	  bowtie2-build reference_SGD/S288C_reference_genome_R64-2-1_20150113/S288C_reference_sequence_R64-2-1_20150113.fsa indices/SGD_S288C
 	  ;;
 	a)
 	  echo "Aligning reads..."
 	  mkdir -p alignments
 	  for x in ${roots[@]}; do
 	         echo "Align reads from ${x} using $ncpu cores"
 	         bowtie2-align -x indices/SGD_S288C -1 "$x"_1.fastq -2 "$x"_2.fastq -S alignments/"$x".sam -p $ncpu --end-to-end --no-mixed --no-discordant --minins 200 --maxins 1000 --fr -D 30 -R 4
 	  done
 	  ;;
 	c)
 	  # NOTE: can view contents of binary .bam files by 
 	  # samtools view SRR364783_fixed.bam | less -S 
 	  echo "Converting alignment files..."
 	  for x in ${roots[@]}; do
            (
            echo Convert .sam alignments to .bam, sort and index ${x}
            samtools view -bS alignments/"$x".sam >alignments/"$x".bam
            samtools sort -n alignments/"$x".bam alignments/"$x"_sorted
            samtools fixmate alignments/"$x"_sorted.bam alignments/"$x"_fixed.bam
            bedtools bamtobed -bedpe -i alignments/"$x"_fixed.bam > alignments/"$x".bedpe
            )&
 	  done
 	  ;;
 	q)
 	  echo Filtering reads by MAPQ quality score $OPTARG
 	  for x in ${roots[@]}; do
            (
 	    echo ${x}
 	    samtools view -q "$OPTARG" -b alignments/"$x"_fixed.bam > alignments/"$x"_"$OPTARG"_fixed.bam 
 	    )
 	  done
 	  ;;
 	\?)
 	  echo "You must specify a valid argument (or arguments) to use this script.  Valid arguments are:
 -p Install bioinformatics packages required for analysis
 -b Download & uncompress .bed alignment files from NCBI
 -d Download raw read .sra files from NCBI
 -f Convert downloaded .sra files to .fastq format
 -i Make indices for reference sequence
 -a Align reads from .fastq files to reference sequence
 -c Convert .sam alignments to other file formats (.bam and .bedpe).
 -q Filter reads by MAPQ score q.  Must specify value of q, separated by space."
 	esac
 done
	# Data from Smith & Whitehouse (2012) http://dx.doi.org/10.1038/nature10895
	#SRR364781 -> wt_sample
	#SRR364782 -> wt_replicate
	#SRR364783 -> pol32_sample
	#SRR364784 -> pol32_replicate

	roots=(
	SRR364781
	SRR364782
	SRR364783
	SRR364784
	)

	let ncpu=$(nproc)-1

	# -p argument in mkdir skips command if directory already present

	while getopts "pibdfiacq:" opt; do
	case $opt in
	p)
	echo "Installing bioinformatics packages, assuming apt-get available (Ubuntu)..."
	sudo apt-get install sra-toolkit samtools bedtools bowtie2
	;;
	b)
	echo "Downloading & uncompressing processed (aligned) read files..."
	mkdir -p beds
	(wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835650/suppl/GSM835650_wt_sample.bed.gz -P beds/
	gunzip -c beds/GSM835650_wt_sample.bed.gz > beds/GSM835650_wt_sample.bed)&
	(wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835651/suppl/GSM835651_wt_replicate.bed.gz -P beds/
	gunzip -c beds/GSM835651_wt_replicate.bed.gz > beds/GSM835651_wt_replicate.bed)&
	(wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835652/suppl/GSM835652_pol32_sample.bed.gz -P beds/
	gunzip -c beds/GSM835652_pol32_sample.bed.gz > beds/GSM835652_pol32_sample.bed)&
	(wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM835nnn/GSM835653/suppl/GSM835653_pol32_replicate.bed.gz -P beds/
	gunzip -c beds/GSM835653_pol32_replicate.bed.gz > beds/GSM835653_pol32_replicate.bed)&
	;;
	d)
	echo "Downloading raw read files..."
	for x in ${roots[@]}; do
	wget ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/SRP/SRP009/SRP009379/$x/$x.sra&
	done
	;;
	f)
	# NOTE: If you omit the --origfmt flag here, paired reads will be labelled readname.1 and readname.2
	# whereas the convention that most tools (e.g. bedtools) expects is two rows labelled readname
	echo "Converting .sra files to .fastq..."
	for x in ${roots[@]}; do
	fastq-dump --origfmt -I --split-files $x.sra&
	done
	;;
	i)
	echo "Download reference sequence and build indices..."
	mkdir -p reference_SGD
	mkdir -p indices
	wget http://downloads.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-2-1_20150113.tgz -P reference_SGD/
	tar -xzf reference_SGD/S288C_reference_genome_R64-2-1_20150113.tgz --directory reference_SGD/
	bowtie2-build reference_SGD/S288C_reference_genome_R64-2-1_20150113/S288C_reference_sequence_R64-2-1_20150113.fsa indices/SGD_S288C
	;;
	a)
	echo "Aligning reads..."
	mkdir -p alignments
	for x in ${roots[@]}; do
	echo "Align reads from ${x} using $ncpu cores"
	bowtie2-align -x indices/SGD_S288C -1 "$x"_1.fastq -2 "$x"_2.fastq -S alignments/"$x".sam -p $ncpu --end-to-end --no-mixed --no-discordant --minins 200 --maxins 1000 --fr -D 30 -R 4
	done
	;;
	c)
	# NOTE: can view contents of binary .bam files by
	# samtools view SRR364783_fixed.bam \| less -S
	echo "Converting alignment files..."
	for x in ${roots[@]}; do
	(
	echo Convert .sam alignments to .bam, sort and index ${x}
	samtools view -bS alignments/"$x".sam >alignments/"$x".bam
	samtools sort -n alignments/"$x".bam alignments/"$x"_sorted
	samtools fixmate alignments/"$x"_sorted.bam alignments/"$x"_fixed.bam
	bedtools bamtobed -bedpe -i alignments/"$x"_fixed.bam > alignments/"$x".bedpe
	)&
	done
	;;
	q)
	echo Filtering reads by MAPQ quality score $OPTARG
	for x in ${roots[@]}; do
	(
	echo ${x}
	samtools view -q "$OPTARG" -b alignments/"$x"_fixed.bam > alignments/"$x"_"$OPTARG"_fixed.bam
	)
	done
	;;
	\?)
	echo "You must specify a valid argument (or arguments) to use this script. Valid arguments are:
	-p Install bioinformatics packages required for analysis
	-b Download & uncompress .bed alignment files from NCBI
	-d Download raw read .sra files from NCBI
	-f Convert downloaded .sra files to .fastq format
	-i Make indices for reference sequence
	-a Align reads from .fastq files to reference sequence
	-c Convert .sam alignments to other file formats (.bam and .bedpe).
	-q Filter reads by MAPQ score q. Must specify value of q, separated by space."
	esac
	done