Skip to content

Instantly share code, notes, and snippets.

View jodyphelan's full-sized avatar

Jody Phelan jodyphelan

  • London, UK
View GitHub Profile
#! /usr/bin/perl
use warnings;
use strict;
if (scalar @ARGV !=1){print "\nebiDownload <acc_list>\n\n"; exit;}
open F, $ARGV[0] or die;
while(<F>){
chomp;
echo $1 > $1.sample_name.txt \
&& ebiDownload.pl $1.sample_name.txt | parallel wget \
&& tb-profiler profile -1 ${1}_1.fastq.gz -2 ${1}_2.fastq.gz -t 5 -p $1 \
&& bcftools mpileup -f /home/ubuntu/refgenome/MTB-h37rv_asm19595v2-eg18.fa bam/$1.bam -a DP,AD -g 10 -B -Q 0 -q 0 -I | bcftools call -mg 10 -V indels -Oz -o vcf/$1.bcftools.g.vcf.gz \
&& gatk HaplotypeCaller -I bam/$1.bam -R /home/ubuntu/refgenome/MTB-h37rv_asm19595v2-eg18.fa -O vcf/$1.gatk.vcf.gz -ERC GVCF \
&& delly call -g /home/ubuntu/refgenome/MTB-h37rv_asm19595v2-eg18.fa -o vcf/$1.delly.bcf bam/$1.bam \
&& rm ${1}_1.fastq.gz ${1}_2.fastq.gz bam/${1}.*
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR209/009/SRR2099969/SRR2099969_1.fastq.gz
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR209/009/SRR2099969/SRR2099969_2.fastq.gz
wget ftp://ftp.ensemblgenomes.org/pub/release-32/bacteria//fasta/bacteria_0_collection/mycobacterium_tuberculosis_h37rv/dna/Mycobacterium_tuberculosis_h37rv.ASM19595v2.dna.toplevel.fa.gz
gunzip Mycobacterium_tuberculosis_h37rv.ASM19595v2.dna.toplevel.fa.gz
mv Mycobacterium_tuberculosis_h37rv.ASM19595v2.dna.toplevel.fa H37Rv.fa
# Get software
wget https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2 && tar -xvf samtools-1.9.tar.bz2 && cd samtools-1.9/ && make && cd ..
wget https://github.com/samtools/bcftools/releases/download/1.9/bcftools-1.9.tar.bz2 && tar -xvf bcftools-1.9.tar.bz2 && cd bcftools-1.9/ && make && cd ..
git clone https://github.com/lh3/bwa.git && cd bwa && make && cd ../
cd /tmp/
git clone https://github.com/jodyphelan/pathogen-profiler.git
cd pathogen-profiler
python setup.py install
cd ../
rm -fr pathogen-profiler
rm -r genomics_db
gatk GenomicsDBImport --genomicsdb-workspace-path genomics_db -L Chromosome --sample-name-map test.map --reader-threads 10 --max-num-intervals-to-import-in-parallel 10
gatk --java-options "-Xmx40g" GenotypeGVCFs -R ~/refgenome/MTB-h37rv_asm19595v2-eg18.fa -V gendb://genomics_db -O test.raw.vcf.gz
bcftools view -V indels test.raw.vcf.gz | bcftools filter -e 'GT="het"' -S . | awk 'length($4)==1 || $0~/^#/' | tr '|' '/' | tr '*' '.' | bcftools view -a | bcftools view -c 1 -Oz -o test.filt.vcf.gz
bcf2fasta.py test.filt.vcf.gz ~/refgenome/MTB-h37rv_asm19595v2-eg18.fa test.snps.fa
# VCF file = test.filt.vcf.gz
# replace where necessary
(printf "chr\npos\nref\n"; bcftools query -l test.filt.vcf.gz) | datamash transpose > test.filt.mat.bin
bcftools query -f '%CHROM\t%POS\t%REF[\t%GT]\n' test.filt.vcf.gz | tr '|' '/' | sed 's/\.\/\./N/g' | sed 's/0\/0/0/g' | sed 's/.\/./1/g' >> test.filt.mat.bin
rsync -am --include='*.gatk.vcf.gz' --include='*/' --exclude='*' /mnt/storage/jody/ena/vcf/ .
#! /usr/bin/env python
import sys
for l in sys.stdin:
accessions = l.rstrip().split("_")
for accession in accessions:
if len(accession)==9:
dir1 = accession[:6]
sys.stdout.write("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/%s_1.fastq.gz\n" % (dir1,accession,accession))
sys.stdout.write("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/%s_2.fastq.gz\n" % (dir1,accession,accession))
elif len(accession)==10:
cd ~/data/
rm -r gwas/*
wget http://pathogenseq.lshtm.ac.uk/downloads/new_gwas.tgz
tar -xvf new_gwas.tgz
rm new_gwas.tgz
#! /usr/bin/env python
import sys
import subprocess
import argparse
import random
import os
rand_generator = random.SystemRandom()
def get_random_file(prefix = None,extension=None):
randint = rand_generator.randint(1,999999)