This is bash script is based on the Impute-First github and the preprint.
Inputs:
- Reference genome
- HGSVC2 Reference panel
- PLINK genetic mac
- Novaseq HG002 sequencing reads
Tools and packages:
- Bowtie (v2.5.4)
- BCFtools (v1.21)
- seqtk (1.4-r122)
- samtools (1.21)
- Python script fix_VCF.py (here)
- Beagle (v5.1 18May20.d20, Java)
- VG (autoindex and giraffe v1.55.0 "Bernolda")
- Deepvariant (v1.6.1)
mkdir panel; cd panel
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v2.0/integrated_callset/variants_freeze4_snv_snv_alt.vcf.gz -O ref_panel/variants_freeze4_snv_snv_alt.vcf.gz
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v2.0/integrated_callset/variants_freeze4_indel_insdel_alt.vcf.gz -O ref_panel/variants_freeze4_indel_insdel_alt.vcf.gz
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v2.0/integrated_callset/variants_freeze4_sv_insdel_alt.vcf.gz -O ref_panel/variants_freeze4_sv_insdel_alt.vcf.gz
for i in "variants_freeze4_indel_insdel_alt.vcf.gz" "variants_freeze4_snv_snv_alt.vcf.gz" "variants_freeze4_sv_insdel_alt.vcf.gz"; do
bcftools index $i
done
threads=48
bcftools concat -Oz -a variants_freeze4_indel_insdel_alt.vcf.gz variants_freeze4_snv_snv_alt.vcf.gz variants_freeze4_sv_insdel_alt.vcf.gz -o hsvc_raw.vcf.gz
bcftools norm -m+any -Oz hsvc_raw.vcf.gz -o hsvc.all.vcf.gz --threads ${threads}
bcftools query -l hsvc_raw.vcf.gz > hsvc.all.samples
grep -v NA24385 hsvc.all.samples > hsvc.all.less1.samples
bcftools view -Oz -S hsvc.all.less1.samples hsvc_raw.vcf.gz > hsvc.less1.vcf.gz
ref_panel_vcf_file="hsvc.less1.vcf.gz"
python fix_VCF.py ${ref_panel_vcf_file} ${ref_panel_vcf_file}_fixed.vcf.gz
for chromosome_number in {1..22}; do bcftools view -O z -o ${panel}/perchr/${chromosome_number}.vcf.gz ${input_vcf_file_all} chr"${chromosome_number}" ; done
wget https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh38.map.zip
for chromosome_number in {1..22}; do
cat raw/plink.chr${chromosome_number}.GRCh38.map |sed -e 's/^/chr/' > plink.chr${chromosome_number}.GRCh38.map
done
cp GCA_000001405.15_GRCh38_no_alt_analysis_set.fna ref.fa
ref="ref.fa"
bowtie2-build --threads ${threads} ${ref} indexes/${ref}
wget https://storage.googleapis.com/brain-genomics-public/research/sequencing/fastq/novaseq/wgs_pcr_free/30x/HG002.novaseq.pcr-free.30x.R1.fastq.gz
wget https://storage.googleapis.com/brain-genomics-public/research/sequencing/fastq/novaseq/wgs_pcr_free/30x/HG002.novaseq.pcr-free.30x.R2.fastq.gz
read1=${folder}/data/HG002.novaseq.pcr-free.30x.R1.fastq.gz
read2=${folder}/data/HG002.novaseq.pcr-free.30x.R2.fastq.gz
frac="0.166" # $(echo "$cov / $read_cov" | bc -l) # from 30x to 5x
(seqtk sample -s1234 $read1 $frac | awk '{if(NR%4==1) print $0 "/1"; else print $0}') > tmp_R1.fq
(seqtk sample -s1234 $read2 $frac | awk '{if(NR%4==1) print $0 "/2"; else print $0}') > tmp_R2.fq
cat tmp_R1.fq tmp_R2.fq > "r_real.c.fq"
bowtie2 --threads ${threads} -x ${folder}/v2/step1/indexes/${ref} -U "r_real.c.fq" -S out.sam
samtools view -@ ${threads} -hb out.sam > out.bam
bcftools mpileup --threads ${threads} -Ou -f ref.fa out_sorted.bam | bcftools call --threads ${threads} -Ou -mv > output_raw.vcf
cat output_raw.vcf | bcftools filter --threads ${threads} -s LowQual -e "QUAL<20 || DP>100" -Oz > output.vcf.gz
bcftools index output.vcf.gz
bcftools view -Ov output.vcf.gz > output.vcf
for chromosome_number in {1..22}; do bcftools view -Oz -o perchr/${chromosome_number}.vcf.gz ${rough_vcf} chr"${chromosome_number}" ; done
wget https://faculty.washington.edu/browning/beagle/beagle.18May20.d20.jar
mkdir perchr_out
for chromosome_number in {1..22}; do
input_vcf_file=${rough_folder}/perchr/${chromosome_number}.vcf.gz
ref_panel_vcf_file=${panel}/perchr/${chromosome_number}.vcf.gz
linkage_map_file=${data}$"plink/plink.chr${chromosome_number}.GRCh38.map"
java -jar ${beagle} chrom=chr${chromosome_number} gt=${input_vcf_file} ref=${ref_panel_vcf_file} out=perchr_out/${chromosome_number}_imputed map=${linkage_map_file} nthreads=${threads}
bcftools index perchr/${i}_imputed.vcf.gz
done
for i in {1..22}; do echo "perchr_out/${i}_imputed.vcf.gz">>files.txt; done
bcftools concat --file-list files.txt -Oz -o imputed.vcf.gz
bcftools index imputed.vcf.gz
personalized_vcf_file="imputed.vcf.gz"
vg autoindex --workflow giraffe -r "$reference_genome_fasta" -v "$personalized_vcf_file" -p "prefix" --threads "$threads"
vg giraffe -Z prefix.giraffe.gbz -m prefix.min -d prefix.dist -f ${read1} -f ${read2}\
-p -o BAM -t ${threads} > out.bam
samtools sort -@ ${threads} out.bam > out_sorted.bam
samtools index -@ ${threads} out_sorted.bam
samtools faidx ref.fa
BIN_VERSION="1.6.1"
singularity pull docker://google/deepvariant:"${BIN_VERSION}"
reads=out_sorted.bam
ref="ref.fa"
singularity run -B /usr/lib/locale/:${current} docker://google/deepvariant:"${BIN_VERSION}" /opt/deepvariant/bin/run_deepvariant --model_type=WGS --ref="${INPUT_DIR}"/${ref} --reads="${reads}" --output_vcf="${OUTPUT_DIR}"/output.vcf.gz --output_gvcf="${OUTPUT_DIR}"/output.g.vcf.gz --num_shards=${threads}
Special thank to Kavya for the help.