Last active
September 10, 2020 12:40
-
-
Save explodecomputer/a215a400501dcae2c821 to your computer and use it in GitHub Desktop.
Extract SNPs from VCF files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
#PBS -l walltime=24:00:00,nodes=1:ppn=1 | |
#PBS -o output.file | |
#PBS -me | |
#PBS -S /bin/bash | |
set -e | |
module add apps/vcftools-0.1.12b | |
export PERL5LIB=/cm/shared/apps/VCFTOOLS-0.1.12b/vcftools_0.1.12b/perl/ | |
module add apps/tabix-0.2.6 | |
wd="/projects/UK10K/GOYA/imputed" | |
snplist="/panfs/panasas01/sscm/eprcr/GOYA/speliotes_snps.txt" | |
tempdir="${HOME}/tempdir" | |
out="${HOME}/output.vcf.gz" | |
# Run the code | |
cd ${wd} | |
mkdir -p ${tempdir} | |
find . -maxdepth 1 -name '*.vcf.gz' | while read f; do | |
# Output filenames - strip path and extensions | |
fn=$(basename "${f}") | |
fn="${fn%.*.*}" | |
# Create copy of file with tab delimiter instead of space delimiter | |
# Note - you may want to just do this to all the files because vcftools won't recognise them as they currently are | |
zcat ${f} | tr ' ' '\t' | sed 's/exm-//g' | gzip -c > ${tempdir}/temp.vcf.gz | |
# Extract SNPs | |
vcftools --gzvcf ${tempdir}/temp.vcf.gz --snps ${snplist} --recode --keep-INFO-all --out ${tempdir}/${fn} | |
# Prepare for merging | |
bgzip ${tempdir}/${fn}.recode.vcf | |
tabix ${tempdir}/${fn}.recode.vcf.gz | |
done | |
rm ${tempdir}/temp.vcf.gz | |
# Merge all output files | |
# note: maybe better to just select files that have an rs in them: | |
# zgrep rs tempdir/*.vcf.gz | |
vcf-merge ${tempdir}/*vcf.gz | bgzip -c > ${out} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment