Last active
October 19, 2018 07:32
-
-
Save dinovski/69a208fbf0dd1e4c0080fc89967eacd8 to your computer and use it in GitHub Desktop.
imputation, phasing, and file wrangling for local ancestry inference with RFMix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
## 1. Imputation of WGS samples with Impute2 (http://mathgen.stats.ox.ac.uk/impute/impute_v2.html) | |
## and phasing with Shapeit (http://www.shapeit.fr/) | |
## 2. File wrangling for local ancestry inference with RFMix (https://github.com/slowkoni/rfmix) | |
## | |
## NOTE: 'PopPhased' directory included with RFMix MUST be local to working directory ie. ${ODIR} | |
## reference data was obtained from http://genetics.med.harvard.edu/reichlab/Reich_Lab/Datasets.html | |
## Bin paths | |
PYTHON= | |
PLINK= | |
SHAPEIT= | |
RFMIX= | |
IDIR=/path/to/plink/files | |
ODIR=/out/path | |
NAME=jnomes.hgdp | |
REF_DIR=/path/to/ref/samps | |
## sample VCFs must be converted to plink format and merged with reference data | |
${PLINK} --vcf ${VCF} --biallelic-only strict --make-bed --out ${IDIR}/jnomes | |
${PLINK} --bfile ${IDIR}/jnomes --bmerge ${IDIR}/hgdp --make-bed --out ${IDIR}/${NAME} | |
## split dataset by chr prior to phasing | |
for chr in $(seq 1 22); do | |
${PLINK} --bfile ${IDIR}/${NAME} \ | |
--chr $chr \ | |
--make-bed \ | |
--out ${ODIR}/${NAME}.chr$chr | |
done | |
## remove problem snps | |
## Prior to phasing, individuals or sites with more than 5% missing genotypes and failing strand alignment are excluded. | |
## Monomorphic or singleton markers should be removed, as they are not informative for phasing | |
for chr in $(seq 1 22); do | |
${SHAPEIT} -check --input-bed ${IDIR}/${NAME}.chr${chr}.bed ${IDIR}/${NAME}.chr${chr}.bim ${IDIR}/${NAME}.chr${chr}.fam \ | |
-M ${REF_DIR}/chr${chr}.genetic_map.txt \ | |
--input-ref ${REF_DIR}/chr${chr}.haplotypes.gz \ | |
${REF_DIR}/chr${chr}.legend.gz \ | |
${REF_DIR}/ALL_1000G_phase1integrated_v3.sample \ | |
--output-log ${ODIR}/check${chr} 2>&1 ${ODIR}/logfile.txt ; | |
done | |
## phase | |
for chr in $(seq 1 22); do | |
${SHAPEIT} --input-bed ${IDIR}/${NAME}.chr${chr}.bed ${IDIR}/${NAME}.chr${chr}.bim ${IDIR}/${NAME}.chr${chr}.fam \ | |
-M ${REF_DIR}/chr${chr}.genetic_map.txt \ | |
--input-ref ${REF_DIR}/chr${chr}.haplotypes.gz \ | |
${REF_DIR}/chr${chr}.legend.gz \ | |
${REF_DIR}/ALL_1000G_phase1integrated_v3.sample \ | |
--exclude-snp check${chr}.snp.strand.exclude \ | |
-O ${ODIR}/output${chr} \ | |
--output-log ${ODIR}/real${chr} 2>&1 > ${ODIR}/logfile${chr}.txt ; | |
done | |
## Modify haps files to select populations of interest to compare to admixed (ie. input) samples | |
## split 'haps' files (output from phasing with Shapeit) | |
## person in the nth row has columns (n-1)*2 + 1 and (n-1)*2 + 2 | |
SAMP_FILE=${ODIR}/output.revised.pops.SAMPLE | |
sed 1,2d ${SAMP_FILE} | tr ' ' '\t' | cat -n - | awk '($2 ~ "GEN")' | |
## TO DO: automate column detection and separation of admixed samples | |
## admixed sample column info | |
## 275 GEN15-59-D GEN15-59-D 0 0 0 0 -9 = col 549,550 | |
## 276 GEN15-60-D GEN15-60-D 0 0 0 0 -9 = col 551,552 | |
## 277 GEN15-61-D GEN15-61-D 0 0 0 0 -9 = col 553,554 | |
## 278 GEN15-63-D GEN15-63-D 0 0 0 0 -9 = col 555,556 | |
for chr in $(seq 1 22); do | |
cat ${ODIR}/output${chr}.haps | tr ' ' '\t' | cut -f 1-5 | tr '\t' ' ' > ${ODIR}/output${chr}.haps.header | |
done | |
## remove admixed samples from haps files | |
for chr in $(seq 1 22); do | |
cat ${ODIR}/output${chr}.haps | tr ' ' '\t' | cut -f 1-5 --complement | cut -f 549-556 --complement | tr '\t' ' ' | \ | |
paste ${ODIR}/output${chr}.haps.header - | tr '\t' ' ' > ${ODIR}/ref${chr}.haps | |
done | |
## keep only admixed samples in haps files | |
for chr in $(seq 1 22); do | |
cat ${ODIR}/output${chr}.haps | tr ' ' '\t' | cut -f 1-5 --complement | cut -f 549-556 | tr '\t' ' ' | \ | |
paste ${ODIR}/output${chr}.haps.header - | tr '\t' ' ' > ${ODIR}/admixed${chr}.haps | |
done | |
## sample files | |
sed -n 1,2p ${SAMP_FILE} > ${ODIR}/sample.header | |
## admixed samples | |
cat ${SAMP_FILE} | tr ' ' '\t' | awk '($2 ~ "GEN")' | tr '\t' ' ' | cat ${ODIR}/sample.header - > ${ODIR}/admixed.SAMPLE | |
## reference samples | |
cat ${SAMP_FILE} | tr ' ' '\t' | awk '($2 !~ "GEN")' | tr '\t' ' ' > ${ODIR}/refonly.SAMPLE | |
## edit ref.haps file to include specific populations | |
## BEGIN narrow-cut-haps.sh | |
for chr in $(seq 1 22); do | |
SAMP_FILE=${ODIR}/output.revised.pops.SAMPLE | |
HAPS=${ODIR}/ref${chr}.haps | |
## print a list of columns (1-based) to keep | |
COLS=$(awk 'BEGIN { | |
## List of populations to keep (from first column of sample file) | |
list["Druze" ] = 1 | |
list["French"] = 1 | |
list["Italian" ] = 1 | |
list["Palestinian" ] = 1 | |
list["Russian" ] = 1 | |
list["Adygei" ] = 1 | |
# Keep the first 5 columns of the HAPS file | |
print 1 | |
print 2 | |
print 3 | |
print 4 | |
print 5 | |
} | |
$1 in list { | |
# for each population to keep, print the corresponding columns in the HAPS file. | |
# 1. skip the first 5 columns | |
# 2. assume te input file (in "sample" format) has two header lines, | |
# subtract 3 from the line number (NR) to make column number zero-based | |
# 3. multiply by 2 (as there are two fields for each item in he HAPS file) | |
# 4. add 6 => 5 fields + 1 to make it 1-based again. | |
# add 6+1 for the next field number of the pair. | |
print (NR-3)*2+6 | |
print (NR-3)*2+6+1 | |
}' "$SAMP_FILE") | |
## Combine columns into a comma-separated list | |
COLS=$(echo "$COLS" | paste -d, -s) | |
cut -d' ' --fields "$COLS" "$HAPS" > ${ODIR}/rfmix-narrow${chr}.haps | |
done | |
## END | |
## edit sample file to include ref samples by pop in modified .haps file | |
## wide cut (ie, more populations) | |
cat ${SAMP_FILE} | tr ' ' '\t' | awk '($1 == "Adygei" || $1 =="Druze" || $1 == "French" || $1 == "Han" || \ | |
$1 == "Italian" || $1 == "Luhya" || $1 == "Orcadian" || $1 == "Palestinian" || $1 == "Russian" || $1 == "Yoruba")' | \ | |
tr '\t' ' ' | cat {ODIR}/sample.header - > ${ODIR}/ref.wide.SAMPLE | |
## narrow cut | |
cat ${SAMP_FILE} | tr ' ' '\t' | awk '($1 =="Druze" || $1 == "French" || $1 == "Italian" || \ | |
$1 == "Palestinian" || $1 == "Russian" || $1 == "Adygei")' | tr '\t' ' ' | cat {ODIR}/sample.header - > {ODIR}/ref.narrow.SAMPLE | |
## edit pop labels so that there are 4 classes: | |
## levant = druze/palestinian | |
## sw_eur = french/italian | |
## e_eur = russian | |
## khazar = adygei | |
cat {ODIR}/ref.narrow.SAMPLE | tr ' ' '\t' | awk '{ if ($1 == "Druze" || $1 == "Palestinian") {$1 = "levant"} ; print $0}' | \ | |
awk '{if ($1 == "French" || $1 == "Italian") {$1 = "sw_eur"} ; print $0}' | \ | |
awk '{ if ($1 == "Russian") {$1 = "e_eur"} ; print $0}' | \ | |
awk '{ if ($1 == "Adygei") {$1 = "khazar"} ; print $0}' > {ODIR}/ref.narrow.mod.SAMPLE | |
## 'keep' files=single sample per line | |
cat {ODIR}/admixed.SAMPLE | tr ' ' '\t' | cut -f 2 | sed 1,2d > {ODIR}/admixed.keep | |
#cat {ODIR}/ref.wide.SAMPLE | sed 1,2d | tr ' ' '\t' | cut -f 2 > {ODIR}/ref.wide.keep | |
cat {ODIR}/ref.narrow.mod.SAMPLE | sed 1,2d | tr ' ' '\t' | cut -f 2 > {ODIR}/ref.narrow.keep | |
## shapeit to rfmix by chr: narrow pops | |
## rfmix-narrow${chr}.haps are modified ref${chr}.haps files | |
for chr in $(seq 1 22); do | |
${PYTHON} ${RFMIX}/shapeit2rfmix.py \ | |
--shapeit_hap_ref {ODIR}/rfmix-narrow${chr}.haps \ | |
--shapeit_hap_admixed {ODIR}/admixed${chr}.haps \ | |
--shapeit_sample_ref {ODIR}/ref.narrow.mod.SAMPLE \ | |
--shapeit_sample_admixed {ODIR}/admixed.SAMPLE \ | |
--ref_keep {ODIR}/ref.narrow.keep \ | |
--admixed_keep {ODIR}/admixed.keep \ | |
--chr ${chr} \ | |
--genetic_map ${REF_DIR}/chr${chr}.genetic_map.txt \ | |
--out {ODIR}/${NAME}_${chr}_narrow | |
done | |
## shapeit to rfmix output files must end in "chr${chr}.map" etc.; need to rename due to redundant chr names | |
for chr in $(seq 1 22); do | |
mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.snp_locations ${ODIR}/${NAME}.narrow.chr${chr}.snp_locations | |
done | |
for chr in $(seq 1 22); do | |
mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.map ${ODIR}/${NAME}.narrow.chr${chr}.map | |
done | |
for chr in $(seq 1 22); do | |
mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.alleles ${ODIR}/${NAME}.narrow.chr${chr}.alleles | |
done | |
## edit classes files to assign index to each ref population | |
## this must be done if ref and admixed samples were phased together | |
## --sample argument includes all samples | |
## This 'keep' file is different: ie. first 2 columns of sample files | |
## Sample files are not sample files...they are output from shapeit to rfmix | |
sed 1,2d ${ODIR}/ref.narrow.mod.SAMPLE | tr ' ' '\t' | cut -f 1,2 | tr '\t' ' ' > ${ODIR}/ref.narrow.keep | |
## there must be a keep file for each individual population | |
awk '{print $0 >> $1 ".keep"}' ${ODIR}/ref.narrow.keep | |
for chr in $(seq 1 22); do | |
${PYTHON} ${RFMIX}/classes.py \ | |
--ref ${ODIR}/e_eur.keep,${ODIR}/khazar.keep,${ODIR}/levant.keep,${ODIR}/sw_eur.keep \ | |
--sample ${ODIR}/${NAME}_${chr}_narrow.sample \ | |
--out ${ODIR}/${NAME}_${chr}.narrow.mod.classes | |
done | |
## RUN RFMIX: 2 EM iterations, 50 generations; min node size 5 (min num ref haplotypes per tree node, default=1); bootstrap 0 (for admixed) | |
## must run with 'PopPhased' dir local | |
## node size 5 recommended if ref sample sizes are skewed > 2:1 or if doing EM or if related samples (to reduce variance) | |
#RFMIX_OPTS='-e 2 -n 5 -w 0.2 -G 50 -s 0' | |
## rerun RFMix with larger window; OMNI array has ~600K SNPs and ~300K overlap with jnomes so we only | |
## included about 20 per window before with -w 0.2 (cM); try going up to 1cM | |
RFMIX_OPTS='-e 2 -n 5 -w 1 -G 50 -s 0' | |
for chr in $(seq 1 22); do | |
${PYTHON} ${RFMIX}/RunRFMix.py ${RFMIX_OPTS} \ | |
--num-threads 4 \ | |
--use-reference-panels-in-EM \ | |
--forward-backward \ | |
PopPhased \ | |
${ODIR}/${NAME}.narrow.chr${chr}.alleles \ | |
${ODIR}/${NAME}_${chr}.narrow.mod.classes \ | |
${ODIR}/${NAME}.narrow.chr${chr}.snp_locations \ | |
-o ${ODIR}/${NAME}_chr${chr}.rfmix | |
done | |
## Collapse output | |
## multiple viterbi files indexed by EM iteration = 1 row per snp and 1 col per admixed haplotype | |
## forward-backward = each haplotype gets 1 col per ancestry (posterior probability of that ancestry | |
## at that SNP in that haplotype, for as many ref pops used) | |
## corrected phasings = same format as alleles file (1 row per snp, 1 col per haplotype) but only contains admixed haplotypes | |
## collapse output into bed files and generate karyogram plots | |
## use ${ODIR}/${NAME}.revised.pops.SAMPLE for pop IDs | |
## pop labels need to be in order of "rfmix populations", ie. order of ref pop classes | |
## determined the class IDs by comparing a *narrow.mod.classes file with ref.narrow.mod.SAMPLE: | |
## 0=ashkenazi (ie. admixed samples) | |
## 1=e_eur | |
## 2=khazar | |
## 3=levant | |
## 4=sw_eur | |
## with fbk; default threshold (0.9) | |
SAMP_ID=GEN15-59-D ## admixed sample of interest | |
for chr in $(seq 1 22); do | |
${PYTHON} ${RFMIX}/collapse_ancestry_mod.py \ | |
--rfmix ${ODIR}/${NAME}_chr${chr}.rfmix.2.Viterbi.txt \ | |
--snp_locations ${ODIR}/${NAME}.narrow.${chr}.snp_locations \ | |
--fbk ${ODIR}/${NAME}_${chr}.rfmix.2.ForwardBackward.txt \ | |
--ind ${SAMP_ID} \ | |
--ind_info ${ODIR}/${NAME}_narrow.sample \ | |
--pop_labels e_eur,khazar,levant,sw_eur \ | |
--out ${SAMP_ID} | |
done | |
## note: forward backward is same format as Viterbi except instead of 1 col per | |
## ancestry, each haplotype gets 1 col per ancestry | |
## value is the posterior probability of that ancestry at that SNP in that haplotype | |
## Forward-Backward gives marginal probability for each individual state, Viterbi | |
## gives probability of the most likely sequence of states | |
## no fbk | |
for chr in $(seq 1 22); do | |
${PYTHON} ${RFMIX}/collapse_ancestry_mod.py \ | |
--rfmix ${ODIR}/${NAME}_chr${chr}.rfmix.2.Viterbi.txt \ | |
--snp_locations ${ODIR}/${NAME}.narrow.chr${chr}.snp_locations \ | |
--ind ${SAMP_ID} \ | |
--ind_info ${ODIR}/${NAME}_narrow.sample \ | |
--pop_labels e_eur,khazar,levant,sw_eur \ | |
--out ${ODIR}/${SAMP_ID}_nofbk | |
done | |
## plot karyogram | |
IND='${SAMP_ID}_nofbk'; ${PYTHON} ${RFMIX}/plot_karyogram.py \ | |
--bed_a ${IND}_A.bed \ | |
--bed_b ${IND}_B.bed \ | |
--pop_order e_eur,khazar,levant,sw_eur \ | |
--centromeres ${RFMIX}/centromeres_hg19.bed \ | |
--ind ${IND} \ | |
--out ${IND}.png |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment