dinovski · October 19, 2018 07:32
diff --git a/inferAncestry.sh b/inferAncestry.sh
 #!/bin/bash

 ## 1. Imputation of WGS samples with Impute2 (http://mathgen.stats.ox.ac.uk/impute/impute_v2.html)
 ## and phasing with Shapeit (http://www.shapeit.fr/)
 ## 2. File wrangling for local ancestry inference with RFMix (https://github.com/slowkoni/rfmix)
 ##
 ## NOTE: 'PopPhased' directory included with RFMix MUST be local to working directory ie. ${ODIR}
 ## reference data was obtained from http://genetics.med.harvard.edu/reichlab/Reich_Lab/Datasets.html

 ## Bin paths
 PYTHON=
 PLINK=
 SHAPEIT=
 RFMIX=

 IDIR=/path/to/plink/files
 ODIR=/out/path
 NAME=jnomes.hgdp
 REF_DIR=/path/to/ref/samps

 ## sample VCFs must be converted to plink format and merged with reference data
 ${PLINK} --vcf ${VCF} --biallelic-only strict --make-bed --out ${IDIR}/jnomes
 ${PLINK} --bfile ${IDIR}/jnomes --bmerge ${IDIR}/hgdp --make-bed --out ${IDIR}/${NAME}

 ## split dataset by chr prior to phasing
 for chr in $(seq 1 22); do
    ${PLINK} --bfile ${IDIR}/${NAME} \
    --chr $chr \
    --make-bed \
    --out ${ODIR}/${NAME}.chr$chr
 done

 ## remove problem snps
 ## Prior to phasing, individuals or sites with more than 5% missing genotypes and failing strand alignment are excluded. 
 ## Monomorphic or singleton markers should be removed, as they are not informative for phasing
 for chr in $(seq 1 22); do
    ${SHAPEIT} -check --input-bed ${IDIR}/${NAME}.chr${chr}.bed ${IDIR}/${NAME}.chr${chr}.bim ${IDIR}/${NAME}.chr${chr}.fam \
   -M ${REF_DIR}/chr${chr}.genetic_map.txt \
   --input-ref ${REF_DIR}/chr${chr}.haplotypes.gz \
   ${REF_DIR}/chr${chr}.legend.gz \
   ${REF_DIR}/ALL_1000G_phase1integrated_v3.sample \
   --output-log ${ODIR}/check${chr} 2>&1 ${ODIR}/logfile.txt ;
 done

 ## phase
 for chr in $(seq 1 22); do
    ${SHAPEIT} --input-bed ${IDIR}/${NAME}.chr${chr}.bed ${IDIR}/${NAME}.chr${chr}.bim ${IDIR}/${NAME}.chr${chr}.fam \
    -M ${REF_DIR}/chr${chr}.genetic_map.txt \
   --input-ref ${REF_DIR}/chr${chr}.haplotypes.gz \
   ${REF_DIR}/chr${chr}.legend.gz \
   ${REF_DIR}/ALL_1000G_phase1integrated_v3.sample \
   --exclude-snp check${chr}.snp.strand.exclude \
   -O ${ODIR}/output${chr} \
   --output-log ${ODIR}/real${chr} 2>&1 > ${ODIR}/logfile${chr}.txt ;
 done

 ## Modify haps files to select populations of interest to compare to admixed (ie. input) samples
 ## split 'haps' files (output from phasing with Shapeit) 
 ## person in the nth row has columns (n-1)*2 + 1 and (n-1)*2 + 2
 SAMP_FILE=${ODIR}/output.revised.pops.SAMPLE 
 sed 1,2d ${SAMP_FILE} | tr ' ' '\t' | cat -n - | awk '($2 ~ "GEN")' 

 ## TO DO: automate column detection and separation of admixed samples

 ## admixed sample column info
 ## 275    GEN15-59-D  GEN15-59-D  0   0   0   0   -9 = col 549,550
 ## 276    GEN15-60-D  GEN15-60-D  0   0   0   0   -9 = col 551,552
 ## 277    GEN15-61-D  GEN15-61-D  0   0   0   0   -9 = col 553,554
 ## 278    GEN15-63-D  GEN15-63-D  0   0   0   0   -9 = col 555,556

 for chr in $(seq 1 22); do
 cat ${ODIR}/output${chr}.haps | tr ' ' '\t' | cut -f 1-5 | tr '\t' ' ' > ${ODIR}/output${chr}.haps.header
 done

 ## remove admixed samples from haps files
 for chr in $(seq 1 22); do
 cat ${ODIR}/output${chr}.haps | tr ' ' '\t' | cut -f 1-5 --complement | cut -f 549-556 --complement | tr '\t' ' ' | \
 paste ${ODIR}/output${chr}.haps.header - | tr '\t' ' ' > ${ODIR}/ref${chr}.haps
 done

 ## keep only admixed samples in haps files
 for chr in $(seq 1 22); do
 cat ${ODIR}/output${chr}.haps | tr ' ' '\t' | cut -f 1-5 --complement | cut -f 549-556 | tr '\t' ' ' | \
 paste ${ODIR}/output${chr}.haps.header - | tr '\t' ' ' > ${ODIR}/admixed${chr}.haps
 done

 ## sample files
 sed -n 1,2p ${SAMP_FILE} > ${ODIR}/sample.header

 ## admixed samples
 cat ${SAMP_FILE} | tr ' ' '\t' | awk '($2 ~ "GEN")' | tr '\t' ' ' | cat ${ODIR}/sample.header - > ${ODIR}/admixed.SAMPLE

 ## reference samples
 cat ${SAMP_FILE} |  tr ' ' '\t' | awk '($2 !~ "GEN")' | tr '\t' ' ' > ${ODIR}/refonly.SAMPLE

 ## edit ref.haps file to include specific populations
 ## BEGIN narrow-cut-haps.sh
 for chr in $(seq 1 22); do
 SAMP_FILE=${ODIR}/output.revised.pops.SAMPLE
 HAPS=${ODIR}/ref${chr}.haps

 ## print a list of columns (1-based) to keep
 COLS=$(awk 'BEGIN {
  ## List of populations to keep (from first column of sample file)
  list["Druze" ] = 1
  list["French"] = 1
  list["Italian" ]  = 1
  list["Palestinian" ] = 1
  list["Russian" ] = 1
  list["Adygei" ] = 1


  # Keep the first 5 columns of the HAPS file
  print 1
  print 2
  print 3
  print 4
  print 5
 }
 $1 in list {
  # for each population to keep, print the corresponding columns in the HAPS file.
  # 1. skip the first 5 columns
  # 2. assume te input file (in "sample" format) has two header lines,
  #    subtract 3 from the line number (NR) to make column number zero-based
  # 3. multiply by 2 (as there are two fields for each item in he HAPS file)
  # 4. add 6 => 5 fields + 1 to make it 1-based again.
  #    add 6+1 for the next field number of the pair.
  print (NR-3)*2+6
  print (NR-3)*2+6+1
 }' "$SAMP_FILE")

 ## Combine columns into a comma-separated list
 COLS=$(echo "$COLS" | paste -d, -s)

 cut -d' ' --fields "$COLS" "$HAPS" > ${ODIR}/rfmix-narrow${chr}.haps
 done

 ## END

 ## edit sample file to include ref samples by pop in modified .haps file
 ## wide cut (ie, more populations)
 cat ${SAMP_FILE} | tr ' ' '\t' | awk '($1 == "Adygei" || $1 =="Druze" || $1 == "French" || $1 == "Han" || \
 $1 == "Italian" || $1 == "Luhya" || $1 == "Orcadian" || $1 == "Palestinian" || $1 == "Russian" || $1 == "Yoruba")' | \
 tr '\t' ' ' | cat {ODIR}/sample.header - > ${ODIR}/ref.wide.SAMPLE

 ## narrow cut
 cat ${SAMP_FILE} | tr ' ' '\t' | awk '($1 =="Druze" || $1 == "French" || $1 == "Italian" || \
 $1 == "Palestinian" || $1 == "Russian" || $1 == "Adygei")' | tr '\t' ' ' | cat {ODIR}/sample.header - > {ODIR}/ref.narrow.SAMPLE

 ## edit pop labels so that there are 4 classes: 
 ## levant = druze/palestinian
 ## sw_eur = french/italian
 ## e_eur = russian
 ## khazar = adygei

 cat {ODIR}/ref.narrow.SAMPLE | tr ' ' '\t' | awk '{ if ($1 == "Druze" || $1 == "Palestinian") {$1 = "levant"} ; print $0}' | \
 awk '{if ($1 == "French" || $1 == "Italian") {$1 = "sw_eur"} ; print $0}' | \
 awk '{ if ($1 == "Russian") {$1 = "e_eur"} ; print $0}' | \
 awk '{ if ($1 == "Adygei") {$1 = "khazar"} ; print $0}' > {ODIR}/ref.narrow.mod.SAMPLE

 ## 'keep' files=single sample per line
 cat {ODIR}/admixed.SAMPLE | tr ' ' '\t' | cut -f 2 | sed 1,2d > {ODIR}/admixed.keep

 #cat {ODIR}/ref.wide.SAMPLE | sed 1,2d | tr ' ' '\t' | cut -f 2 > {ODIR}/ref.wide.keep
 cat {ODIR}/ref.narrow.mod.SAMPLE | sed 1,2d | tr ' ' '\t' | cut -f 2 > {ODIR}/ref.narrow.keep

 ## shapeit to rfmix by chr: narrow pops
 ## rfmix-narrow${chr}.haps are modified ref${chr}.haps files

 for chr in $(seq 1 22); do
    ${PYTHON} ${RFMIX}/shapeit2rfmix.py \
    --shapeit_hap_ref {ODIR}/rfmix-narrow${chr}.haps \
    --shapeit_hap_admixed {ODIR}/admixed${chr}.haps \
    --shapeit_sample_ref {ODIR}/ref.narrow.mod.SAMPLE \
    --shapeit_sample_admixed {ODIR}/admixed.SAMPLE \
    --ref_keep {ODIR}/ref.narrow.keep  \
    --admixed_keep {ODIR}/admixed.keep \
    --chr ${chr} \
    --genetic_map ${REF_DIR}/chr${chr}.genetic_map.txt \
    --out {ODIR}/${NAME}_${chr}_narrow
 done

 ## shapeit to rfmix output files must end in  "chr${chr}.map" etc.; need to rename due to redundant chr names
 for chr in $(seq 1 22); do
 mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.snp_locations ${ODIR}/${NAME}.narrow.chr${chr}.snp_locations
 done

 for chr in $(seq 1 22); do
 mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.map ${ODIR}/${NAME}.narrow.chr${chr}.map
 done

 for chr in $(seq 1 22); do
 mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.alleles ${ODIR}/${NAME}.narrow.chr${chr}.alleles
 done

 ## edit classes files to assign index to each ref population
 ## this must be done if ref and admixed samples were phased together
 ## --sample argument includes all samples
 ## This 'keep' file is different: ie. first 2 columns of sample files
 ## Sample files are not sample files...they are output from shapeit to rfmix

 sed 1,2d ${ODIR}/ref.narrow.mod.SAMPLE | tr ' ' '\t' | cut -f 1,2 | tr '\t' ' ' > ${ODIR}/ref.narrow.keep

 ## there must be a keep file for each individual population
 awk '{print $0 >> $1 ".keep"}' ${ODIR}/ref.narrow.keep

 for chr in $(seq 1 22); do
    ${PYTHON} ${RFMIX}/classes.py \
    --ref ${ODIR}/e_eur.keep,${ODIR}/khazar.keep,${ODIR}/levant.keep,${ODIR}/sw_eur.keep \
    --sample ${ODIR}/${NAME}_${chr}_narrow.sample \
    --out ${ODIR}/${NAME}_${chr}.narrow.mod.classes
 done

 ## RUN RFMIX: 2 EM iterations, 50 generations; min node size 5 (min num ref haplotypes per tree node, default=1); bootstrap 0 (for admixed)
 ## must run with 'PopPhased' dir local
 ## node size 5 recommended if ref sample sizes are skewed > 2:1 or if doing EM or if related samples (to reduce variance) 

 #RFMIX_OPTS='-e 2 -n 5 -w 0.2 -G 50 -s 0'
 ## rerun RFMix with larger window; OMNI array has ~600K SNPs and ~300K overlap with jnomes so we only 
 ## included about 20 per window before with -w 0.2 (cM); try going up to 1cM
 RFMIX_OPTS='-e 2 -n 5 -w 1 -G 50 -s 0'

 for chr in $(seq 1 22); do
        ${PYTHON} ${RFMIX}/RunRFMix.py ${RFMIX_OPTS} \
        --num-threads 4 \
        --use-reference-panels-in-EM \
        --forward-backward \
        PopPhased \
        ${ODIR}/${NAME}.narrow.chr${chr}.alleles \
        ${ODIR}/${NAME}_${chr}.narrow.mod.classes \
        ${ODIR}/${NAME}.narrow.chr${chr}.snp_locations \
        -o ${ODIR}/${NAME}_chr${chr}.rfmix
 done

 ## Collapse output
 ## multiple viterbi files indexed by EM iteration = 1 row per snp and 1 col per admixed haplotype
 ## forward-backward = each haplotype gets 1 col per ancestry (posterior probability of that ancestry 
 ## at that SNP in that haplotype, for as many ref pops used)
 ## corrected phasings = same format as alleles file (1 row per snp, 1 col per haplotype) but only contains admixed haplotypes

 ## collapse output into bed files and generate karyogram plots
 ## use ${ODIR}/${NAME}.revised.pops.SAMPLE for pop IDs

 ## pop labels need to be in order of "rfmix populations", ie. order of ref pop classes
 ## determined the class IDs by comparing a *narrow.mod.classes file with ref.narrow.mod.SAMPLE:

 ## 0=ashkenazi (ie. admixed samples)
 ## 1=e_eur
 ## 2=khazar
 ## 3=levant
 ## 4=sw_eur

 ## with fbk; default threshold (0.9)
 SAMP_ID=GEN15-59-D ## admixed sample of interest

 for chr in $(seq 1 22); do
    ${PYTHON} ${RFMIX}/collapse_ancestry_mod.py \
    --rfmix ${ODIR}/${NAME}_chr${chr}.rfmix.2.Viterbi.txt  \
    --snp_locations ${ODIR}/${NAME}.narrow.${chr}.snp_locations \
    --fbk ${ODIR}/${NAME}_${chr}.rfmix.2.ForwardBackward.txt \
    --ind ${SAMP_ID} \
    --ind_info ${ODIR}/${NAME}_narrow.sample \
    --pop_labels e_eur,khazar,levant,sw_eur \
    --out ${SAMP_ID}
 done

 ## note: forward backward is same format as Viterbi except instead of 1 col per 
 ## ancestry, each haplotype gets 1 col per ancestry
 ## value is the posterior probability of that ancestry at that SNP in that haplotype
 ## Forward-Backward gives marginal probability for each individual state, Viterbi 
 ## gives probability of the most likely sequence of states

 ## no fbk
 for chr in $(seq 1 22); do
    ${PYTHON} ${RFMIX}/collapse_ancestry_mod.py \
    --rfmix ${ODIR}/${NAME}_chr${chr}.rfmix.2.Viterbi.txt  \
    --snp_locations ${ODIR}/${NAME}.narrow.chr${chr}.snp_locations \
    --ind ${SAMP_ID} \
    --ind_info ${ODIR}/${NAME}_narrow.sample \
    --pop_labels e_eur,khazar,levant,sw_eur \
    --out ${ODIR}/${SAMP_ID}_nofbk
 done

 ## plot karyogram
 IND='${SAMP_ID}_nofbk'; ${PYTHON} ${RFMIX}/plot_karyogram.py \
 --bed_a ${IND}_A.bed \
 --bed_b ${IND}_B.bed \
 --pop_order e_eur,khazar,levant,sw_eur \
 --centromeres ${RFMIX}/centromeres_hg19.bed \
 --ind ${IND} \
 --out ${IND}.png
	#!/bin/bash

	## 1. Imputation of WGS samples with Impute2 (http://mathgen.stats.ox.ac.uk/impute/impute_v2.html)
	## and phasing with Shapeit (http://www.shapeit.fr/)
	## 2. File wrangling for local ancestry inference with RFMix (https://github.com/slowkoni/rfmix)
	##
	## NOTE: 'PopPhased' directory included with RFMix MUST be local to working directory ie. ${ODIR}
	## reference data was obtained from http://genetics.med.harvard.edu/reichlab/Reich_Lab/Datasets.html

	## Bin paths
	PYTHON=
	PLINK=
	SHAPEIT=
	RFMIX=

	IDIR=/path/to/plink/files
	ODIR=/out/path
	NAME=jnomes.hgdp
	REF_DIR=/path/to/ref/samps

	## sample VCFs must be converted to plink format and merged with reference data
	${PLINK} --vcf ${VCF} --biallelic-only strict --make-bed --out ${IDIR}/jnomes
	${PLINK} --bfile ${IDIR}/jnomes --bmerge ${IDIR}/hgdp --make-bed --out ${IDIR}/${NAME}

	## split dataset by chr prior to phasing
	for chr in $(seq 1 22); do
	${PLINK} --bfile ${IDIR}/${NAME} \
	--chr $chr \
	--make-bed \
	--out ${ODIR}/${NAME}.chr$chr
	done

	## remove problem snps
	## Prior to phasing, individuals or sites with more than 5% missing genotypes and failing strand alignment are excluded.
	## Monomorphic or singleton markers should be removed, as they are not informative for phasing
	for chr in $(seq 1 22); do
	${SHAPEIT} -check --input-bed ${IDIR}/${NAME}.chr${chr}.bed ${IDIR}/${NAME}.chr${chr}.bim ${IDIR}/${NAME}.chr${chr}.fam \
	-M ${REF_DIR}/chr${chr}.genetic_map.txt \
	--input-ref ${REF_DIR}/chr${chr}.haplotypes.gz \
	${REF_DIR}/chr${chr}.legend.gz \
	${REF_DIR}/ALL_1000G_phase1integrated_v3.sample \
	--output-log ${ODIR}/check${chr} 2>&1 ${ODIR}/logfile.txt ;
	done

	## phase
	for chr in $(seq 1 22); do
	${SHAPEIT} --input-bed ${IDIR}/${NAME}.chr${chr}.bed ${IDIR}/${NAME}.chr${chr}.bim ${IDIR}/${NAME}.chr${chr}.fam \
	-M ${REF_DIR}/chr${chr}.genetic_map.txt \
	--input-ref ${REF_DIR}/chr${chr}.haplotypes.gz \
	${REF_DIR}/chr${chr}.legend.gz \
	${REF_DIR}/ALL_1000G_phase1integrated_v3.sample \
	--exclude-snp check${chr}.snp.strand.exclude \
	-O ${ODIR}/output${chr} \
	--output-log ${ODIR}/real${chr} 2>&1 > ${ODIR}/logfile${chr}.txt ;
	done

	## Modify haps files to select populations of interest to compare to admixed (ie. input) samples
	## split 'haps' files (output from phasing with Shapeit)
	## person in the nth row has columns (n-1)2 + 1 and (n-1)2 + 2
	SAMP_FILE=${ODIR}/output.revised.pops.SAMPLE
	sed 1,2d ${SAMP_FILE} \| tr ' ' '\t' \| cat -n - \| awk '($2 ~ "GEN")'

	## TO DO: automate column detection and separation of admixed samples

	## admixed sample column info
	## 275 GEN15-59-D GEN15-59-D 0 0 0 0 -9 = col 549,550
	## 276 GEN15-60-D GEN15-60-D 0 0 0 0 -9 = col 551,552
	## 277 GEN15-61-D GEN15-61-D 0 0 0 0 -9 = col 553,554
	## 278 GEN15-63-D GEN15-63-D 0 0 0 0 -9 = col 555,556

	for chr in $(seq 1 22); do
	cat ${ODIR}/output${chr}.haps \| tr ' ' '\t' \| cut -f 1-5 \| tr '\t' ' ' > ${ODIR}/output${chr}.haps.header
	done

	## remove admixed samples from haps files
	for chr in $(seq 1 22); do
	cat ${ODIR}/output${chr}.haps \| tr ' ' '\t' \| cut -f 1-5 --complement \| cut -f 549-556 --complement \| tr '\t' ' ' \| \
	paste ${ODIR}/output${chr}.haps.header - \| tr '\t' ' ' > ${ODIR}/ref${chr}.haps
	done

	## keep only admixed samples in haps files
	for chr in $(seq 1 22); do
	cat ${ODIR}/output${chr}.haps \| tr ' ' '\t' \| cut -f 1-5 --complement \| cut -f 549-556 \| tr '\t' ' ' \| \
	paste ${ODIR}/output${chr}.haps.header - \| tr '\t' ' ' > ${ODIR}/admixed${chr}.haps
	done

	## sample files
	sed -n 1,2p ${SAMP_FILE} > ${ODIR}/sample.header

	## admixed samples
	cat ${SAMP_FILE} \| tr ' ' '\t' \| awk '($2 ~ "GEN")' \| tr '\t' ' ' \| cat ${ODIR}/sample.header - > ${ODIR}/admixed.SAMPLE

	## reference samples
	cat ${SAMP_FILE} \| tr ' ' '\t' \| awk '($2 !~ "GEN")' \| tr '\t' ' ' > ${ODIR}/refonly.SAMPLE

	## edit ref.haps file to include specific populations
	## BEGIN narrow-cut-haps.sh
	for chr in $(seq 1 22); do
	SAMP_FILE=${ODIR}/output.revised.pops.SAMPLE
	HAPS=${ODIR}/ref${chr}.haps

	## print a list of columns (1-based) to keep
	COLS=$(awk 'BEGIN {
	## List of populations to keep (from first column of sample file)
	list["Druze" ] = 1
	list["French"] = 1
	list["Italian" ] = 1
	list["Palestinian" ] = 1
	list["Russian" ] = 1
	list["Adygei" ] = 1


	# Keep the first 5 columns of the HAPS file
	print 1
	print 2
	print 3
	print 4
	print 5
	}
	$1 in list {
	# for each population to keep, print the corresponding columns in the HAPS file.
	# 1. skip the first 5 columns
	# 2. assume te input file (in "sample" format) has two header lines,
	# subtract 3 from the line number (NR) to make column number zero-based
	# 3. multiply by 2 (as there are two fields for each item in he HAPS file)
	# 4. add 6 => 5 fields + 1 to make it 1-based again.
	# add 6+1 for the next field number of the pair.
	print (NR-3)*2+6
	print (NR-3)*2+6+1
	}' "$SAMP_FILE")

	## Combine columns into a comma-separated list
	COLS=$(echo "$COLS" \| paste -d, -s)

	cut -d' ' --fields "$COLS" "$HAPS" > ${ODIR}/rfmix-narrow${chr}.haps
	done

	## END

	## edit sample file to include ref samples by pop in modified .haps file
	## wide cut (ie, more populations)
	cat ${SAMP_FILE} \| tr ' ' '\t' \| awk '($1 == "Adygei" \|\| $1 =="Druze" \|\| $1 == "French" \|\| $1 == "Han" \|\| \
	$1 == "Italian" \|\| $1 == "Luhya" \|\| $1 == "Orcadian" \|\| $1 == "Palestinian" \|\| $1 == "Russian" \|\| $1 == "Yoruba")' \| \
	tr '\t' ' ' \| cat {ODIR}/sample.header - > ${ODIR}/ref.wide.SAMPLE

	## narrow cut
	cat ${SAMP_FILE} \| tr ' ' '\t' \| awk '($1 =="Druze" \|\| $1 == "French" \|\| $1 == "Italian" \|\| \
	$1 == "Palestinian" \|\| $1 == "Russian" \|\| $1 == "Adygei")' \| tr '\t' ' ' \| cat {ODIR}/sample.header - > {ODIR}/ref.narrow.SAMPLE

	## edit pop labels so that there are 4 classes:
	## levant = druze/palestinian
	## sw_eur = french/italian
	## e_eur = russian
	## khazar = adygei

	cat {ODIR}/ref.narrow.SAMPLE \| tr ' ' '\t' \| awk '{ if ($1 == "Druze" \|\| $1 == "Palestinian") {$1 = "levant"} ; print $0}' \| \
	awk '{if ($1 == "French" \|\| $1 == "Italian") {$1 = "sw_eur"} ; print $0}' \| \
	awk '{ if ($1 == "Russian") {$1 = "e_eur"} ; print $0}' \| \
	awk '{ if ($1 == "Adygei") {$1 = "khazar"} ; print $0}' > {ODIR}/ref.narrow.mod.SAMPLE

	## 'keep' files=single sample per line
	cat {ODIR}/admixed.SAMPLE \| tr ' ' '\t' \| cut -f 2 \| sed 1,2d > {ODIR}/admixed.keep

	#cat {ODIR}/ref.wide.SAMPLE \| sed 1,2d \| tr ' ' '\t' \| cut -f 2 > {ODIR}/ref.wide.keep
	cat {ODIR}/ref.narrow.mod.SAMPLE \| sed 1,2d \| tr ' ' '\t' \| cut -f 2 > {ODIR}/ref.narrow.keep

	## shapeit to rfmix by chr: narrow pops
	## rfmix-narrow${chr}.haps are modified ref${chr}.haps files

	for chr in $(seq 1 22); do
	${PYTHON} ${RFMIX}/shapeit2rfmix.py \
	--shapeit_hap_ref {ODIR}/rfmix-narrow${chr}.haps \
	--shapeit_hap_admixed {ODIR}/admixed${chr}.haps \
	--shapeit_sample_ref {ODIR}/ref.narrow.mod.SAMPLE \
	--shapeit_sample_admixed {ODIR}/admixed.SAMPLE \
	--ref_keep {ODIR}/ref.narrow.keep \
	--admixed_keep {ODIR}/admixed.keep \
	--chr ${chr} \
	--genetic_map ${REF_DIR}/chr${chr}.genetic_map.txt \
	--out {ODIR}/${NAME}_${chr}_narrow
	done

	## shapeit to rfmix output files must end in "chr${chr}.map" etc.; need to rename due to redundant chr names
	for chr in $(seq 1 22); do
	mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.snp_locations ${ODIR}/${NAME}.narrow.chr${chr}.snp_locations
	done

	for chr in $(seq 1 22); do
	mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.map ${ODIR}/${NAME}.narrow.chr${chr}.map
	done

	for chr in $(seq 1 22); do
	mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.alleles ${ODIR}/${NAME}.narrow.chr${chr}.alleles
	done

	## edit classes files to assign index to each ref population
	## this must be done if ref and admixed samples were phased together
	## --sample argument includes all samples
	## This 'keep' file is different: ie. first 2 columns of sample files
	## Sample files are not sample files...they are output from shapeit to rfmix

	sed 1,2d ${ODIR}/ref.narrow.mod.SAMPLE \| tr ' ' '\t' \| cut -f 1,2 \| tr '\t' ' ' > ${ODIR}/ref.narrow.keep

	## there must be a keep file for each individual population
	awk '{print $0 >> $1 ".keep"}' ${ODIR}/ref.narrow.keep

	for chr in $(seq 1 22); do
	${PYTHON} ${RFMIX}/classes.py \
	--ref ${ODIR}/e_eur.keep,${ODIR}/khazar.keep,${ODIR}/levant.keep,${ODIR}/sw_eur.keep \
	--sample ${ODIR}/${NAME}_${chr}_narrow.sample \
	--out ${ODIR}/${NAME}_${chr}.narrow.mod.classes
	done

	## RUN RFMIX: 2 EM iterations, 50 generations; min node size 5 (min num ref haplotypes per tree node, default=1); bootstrap 0 (for admixed)
	## must run with 'PopPhased' dir local
	## node size 5 recommended if ref sample sizes are skewed > 2:1 or if doing EM or if related samples (to reduce variance)

	#RFMIX_OPTS='-e 2 -n 5 -w 0.2 -G 50 -s 0'
	## rerun RFMix with larger window; OMNI array has ~600K SNPs and ~300K overlap with jnomes so we only
	## included about 20 per window before with -w 0.2 (cM); try going up to 1cM
	RFMIX_OPTS='-e 2 -n 5 -w 1 -G 50 -s 0'

	for chr in $(seq 1 22); do
	${PYTHON} ${RFMIX}/RunRFMix.py ${RFMIX_OPTS} \
	--num-threads 4 \
	--use-reference-panels-in-EM \
	--forward-backward \
	PopPhased \
	${ODIR}/${NAME}.narrow.chr${chr}.alleles \
	${ODIR}/${NAME}_${chr}.narrow.mod.classes \
	${ODIR}/${NAME}.narrow.chr${chr}.snp_locations \
	-o ${ODIR}/${NAME}_chr${chr}.rfmix
	done

	## Collapse output
	## multiple viterbi files indexed by EM iteration = 1 row per snp and 1 col per admixed haplotype
	## forward-backward = each haplotype gets 1 col per ancestry (posterior probability of that ancestry
	## at that SNP in that haplotype, for as many ref pops used)
	## corrected phasings = same format as alleles file (1 row per snp, 1 col per haplotype) but only contains admixed haplotypes

	## collapse output into bed files and generate karyogram plots
	## use ${ODIR}/${NAME}.revised.pops.SAMPLE for pop IDs

	## pop labels need to be in order of "rfmix populations", ie. order of ref pop classes
	## determined the class IDs by comparing a *narrow.mod.classes file with ref.narrow.mod.SAMPLE:

	## 0=ashkenazi (ie. admixed samples)
	## 1=e_eur
	## 2=khazar
	## 3=levant
	## 4=sw_eur

	## with fbk; default threshold (0.9)
	SAMP_ID=GEN15-59-D ## admixed sample of interest

	for chr in $(seq 1 22); do
	${PYTHON} ${RFMIX}/collapse_ancestry_mod.py \
	--rfmix ${ODIR}/${NAME}_chr${chr}.rfmix.2.Viterbi.txt \
	--snp_locations ${ODIR}/${NAME}.narrow.${chr}.snp_locations \
	--fbk ${ODIR}/${NAME}_${chr}.rfmix.2.ForwardBackward.txt \
	--ind ${SAMP_ID} \
	--ind_info ${ODIR}/${NAME}_narrow.sample \
	--pop_labels e_eur,khazar,levant,sw_eur \
	--out ${SAMP_ID}
	done

	## note: forward backward is same format as Viterbi except instead of 1 col per
	## ancestry, each haplotype gets 1 col per ancestry
	## value is the posterior probability of that ancestry at that SNP in that haplotype
	## Forward-Backward gives marginal probability for each individual state, Viterbi
	## gives probability of the most likely sequence of states

	## no fbk
	for chr in $(seq 1 22); do
	${PYTHON} ${RFMIX}/collapse_ancestry_mod.py \
	--rfmix ${ODIR}/${NAME}_chr${chr}.rfmix.2.Viterbi.txt \
	--snp_locations ${ODIR}/${NAME}.narrow.chr${chr}.snp_locations \
	--ind ${SAMP_ID} \
	--ind_info ${ODIR}/${NAME}_narrow.sample \
	--pop_labels e_eur,khazar,levant,sw_eur \
	--out ${ODIR}/${SAMP_ID}_nofbk
	done

	## plot karyogram
	IND='${SAMP_ID}_nofbk'; ${PYTHON} ${RFMIX}/plot_karyogram.py \
	--bed_a ${IND}_A.bed \
	--bed_b ${IND}_B.bed \
	--pop_order e_eur,khazar,levant,sw_eur \
	--centromeres ${RFMIX}/centromeres_hg19.bed \
	--ind ${IND} \
	--out ${IND}.png
No results found