Last active
August 29, 2015 14:21
-
-
Save brevans/6c3b099daeefcb43c3f8 to your computer and use it in GitHub Desktop.
export2plink.py prefix pop_file snp_list AxiomGT1.calls.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import csv | |
import re | |
import sys | |
from collections import OrderedDict as od | |
def translate(annots, gt_row): | |
#uses annotation file to translate from 0/1/2/-1 to ACGT | |
ps_id = gt_row[0] | |
t_gts = [] | |
for gt in gt_row[1:]: | |
A = annots[ps_id][2] | |
B = annots[ps_id][3] | |
#trans = {'AA':A+' '+A, 'AB':A+' '+B, 'BB':B+' '+B, 'NoCall':'0 0'} | |
trans = {'0':A+' '+A, '1':A+' '+B, '2':B+' '+B, '-1':'0 0'} | |
t_gts.append(trans[gt]) | |
return [ps_id]+t_gts | |
annots = {} | |
gts = [] | |
try: | |
assert len(sys.argv) == 5 | |
prefix = sys.argv[1] | |
pop_fn = sys.argv[2] | |
snp_list_fn = sys.argv[3] | |
export_file = sys.argv[4] | |
except Exception as e: | |
print(''' | |
Script use: | |
export2plink.py prefix_of_plink_files populations_file snp_list_file AxiomGT1.calls_file | |
-will exclude all SNPs in the snp list file | |
-will only keep individuals in the populations file | |
-populations file is format: | |
individual_ID<TAB>population | |
''') | |
sys.exit(1) | |
with open("CsvAnnotationFile.v1.txt") as annotfile: | |
annot_reader = csv.reader(annotfile, delimiter=",", quotechar='"') | |
for row in annot_reader: | |
if not row[0].startswith('#'): | |
#all[0] will be the header | |
#Probe Set ID: [Chromosome,Physical Position,Allele A,Allele B] | |
annots[row[0]] = [row[x] for x in [3,4,9,10]] | |
pops = od() | |
with open(pop_fn) as pop_file: | |
for l in pop_file: | |
tmp = l.rstrip().split('\t') | |
pops[tmp[0]]=tmp[1] | |
snp_list = set() | |
with open(snp_list_fn) as snp_list_file: | |
for l in snp_list_file: | |
snp_list.add(l.rstrip()) | |
keepers = [0] | |
sample_names = [] | |
with open(export_file) as gtfile: | |
gt_reader = csv.reader(gtfile, delimiter="\t") | |
for row in gt_reader: | |
if not row[0].startswith('#'): | |
if row[0] == "probeset_id": | |
for i, ind_tmp in enumerate(row[1:]): | |
ind = re.sub('(\.CEL)', '', ind_tmp) | |
if ind in pops: | |
keepers.append(i+1) | |
sample_names.append(ind) | |
else: | |
if row[0] not in snp_list: | |
gts.append(translate(annots, [row[x] for x in keepers])) | |
''' | |
The PED file is a white-space (space or tab) delimited file. | |
the first six columns are mandatory: | |
Family ID | |
Individual ID | |
Paternal ID | |
Maternal ID | |
Sex (1=male; 2=female; other=unknown) | |
Phenotype | |
Genotypes (column 7 onwards) should also be white-space delimited; | |
they can be any character (e.g. 1,2,3,4 or A,C,G,T or anything else) | |
except 0 which is, by default, the missing genotype character. | |
All markers should be biallelic. | |
The MAP file describes a single marker and must contain exactly 4 columns: | |
chromosome (1-22, X, Y or 0 if unplaced) | |
rs# or snp identifier | |
Genetic distance (morgans) | |
Base-pair position (bp units) | |
''' | |
with open(prefix+'.ped', 'w') as ped_file, open(prefix+'.map', 'w') as map_file: | |
ped_dict = dict(zip(sample_names, [[pops[x].replace(' ', '_'),x,'0','0','0','0'] for x in sample_names])) | |
#ped_dict = dict(zip(sample_names, [[pops[x],x,'0','0','0','0'] for x in sample_names])) | |
for gt_row in gts: | |
ps_id = gt_row[0] | |
sc = re.sub('SC1\.', '', annots[ps_id][0]) | |
map_file.write('\t'.join([annots[ps_id][0], ps_id, '0', annots[ps_id][1]])+'\n') | |
for ind, gt in zip(sample_names, gt_row[1:]): | |
ped_dict[ind].append(gt) | |
for ind in pops: | |
if ind in ped_dict: | |
ped_file.write('\t'.join(ped_dict[ind])+'\n') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment