brevans · August 29, 2015 14:21
diff --git a/export2plink.py b/export2plink.py
 #!/usr/bin/env python
 import csv
 import re
 import sys
 from collections import OrderedDict as od

 def translate(annots, gt_row):
    #uses annotation file to translate from 0/1/2/-1 to ACGT
    ps_id = gt_row[0]
    t_gts = []
    for gt in gt_row[1:]:
        A = annots[ps_id][2]
        B = annots[ps_id][3]
        #trans = {'AA':A+' '+A, 'AB':A+' '+B, 'BB':B+' '+B, 'NoCall':'0 0'}
        trans = {'0':A+' '+A, '1':A+' '+B, '2':B+' '+B, '-1':'0 0'}
        t_gts.append(trans[gt])
    return [ps_id]+t_gts
    
 annots = {}
 gts = []
 try:
    assert len(sys.argv) == 5
    prefix = sys.argv[1]
    pop_fn = sys.argv[2]
    snp_list_fn = sys.argv[3]
    export_file = sys.argv[4]
 except Exception as e:
    print('''
    Script use:
    export2plink.py prefix_of_plink_files populations_file snp_list_file AxiomGT1.calls_file
    
    -will exclude all SNPs in the snp list file
    -will only keep individuals in the populations file
        -populations file is format:
            individual_ID<TAB>population
    ''')
    sys.exit(1)

 with open("CsvAnnotationFile.v1.txt") as annotfile:
    annot_reader = csv.reader(annotfile, delimiter=",", quotechar='"')
    for row in annot_reader:
        if not row[0].startswith('#'):
            #all[0] will be the header
            #Probe Set ID: [Chromosome,Physical Position,Allele A,Allele B]
            annots[row[0]] = [row[x] for x in [3,4,9,10]]

 pops = od()
 with open(pop_fn) as pop_file:
    for l in pop_file:
        tmp = l.rstrip().split('\t')
        pops[tmp[0]]=tmp[1]

 snp_list = set()
 with open(snp_list_fn) as snp_list_file:
    for l in snp_list_file:
        snp_list.add(l.rstrip())

 keepers = [0]
 sample_names = []
 with open(export_file) as gtfile:
    gt_reader = csv.reader(gtfile, delimiter="\t")
    for row in gt_reader:
        if not row[0].startswith('#'):
            if row[0] == "probeset_id":
                for i, ind_tmp in enumerate(row[1:]):
                    ind = re.sub('(\.CEL)', '', ind_tmp)
                    if ind in pops:
                        keepers.append(i+1)
                        sample_names.append(ind)
            else:
                if row[0] not in snp_list:
                    gts.append(translate(annots, [row[x] for x in keepers]))

 '''
 The PED file is a white-space (space or tab) delimited file.
 the first six columns are mandatory:
     Family ID
     Individual ID
     Paternal ID
     Maternal ID
     Sex (1=male; 2=female; other=unknown)
     Phenotype
 Genotypes (column 7 onwards) should also be white-space delimited; 
 they can be any character (e.g. 1,2,3,4 or A,C,G,T or anything else) 
 except 0 which is, by default, the missing genotype character. 
 All markers should be biallelic.

 The MAP file describes a single marker and must contain exactly 4 columns:
     chromosome (1-22, X, Y or 0 if unplaced)
     rs# or snp identifier
     Genetic distance (morgans)
     Base-pair position (bp units)
 '''

 with open(prefix+'.ped', 'w') as ped_file, open(prefix+'.map', 'w') as map_file:

    ped_dict = dict(zip(sample_names, [[pops[x].replace(' ', '_'),x,'0','0','0','0'] for x in sample_names]))
    #ped_dict = dict(zip(sample_names, [[pops[x],x,'0','0','0','0'] for x in sample_names]))

    for gt_row in gts:
        ps_id = gt_row[0]
        sc = re.sub('SC1\.', '', annots[ps_id][0])
        map_file.write('\t'.join([annots[ps_id][0], ps_id, '0', annots[ps_id][1]])+'\n')
        for ind, gt in zip(sample_names, gt_row[1:]):
            ped_dict[ind].append(gt)

    for ind in pops:
        if ind in ped_dict:
            ped_file.write('\t'.join(ped_dict[ind])+'\n')
	#!/usr/bin/env python
	import csv
	import re
	import sys
	from collections import OrderedDict as od

	def translate(annots, gt_row):
	#uses annotation file to translate from 0/1/2/-1 to ACGT
	ps_id = gt_row[0]
	t_gts = []
	for gt in gt_row[1:]:
	A = annots[ps_id][2]
	B = annots[ps_id][3]
	#trans = {'AA':A+' '+A, 'AB':A+' '+B, 'BB':B+' '+B, 'NoCall':'0 0'}
	trans = {'0':A+' '+A, '1':A+' '+B, '2':B+' '+B, '-1':'0 0'}
	t_gts.append(trans[gt])
	return [ps_id]+t_gts

	annots = {}
	gts = []
	try:
	assert len(sys.argv) == 5
	prefix = sys.argv[1]
	pop_fn = sys.argv[2]
	snp_list_fn = sys.argv[3]
	export_file = sys.argv[4]
	except Exception as e:
	print('''
	Script use:
	export2plink.py prefix_of_plink_files populations_file snp_list_file AxiomGT1.calls_file

	-will exclude all SNPs in the snp list file
	-will only keep individuals in the populations file
	-populations file is format:
	individual_ID<TAB>population
	''')
	sys.exit(1)

	with open("CsvAnnotationFile.v1.txt") as annotfile:
	annot_reader = csv.reader(annotfile, delimiter=",", quotechar='"')
	for row in annot_reader:
	if not row[0].startswith('#'):
	#all[0] will be the header
	#Probe Set ID: [Chromosome,Physical Position,Allele A,Allele B]
	annots[row[0]] = [row[x] for x in [3,4,9,10]]

	pops = od()
	with open(pop_fn) as pop_file:
	for l in pop_file:
	tmp = l.rstrip().split('\t')
	pops[tmp[0]]=tmp[1]

	snp_list = set()
	with open(snp_list_fn) as snp_list_file:
	for l in snp_list_file:
	snp_list.add(l.rstrip())

	keepers = [0]
	sample_names = []
	with open(export_file) as gtfile:
	gt_reader = csv.reader(gtfile, delimiter="\t")
	for row in gt_reader:
	if not row[0].startswith('#'):
	if row[0] == "probeset_id":
	for i, ind_tmp in enumerate(row[1:]):
	ind = re.sub('(\.CEL)', '', ind_tmp)
	if ind in pops:
	keepers.append(i+1)
	sample_names.append(ind)
	else:
	if row[0] not in snp_list:
	gts.append(translate(annots, [row[x] for x in keepers]))

	'''
	The PED file is a white-space (space or tab) delimited file.
	the first six columns are mandatory:
	Family ID
	Individual ID
	Paternal ID
	Maternal ID
	Sex (1=male; 2=female; other=unknown)
	Phenotype
	Genotypes (column 7 onwards) should also be white-space delimited;
	they can be any character (e.g. 1,2,3,4 or A,C,G,T or anything else)
	except 0 which is, by default, the missing genotype character.
	All markers should be biallelic.

	The MAP file describes a single marker and must contain exactly 4 columns:
	chromosome (1-22, X, Y or 0 if unplaced)
	rs# or snp identifier
	Genetic distance (morgans)
	Base-pair position (bp units)
	'''

	with open(prefix+'.ped', 'w') as ped_file, open(prefix+'.map', 'w') as map_file:

	ped_dict = dict(zip(sample_names, [[pops[x].replace(' ', '_'),x,'0','0','0','0'] for x in sample_names]))
	#ped_dict = dict(zip(sample_names, [[pops[x],x,'0','0','0','0'] for x in sample_names]))

	for gt_row in gts:
	ps_id = gt_row[0]
	sc = re.sub('SC1\.', '', annots[ps_id][0])
	map_file.write('\t'.join([annots[ps_id][0], ps_id, '0', annots[ps_id][1]])+'\n')
	for ind, gt in zip(sample_names, gt_row[1:]):
	ped_dict[ind].append(gt)

	for ind in pops:
	if ind in ped_dict:
	ped_file.write('\t'.join(ped_dict[ind])+'\n')