brentp · July 22, 2016 19:27
diff --git a/crap2ped.py b/crap2ped.py
 import csv
 import re
 import sys
 import os.path
 import string

 # Integer code emitted for each recognised sex label; the row loop looks
 # labels up after lower-casing and stripping them.
 sex_replace = {
    'm': 1,
    'male': 1,
    'f': 2,
    'female': 2,
 }
 # Integer code emitted for each recognised phenotype label (same lookup
 # rules as above).
 phenotype_replace = {
    'unaffected': 1,
    'affected': 2,
 }

 # Canned argument sets, used only when the script is started without any
 # command-line arguments.  Each entry is, in order:
 #   1. the template, where each $var must be a header in the csv
 #   2. the path to the csv ("~" is expanded when the file is opened)
 #   3. the sex column
 #   4. the phenotype column (optional; the Ostrander set has none)
 ARGV_PRESETS = {
    "cdh": (
        "$Family_ID\t$Individual_ID-$Sample_ID\t$Father_ID-$Father_ID\t$Mother_ID-$Mother_ID\t$Gender\t$Clinical_Status\t$Type\t$Race\t$Ethnicity\t$Tissue_Name\t$Tissue_Type",
        "~/Downloads/CDH.csv",
        "Gender",
        "Clinical_Status",
    ),
    "h1k_trios": (
        "$Family_ID\t$UU_Barcode\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Phenotype",
        # "~/Downloads/H1K_Samples.txt" used to be appended first and was then
        # overwritten with this path, so only this path ever took effect.
        "~/Downloads/trios.txt",
        "Sex",
        "Phenotype",
    ),
    "cardio": (
        "$Family_ID\t$Individual_ID-$Individual_ID\t$Father_ID-$Father_ID\t$Mother_ID-$Mother_ID\t$Sex\t$Clinical_Status\t$Phenotype\t$Race\t$Ethnicity",
        "~/Downloads/WashU_Bowles_Cardiac_Individuals.csv",
        "Sex",
        "Clinical_Status",
    ),
    "ihh": (
        "nant_welt_ihh\t$Sample_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Affection_Status_ORIG",
        "~/Downloads/15-04-15_Nant-Welt-IHH_Samples.csv",
        "Sex",
        "Affection_Status",
    ),
    "kardon": (
        "$Kindred_ID\t$Individual_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Tissue_Type",
        "~/Downloads/Kardon.csv",
        "Sex",
        "Affection_Status",
    ),
    "ostrander_eiee": (
        "$Kindred_ID\t$Sample_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Tissue_Type",
        "~/Downloads/Nant-Ostrander-EIEE-Samples.txt",
        "Sex",
    ),
    "local_wgs": (
        "$Pedigree\t$vcfID\t$dadID\t$momID\t$Sex\t$Affection\t$Source\t$mzTwin\t$Project",
        "~/Downloads/localWGS11JUL.ped.txt",
        "Sex",
        "Affection",
    ),
 }

 # The preset that was live in the former if/elif chain (its final "else").
 ACTIVE_PRESET = "local_wgs"


 def preset_args(argv, preset=ACTIVE_PRESET):
    """Return the canned arguments that should be appended to *argv*.

    argv   -- the current argument vector (argv[0] is the script name)
    preset -- key into ARGV_PRESETS; raises KeyError if unknown

    Returns a new list: the preset's arguments when *argv* carries no user
    arguments, otherwise an empty list.  Appending the defaults behind real
    arguments (as was done before) shifted the canned template/path into the
    sex/phenotype positions whenever fewer than four arguments were given.
    """
    if len(argv) > 1:
        return []
    return list(ARGV_PRESETS[preset])


 sys.argv.extend(preset_args(sys.argv))


 # argv[1] is the output template; argv[2] is the delimited input file.
 template = string.Template(sys.argv[1])
 # NOTE: "rU" (universal newlines) is the Python 2 spelling.  The handle is
 # left open on purpose: the row loop further down keeps reading from it.
 fh = open(os.path.expanduser(sys.argv[2]), "rU")
 # Files named *.csv are comma separated; anything else is read as tab separated.
 sep = "," if fh.name.endswith(".csv") else "\t"


 class dialect(csv.excel):
    """Excel-style csv dialect using this file's delimiter.

    A subclass is used so the shared ``csv.excel`` class is not modified;
    assigning to ``csv.excel.delimiter`` would change it process-wide.
    """
    delimiter = sep


 # Optional 3rd/4th arguments: names of the sex and phenotype columns.
 sex_col = sys.argv[3] if len(sys.argv) > 3 else None
 phenotype_col = sys.argv[4] if len(sys.argv) > 4 else None

 # Parse the header line with the same dialect as the data rows so quoted
 # header cells are handled like the values below them, then replace
 # white-space and '-' so every name can be used as a $placeholder.
 header = [name.strip().replace("-", "_").replace(" ", "_")
           for name in next(csv.reader([next(fh)], dialect))]

 # Map each column to the label it gets in the printed header line.
 d = dict(zip(header, header))
 for name in header:
    lowered = name.lower()
    # A column already called "sex"/"phenotype" that is not the selected one
    # would clash with the label assigned below, so it gets an "_extra" suffix.
    if lowered == "sex" and sex_col is not None and name != sex_col:
        d[name] = name + "_extra"
    if lowered == "phenotype" and phenotype_col is not None and name != phenotype_col:
        d[name] = name + "_extra"

 if phenotype_col is not None:
    d[phenotype_col] = "phenotype"
 if sex_col is not None:
    d[sex_col] = "sex"

 # Emit the lower-cased header line; a repeated "name-name" pair produced by
 # templates such as "$Father_ID-$Father_ID" is collapsed to a single "name".
 print("#" + re.sub("(.+)-\\1", "\\1", template.substitute(d).lower()))

 # Text type to test against: ``basestring`` on Python 2, ``str`` on Python 3.
 try:
    text_types = basestring
 except NameError:
    text_types = str

 # Parent-ID columns present in this file.  The match only depends on the
 # header, so it is resolved once instead of once per row.
 # NOTE(review): names such as "dadID"/"momID" are not in this list, so such
 # columns are not recoded to -9 -- confirm whether that is intended.
 parent_cols = []
 for pid in ('paternal_id', 'maternal_id', 'mother_id', 'father_id',
             'mom_id', 'dad_id', 'mother', 'father'):
    matches = [h for h in header if h.lower() == pid]
    if matches:
        parent_cols.append(matches[0])

 for toks in csv.reader(fh, dialect):
    # skip rows that are empty or contain only white-space cells
    if not any(t.strip() for t in toks):
        continue

    # Pad rows that are shorter than the header (e.g. trailing empty cells
    # dropped on export) so every $placeholder has a value; the padding is
    # turned into "." / "-9" by the rules below instead of raising KeyError.
    if len(toks) < len(header):
        toks = toks + [""] * (len(header) - len(toks))

    d = dict(zip(header, toks))
    # recode sex / phenotype labels; anything unrecognised becomes "-9"
    if sex_col is not None:
        d[sex_col] = sex_replace.get(d[sex_col].lower().strip(), "-9")
    if phenotype_col is not None:
        d[phenotype_col] = phenotype_replace.get(d[phenotype_col].lower().strip(), "-9")

    # missing-parent markers become "-9"
    for col in parent_cols:
        if d[col].strip().lower() in ("", "-", "na", "unknown", "0"):
            d[col] = '-9'
    # every remaining blank text cell becomes "." (recoded ints are skipped)
    for k in d:
        if isinstance(d[k], text_types) and d[k].strip() == "":
            d[k] = "."
    # if the sample name required in the ped is sample-sample, we don't want
    # to double-print the -9
    toks = template.substitute(d).replace("-9--9", "-9").split("\t")
    # "first-." means the second half of the ID was blank: reuse the first
    # half.  Slice off exactly the trailing "-." -- rstrip() strips character
    # sets and would also eat a '.' or '-' that belongs to the ID itself.
    if len(toks) > 1 and toks[1].endswith("-."):
        stem = toks[1][:-2]
        toks[1] = stem + "-" + stem
    print("\t".join(toks))
	import csv
	import re
	import sys
	import os.path
	import string

	# Integer code emitted for each recognised sex label; the row loop looks
	# labels up after lower-casing and stripping them.
	sex_replace = {
	    'm': 1,
	    'male': 1,
	    'f': 2,
	    'female': 2,
	}
	# Integer code emitted for each recognised phenotype label (same lookup
	# rules as above).
	phenotype_replace = {
	    'unaffected': 1,
	    'affected': 2,
	}

	# Canned argument sets, used only when the script is started without any
	# command-line arguments.  Each entry is, in order:
	#   1. the template, where each $var must be a header in the csv
	#   2. the path to the csv ("~" is expanded when the file is opened)
	#   3. the sex column
	#   4. the phenotype column (optional; the Ostrander set has none)
	ARGV_PRESETS = {
	    "cdh": (
	        "$Family_ID\t$Individual_ID-$Sample_ID\t$Father_ID-$Father_ID\t$Mother_ID-$Mother_ID\t$Gender\t$Clinical_Status\t$Type\t$Race\t$Ethnicity\t$Tissue_Name\t$Tissue_Type",
	        "~/Downloads/CDH.csv",
	        "Gender",
	        "Clinical_Status",
	    ),
	    "h1k_trios": (
	        "$Family_ID\t$UU_Barcode\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Phenotype",
	        # "~/Downloads/H1K_Samples.txt" used to be appended first and was then
	        # overwritten with this path, so only this path ever took effect.
	        "~/Downloads/trios.txt",
	        "Sex",
	        "Phenotype",
	    ),
	    "cardio": (
	        "$Family_ID\t$Individual_ID-$Individual_ID\t$Father_ID-$Father_ID\t$Mother_ID-$Mother_ID\t$Sex\t$Clinical_Status\t$Phenotype\t$Race\t$Ethnicity",
	        "~/Downloads/WashU_Bowles_Cardiac_Individuals.csv",
	        "Sex",
	        "Clinical_Status",
	    ),
	    "ihh": (
	        "nant_welt_ihh\t$Sample_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Affection_Status_ORIG",
	        "~/Downloads/15-04-15_Nant-Welt-IHH_Samples.csv",
	        "Sex",
	        "Affection_Status",
	    ),
	    "kardon": (
	        "$Kindred_ID\t$Individual_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Tissue_Type",
	        "~/Downloads/Kardon.csv",
	        "Sex",
	        "Affection_Status",
	    ),
	    "ostrander_eiee": (
	        "$Kindred_ID\t$Sample_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Tissue_Type",
	        "~/Downloads/Nant-Ostrander-EIEE-Samples.txt",
	        "Sex",
	    ),
	    "local_wgs": (
	        "$Pedigree\t$vcfID\t$dadID\t$momID\t$Sex\t$Affection\t$Source\t$mzTwin\t$Project",
	        "~/Downloads/localWGS11JUL.ped.txt",
	        "Sex",
	        "Affection",
	    ),
	}

	# The preset that was live in the former if/elif chain (its final "else").
	ACTIVE_PRESET = "local_wgs"


	def preset_args(argv, preset=ACTIVE_PRESET):
	    """Return the canned arguments that should be appended to *argv*.

	    argv   -- the current argument vector (argv[0] is the script name)
	    preset -- key into ARGV_PRESETS; raises KeyError if unknown

	    Returns a new list: the preset's arguments when *argv* carries no user
	    arguments, otherwise an empty list.  Appending the defaults behind real
	    arguments (as was done before) shifted the canned template/path into the
	    sex/phenotype positions whenever fewer than four arguments were given.
	    """
	    if len(argv) > 1:
	        return []
	    return list(ARGV_PRESETS[preset])


	sys.argv.extend(preset_args(sys.argv))


	# argv[1] is the output template; argv[2] is the delimited input file.
	template = string.Template(sys.argv[1])
	# NOTE: "rU" (universal newlines) is the Python 2 spelling.  The handle is
	# left open on purpose: the row loop further down keeps reading from it.
	fh = open(os.path.expanduser(sys.argv[2]), "rU")
	# Files named *.csv are comma separated; anything else is read as tab separated.
	sep = "," if fh.name.endswith(".csv") else "\t"


	class dialect(csv.excel):
	    """Excel-style csv dialect using this file's delimiter.

	    A subclass is used so the shared ``csv.excel`` class is not modified;
	    assigning to ``csv.excel.delimiter`` would change it process-wide.
	    """
	    delimiter = sep


	# Optional 3rd/4th arguments: names of the sex and phenotype columns.
	sex_col = sys.argv[3] if len(sys.argv) > 3 else None
	phenotype_col = sys.argv[4] if len(sys.argv) > 4 else None

	# Parse the header line with the same dialect as the data rows so quoted
	# header cells are handled like the values below them, then replace
	# white-space and '-' so every name can be used as a $placeholder.
	header = [name.strip().replace("-", "_").replace(" ", "_")
	          for name in next(csv.reader([next(fh)], dialect))]

	# Map each column to the label it gets in the printed header line.
	d = dict(zip(header, header))
	for name in header:
	    lowered = name.lower()
	    # A column already called "sex"/"phenotype" that is not the selected one
	    # would clash with the label assigned below, so it gets an "_extra" suffix.
	    if lowered == "sex" and sex_col is not None and name != sex_col:
	        d[name] = name + "_extra"
	    if lowered == "phenotype" and phenotype_col is not None and name != phenotype_col:
	        d[name] = name + "_extra"

	if phenotype_col is not None:
	    d[phenotype_col] = "phenotype"
	if sex_col is not None:
	    d[sex_col] = "sex"

	# Emit the lower-cased header line; a repeated "name-name" pair produced by
	# templates such as "$Father_ID-$Father_ID" is collapsed to a single "name".
	print("#" + re.sub("(.+)-\\1", "\\1", template.substitute(d).lower()))

	# Text type to test against: ``basestring`` on Python 2, ``str`` on Python 3.
	try:
	    text_types = basestring
	except NameError:
	    text_types = str

	# Parent-ID columns present in this file.  The match only depends on the
	# header, so it is resolved once instead of once per row.
	# NOTE(review): names such as "dadID"/"momID" are not in this list, so such
	# columns are not recoded to -9 -- confirm whether that is intended.
	parent_cols = []
	for pid in ('paternal_id', 'maternal_id', 'mother_id', 'father_id',
	            'mom_id', 'dad_id', 'mother', 'father'):
	    matches = [h for h in header if h.lower() == pid]
	    if matches:
	        parent_cols.append(matches[0])

	for toks in csv.reader(fh, dialect):
	    # skip rows that are empty or contain only white-space cells
	    if not any(t.strip() for t in toks):
	        continue

	    # Pad rows that are shorter than the header (e.g. trailing empty cells
	    # dropped on export) so every $placeholder has a value; the padding is
	    # turned into "." / "-9" by the rules below instead of raising KeyError.
	    if len(toks) < len(header):
	        toks = toks + [""] * (len(header) - len(toks))

	    d = dict(zip(header, toks))
	    # recode sex / phenotype labels; anything unrecognised becomes "-9"
	    if sex_col is not None:
	        d[sex_col] = sex_replace.get(d[sex_col].lower().strip(), "-9")
	    if phenotype_col is not None:
	        d[phenotype_col] = phenotype_replace.get(d[phenotype_col].lower().strip(), "-9")

	    # missing-parent markers become "-9"
	    for col in parent_cols:
	        if d[col].strip().lower() in ("", "-", "na", "unknown", "0"):
	            d[col] = '-9'
	    # every remaining blank text cell becomes "." (recoded ints are skipped)
	    for k in d:
	        if isinstance(d[k], text_types) and d[k].strip() == "":
	            d[k] = "."
	    # if the sample name required in the ped is sample-sample, we don't want
	    # to double-print the -9
	    toks = template.substitute(d).replace("-9--9", "-9").split("\t")
	    # "first-." means the second half of the ID was blank: reuse the first
	    # half.  Slice off exactly the trailing "-." -- rstrip() strips character
	    # sets and would also eat a '.' or '-' that belongs to the ID itself.
	    if len(toks) > 1 and toks[1].endswith("-."):
	        stem = toks[1][:-2]
	        toks[1] = stem + "-" + stem
	    print("\t".join(toks))