Skip to content

Instantly share code, notes, and snippets.

@brentp
Last active July 22, 2016 19:27
Show Gist options
  • Save brentp/38bbef89f4398bd1b884 to your computer and use it in GitHub Desktop.
Save brentp/38bbef89f4398bd1b884 to your computer and use it in GitHub Desktop.
convert a crap CSV to a ped file with some crappy code
import csv
import re
import sys
import os.path
import string
# Spellings accepted in the sex column (compared lower-cased) and the numeric
# code each one is replaced with in the output.
sex_replace = dict.fromkeys(("m", "male"), 1)
sex_replace.update(dict.fromkeys(("f", "female"), 2))

# Values accepted in the phenotype column (compared lower-cased) and the
# numeric code each one is replaced with in the output.
phenotype_replace = dict(unaffected=1, affected=2)
# Hard-coded argument presets kept from earlier conversions. Every value lists
# the positional command-line arguments in order:
#   1. the template, where each $var must be a header in the csv
#   2. the path to the csv (or tab-delimited) file
#   3. the sex column
#   4. the phenotype column (may be left out)
ARG_PRESETS = {
    "cdh": (
        "$Family_ID\t$Individual_ID-$Sample_ID\t$Father_ID-$Father_ID\t$Mother_ID-$Mother_ID\t$Gender\t$Clinical_Status\t$Type\t$Race\t$Ethnicity\t$Tissue_Name\t$Tissue_Type",
        "~/Downloads/CDH.csv",
        "Gender",
        "Clinical_Status",
    ),
    "trios": (
        "$Family_ID\t$UU_Barcode\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Phenotype",
        # this preset first pointed at "~/Downloads/H1K_Samples.txt" and was
        # then overridden with the path below
        "~/Downloads/trios.txt",
        "Sex",
        "Phenotype",
    ),
    "cardio": (
        "$Family_ID\t$Individual_ID-$Individual_ID\t$Father_ID-$Father_ID\t$Mother_ID-$Mother_ID\t$Sex\t$Clinical_Status\t$Phenotype\t$Race\t$Ethnicity",
        "~/Downloads/WashU_Bowles_Cardiac_Individuals.csv",
        "Sex",
        "Clinical_Status",
    ),
    "ihh": (
        "nant_welt_ihh\t$Sample_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Affection_Status_ORIG",
        "~/Downloads/15-04-15_Nant-Welt-IHH_Samples.csv",
        "Sex",
        "Affection_Status",
    ),
    "kardon": (
        "$Kindred_ID\t$Individual_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Tissue_Type",
        "~/Downloads/Kardon.csv",
        "Sex",
        "Affection_Status",
    ),
    # Ostrander EIEE: no phenotype column was configured for this one
    "ostrander_eiee": (
        "$Kindred_ID\t$Sample_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Tissue_Type",
        "~/Downloads/Nant-Ostrander-EIEE-Samples.txt",
        "Sex",
    ),
    "local_wgs": (
        "$Pedigree\t$vcfID\t$dadID\t$momID\t$Sex\t$Affection\t$Source\t$mzTwin\t$Project",
        "~/Downloads/localWGS11JUL.ped.txt",
        "Sex",
        "Affection",
    ),
}

# Preset used when the script is started without any arguments.
ACTIVE_PRESET = "local_wgs"


def preset_args(argv, preset=None):
    """Return the preset arguments that should be appended to *argv*.

    Presets are only a fallback: when *argv* already carries arguments after
    the program name, an empty list is returned so that preset values can never
    slide into positional slots the caller left out (for example the preset
    template being picked up as the sex column).

    Args:
        argv: the argument list, program name first (normally ``sys.argv``).
        preset: key into ``ARG_PRESETS``; defaults to ``ACTIVE_PRESET``.

    Returns:
        A new list of argument strings, possibly empty.

    Raises:
        KeyError: if *preset* is not a key of ``ARG_PRESETS``.
    """
    if len(argv) > 1:
        return []
    if preset is None:
        preset = ACTIVE_PRESET
    return list(ARG_PRESETS[preset])


sys.argv.extend(preset_args(sys.argv))
# Lower-cased header names that are treated as parent-ID columns.
# NOTE(review): names are compared exactly after lower-casing, so headers such
# as "dadID" / "momID" (no underscore) are not covered by this list.
PARENT_ID_COLUMNS = ('paternal_id', 'maternal_id', 'mother_id', 'father_id',
                     'mom_id', 'dad_id', 'mother', 'father')

# Lower-cased parent-ID values that are rewritten to "-9".
MISSING_PARENT_VALUES = ("", "-", "na", "unknown", "0")


def clean_header(fields):
    """Return the column names stripped, with '-' and ' ' replaced by '_'."""
    return [f.strip().replace("-", "_").replace(" ", "_") for f in fields]


def read_table(lines, sep):
    """Parse *lines* as delimited text and return ``(header, rows)``.

    The header goes through the same csv parser as the data rows, so quoted
    header fields are handled consistently. The delimiter is passed per reader
    instead of being assigned onto the shared ``csv.excel`` class.

    Args:
        lines: an iterable of text lines (an open file works).
        sep: the one-character field delimiter.

    Returns:
        ``header`` is the cleaned list of column names (empty when there is no
        first line); ``rows`` lazily yields each remaining row as a list of
        strings, skipping rows whose fields are all blank.
    """
    reader = csv.reader(lines, delimiter=sep)
    header = clean_header(next(reader, []))
    rows = (row for row in reader if any(field.strip() for field in row))
    return header, rows


def header_labels(header, sex_col=None, phenotype_col=None):
    """Map every column name to the label used for it in the output header.

    The chosen sex / phenotype columns are labelled "sex" / "phenotype". Any
    other column that is literally named sex or phenotype (case-insensitive)
    gets an "_extra" suffix so the two cannot be confused.
    """
    labels = dict(zip(header, header))
    for name in header:
        lowered = name.lower()
        if lowered == "sex" and sex_col is not None and name != sex_col:
            labels[name] = name + "_extra"
        if (lowered == "phenotype" and phenotype_col is not None
                and name != phenotype_col):
            labels[name] = name + "_extra"
    if phenotype_col is not None:
        labels[phenotype_col] = "phenotype"
    if sex_col is not None:
        labels[sex_col] = "sex"
    return labels


def format_header(template, header, sex_col=None, phenotype_col=None):
    """Return the '#'-prefixed, lower-cased header line for the output.

    Raises:
        KeyError: if the template names a $var that is not in *header*.
    """
    line = template.substitute(header_labels(header, sex_col, phenotype_col))
    # a template piece such as "$Father_ID-$Father_ID" would give "x-x";
    # collapse it to a single "x" in the header
    return "#" + re.sub(r"(.+)-\1", r"\1", line.lower())


def format_row(template, header, toks, sex_col, phenotype_col,
               sex_codes, phenotype_codes):
    """Return one tab-separated output line for the input row *toks*.

    Args:
        template: ``string.Template`` describing the output line.
        header: cleaned column names, parallel to *toks*.
        toks: the field values of one input row.
        sex_col: name of the sex column, or None to leave it untouched.
        phenotype_col: name of the phenotype column, or None.
        sex_codes: mapping of lower-cased sex values to output codes.
        phenotype_codes: mapping of lower-cased phenotype values to codes.

    Raises:
        KeyError: if the template (or *sex_col* / *phenotype_col*) names a
            column the row does not have.
    """
    row = dict(zip(header, toks))
    if sex_col is not None:
        row[sex_col] = sex_codes.get(row[sex_col].lower().strip(), "-9")
    if phenotype_col is not None:
        row[phenotype_col] = phenotype_codes.get(
            row[phenotype_col].lower().strip(), "-9")

    # Walk the header (not the dict) so the result does not depend on dict
    # ordering when two headers differ only by case.
    for column in header:
        if column not in row or column.lower() not in PARENT_ID_COLUMNS:
            continue
        if str(row[column]).strip().lower() in MISSING_PARENT_VALUES:
            row[column] = "-9"

    # any remaining blank value becomes "." (str() because the coded
    # sex / phenotype values may be ints)
    for column in row:
        if str(row[column]).strip() == "":
            row[column] = "."

    # if the sample name required in the ped is sample-sample, we don't want
    # to double-print the -9
    out = template.substitute(row).replace("-9--9", "-9").split("\t")
    # "id-." means the second half of an "a-b" name was blank: reuse the first
    if len(out) > 1 and out[1].endswith("-."):
        out[1] = out[1].rstrip(".") + out[1].rstrip("-.")
    return "\t".join(out)


def open_table(path):
    """Open *path* for the csv module with newline handling for this Python."""
    if sys.version_info[0] < 3:
        return open(path, "rU")
    # Python 3: the csv module wants untranslated newlines
    return open(path, newline="")


def main(argv):
    """Convert the delimited file named in *argv* and write it to stdout.

    Args:
        argv: ``[prog, template, path, sex_col (optional),
            phenotype_col (optional)]``. A path ending in ".csv" is read as
            comma-separated, anything else as tab-separated.
    """
    if len(argv) < 3:
        sys.exit("usage: %s TEMPLATE INPUT [SEX_COLUMN [PHENOTYPE_COLUMN]]"
                 % argv[0])
    template = string.Template(argv[1])
    path = os.path.expanduser(argv[2])
    sep = "," if path.endswith(".csv") else "\t"
    sex_col = argv[3] if len(argv) > 3 else None
    phenotype_col = argv[4] if len(argv) > 4 else None

    # the with-block guarantees the input handle is closed
    with open_table(path) as fh:
        header, rows = read_table(fh, sep)
        if not header:
            sys.exit("error: no header line found in %s" % path)
        sys.stdout.write(
            format_header(template, header, sex_col, phenotype_col) + "\n")
        for toks in rows:
            line = format_row(template, header, toks, sex_col, phenotype_col,
                              sex_replace, phenotype_replace)
            sys.stdout.write(line + "\n")


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment