Last active
July 22, 2016 19:27
-
-
Save brentp/38bbef89f4398bd1b884 to your computer and use it in GitHub Desktop.
convert a crap CSV to a ped file with some crappy code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import os.path
import re
import string
import sys
# PED sex codes keyed by the lower-cased, stripped cell text:
# 1 = male, 2 = female.
sex_replace = {
    'm': 1,
    'male': 1,
    'female': 2,
    'f': 2,
}
# PED phenotype codes keyed the same way: 2 = affected, 1 = unaffected.
phenotype_replace = {
    'affected': 2,
    'unaffected': 1,
}
# Argument lists used for earlier data sets, kept as named presets.  Each
# entry is what this script expects on the command line:
#   1. the output template, where each $var must be a header in the csv
#   2. the path to the csv
#   3. the sex column (optional)
#   4. the phenotype column (optional)
ARG_PRESETS = {
    "cdh": (
        "$Family_ID\t$Individual_ID-$Sample_ID\t$Father_ID-$Father_ID\t$Mother_ID-$Mother_ID\t$Gender\t$Clinical_Status\t$Type\t$Race\t$Ethnicity\t$Tissue_Name\t$Tissue_Type",
        "~/Downloads/CDH.csv",
        "Gender",
        "Clinical_Status",
    ),
    # This template was first pointed at "~/Downloads/H1K_Samples.txt" and
    # then re-pointed at the trios file.
    "trios": (
        "$Family_ID\t$UU_Barcode\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Phenotype",
        "~/Downloads/trios.txt",
        "Sex",
        "Phenotype",
    ),
    "cardio": (
        "$Family_ID\t$Individual_ID-$Individual_ID\t$Father_ID-$Father_ID\t$Mother_ID-$Mother_ID\t$Sex\t$Clinical_Status\t$Phenotype\t$Race\t$Ethnicity",
        "~/Downloads/WashU_Bowles_Cardiac_Individuals.csv",
        "Sex",
        "Clinical_Status",
    ),
    "ihh": (
        "nant_welt_ihh\t$Sample_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Affection_Status_ORIG",
        "~/Downloads/15-04-15_Nant-Welt-IHH_Samples.csv",
        "Sex",
        "Affection_Status",
    ),
    "kardon": (
        "$Kindred_ID\t$Individual_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Tissue_Type",
        "~/Downloads/Kardon.csv",
        "Sex",
        "Affection_Status",
    ),
    # No phenotype column was given for this data set.
    "ostrander_eiee": (
        "$Kindred_ID\t$Sample_ID\t$Paternal_ID\t$Maternal_ID\t$Sex\t$Affection_Status\t$Tissue_Type",
        "~/Downloads/Nant-Ostrander-EIEE-Samples.txt",
        "Sex",
    ),
    "local_wgs": (
        "$Pedigree\t$vcfID\t$dadID\t$momID\t$Sex\t$Affection\t$Source\t$mzTwin\t$Project",
        "~/Downloads/localWGS11JUL.ped.txt",
        "Sex",
        "Affection",
    ),
}
# The preset that is applied when the script is run without arguments.
DEFAULT_PRESET = "local_wgs"


def fill_default_args(argv, preset=DEFAULT_PRESET):
    """Append the arguments of *preset* to *argv* if the user gave none.

    *argv* is modified in place.  Nothing is appended when *argv* already
    holds at least one argument after the program name: mixing user
    arguments with preset ones would put a preset template or path into
    the sex/phenotype column slots.

    Raises KeyError if *preset* is not a key of ARG_PRESETS.
    """
    if len(argv) <= 1:
        argv.extend(ARG_PRESETS[preset])


fill_default_args(sys.argv)
# Lower-cased column names that hold a parent's sample id.
PARENT_ID_COLUMNS = ('paternal_id', 'maternal_id', 'mother_id', 'father_id',
                     'mom_id', 'dad_id', 'mother', 'father')
# Parent-id cell values (stripped, lower-cased) that mean "no parent".
MISSING_PARENT_VALUES = ("", "-", "na", "unknown", "0")
# Matches "x-x" so it can be collapsed to "x" in the header line.
DUPLICATED_NAME = re.compile(r"(.+)-\1")


def clean_header(cells):
    """Return header cells rewritten so they can be used as $template names.

    Surrounding white-space is removed; '-' and ' ' are replaced by '_'.
    """
    return [c.strip().replace("-", "_").replace(" ", "_") for c in cells]


def ped_header_line(template, header, sex_col, phenotype_col):
    """Return the '#'-prefixed, lower-cased header line of the ped output.

    The sex and phenotype columns are renamed to "sex" and "phenotype".
    Any other column already called sex/phenotype (in any case) gets an
    "_extra" suffix so that it does not clash with the renamed one.
    """
    names = dict(zip(header, header))
    for h in header:
        low = h.lower()
        if low == "sex" and sex_col is not None and h != sex_col:
            names[h] = h + "_extra"
        if low == "phenotype" and phenotype_col is not None and h != phenotype_col:
            names[h] = h + "_extra"
    if phenotype_col is not None:
        names[phenotype_col] = "phenotype"
    if sex_col is not None:
        names[sex_col] = "sex"
    return "#" + DUPLICATED_NAME.sub(r"\1", template.substitute(names).lower())


def ped_fields(template, header, toks, sex_col, phenotype_col,
               sex_map, phenotype_map):
    """Return the list of output fields for one input row.

    template      -- string.Template describing one tab-separated output line
    header        -- cleaned column names, parallel to *toks*
    toks          -- the cells of the row
    sex_col       -- name of the sex column, or None
    phenotype_col -- name of the phenotype column, or None
    sex_map, phenotype_map -- lower-cased cell text -> ped code; text that
                     is not in the map becomes "-9"

    Raises KeyError when the template (or sex/phenotype column) names a
    column the row does not have.
    """
    d = dict(zip(header, toks))
    if sex_col is not None:
        d[sex_col] = sex_map.get(d[sex_col].lower().strip(), "-9")
    if phenotype_col is not None:
        d[phenotype_col] = phenotype_map.get(d[phenotype_col].lower().strip(), "-9")

    # Only values of existing keys are replaced below, so the dict can be
    # modified while it is being iterated.
    for col, value in d.items():
        if (col.lower() in PARENT_ID_COLUMNS and isinstance(value, str)
                and value.strip().lower() in MISSING_PARENT_VALUES):
            d[col] = "-9"
    # Mapped sex/phenotype codes are ints; only text cells can be blank.
    for col, value in d.items():
        if isinstance(value, str) and value.strip() == "":
            d[col] = "."

    # if the sample name required in the ped is sample-sample, we don't want
    # to double-print the -9
    fields = template.substitute(d).replace("-9--9", "-9").split("\t")
    if len(fields) > 1 and fields[1].endswith("-."):
        # "id-." means the second half of the sample name was blank: reuse
        # the first half.  Slice instead of rstrip() so that ids which end
        # in '.' or '-' themselves are kept intact.
        base = fields[1][:-2]
        fields[1] = base + "-" + base
    return fields


def convert(lines, template, sep, sex_col, phenotype_col,
            sex_map, phenotype_map, out):
    """Read delimited text from *lines* and write ped lines to *out*.

    lines -- iterable of text lines (e.g. an open file); the first line is
             the header
    sep   -- the one-character field delimiter
    out   -- object with a write() method

    The header and the rows go through the same csv parser, so quoted
    header cells are handled like quoted data cells.  Rows whose cells are
    all blank are skipped.  Empty input produces no output.
    """
    # Pass the delimiter as an override instead of assigning it on
    # csv.excel, which would change that class for every other user.
    reader = csv.reader(lines, delimiter=sep)
    raw_header = next(reader, None)
    if raw_header is None:
        return
    # replace white-space and '-'
    header = clean_header(raw_header)
    out.write(ped_header_line(template, header, sex_col, phenotype_col) + "\n")
    for toks in reader:
        if not any(t.strip() for t in toks):
            continue
        fields = ped_fields(template, header, toks, sex_col, phenotype_col,
                            sex_map, phenotype_map)
        out.write("\t".join(fields) + "\n")


def main(argv):
    """Convert the table named in *argv* and print the result.

    argv[1] is the output template, argv[2] the input path (".csv" means
    comma-separated, anything else tab-separated), argv[3] and argv[4] the
    optional sex and phenotype column names.
    """
    template = string.Template(argv[1])
    path = os.path.expanduser(argv[2])
    sep = "," if path.endswith(".csv") else "\t"
    sex_col = argv[3] if len(argv) > 3 else None
    phenotype_col = argv[4] if len(argv) > 4 else None
    # Python 2 needs "U" for universal newlines; Python 3 text mode has them
    # by default and rejects "U" from 3.11 on.
    mode = "rU" if sys.version_info[0] < 3 else "r"
    with open(path, mode) as fh:
        convert(fh, template, sep, sex_col, phenotype_col,
                sex_replace, phenotype_replace, sys.stdout)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment