Last active
July 26, 2021 14:49
-
-
Save IsmailM/b4fc0b5593bf81f64f4d8fb0440217d1 to your computer and use it in GitHub Desktop.
Produce JSON files from GenomeChronicler output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import csv | |
import json | |
import re | |
import sqlite3 | |
import sys | |
from shutil import copyfile | |
import html | |
def execute_db_query(query, data):
    """Run a single-value lookup on the module-level ``DB_CONNECTION``.

    Args:
        query: SQL statement using ``?`` placeholders.
        data: Sequence of values bound to the placeholders.

    Returns:
        The first column of the result, stripped, passed through
        ``str.capitalize()`` and HTML-unescaped, when the query yields
        exactly one row. An empty string is returned when the query yields
        zero rows, more than one row, or a single row whose value is NULL.
    """
    with DB_CONNECTION:
        cur = DB_CONNECTION.cursor()
        try:
            cur.execute(query, data)
            rows = cur.fetchall()
        finally:
            # Release the cursor even when the query itself fails.
            cur.close()

    # Only an unambiguous (single-row) hit is reported.
    if len(rows) != 1:
        return ''

    value = rows[0][0]
    if value is None:
        # A NULL column has no text to clean up; treat it like "no result".
        return ''

    # NOTE: capitalize() upper-cases the first character and lower-cases
    # the remainder of the text.
    return html.unescape(value.strip().capitalize())
def get_trait_full_description(id, allele_1, allele_2):
    """Look up the full summary text for a genotype in the ``data`` table.

    Args:
        id: Value matched against the ``id`` column.
        allele_1: Value matched against the ``allele1`` column.
        allele_2: Value matched against the ``allele2`` column.

    Returns:
        Whatever ``execute_db_query`` returns for the lookup (the cleaned-up
        summary, or an empty string when there is no unique match).
    """
    query = "SELECT summary FROM data where id = ? AND allele1 = ? and allele2 = ? "
    # The result must be returned; otherwise callers always receive None.
    return execute_db_query(query, (id, allele_1, allele_2))
def get_genoset_full_description(id):
    """Look up the full summary text for an entry in the ``genoset`` table.

    Args:
        id: Value matched against the ``id`` column.

    Returns:
        Whatever ``execute_db_query`` returns for the lookup (the cleaned-up
        summary, or an empty string when there is no unique match).
    """
    query = "SELECT summary FROM genoset where id = ?"
    # The result must be returned; otherwise callers always receive None.
    return execute_db_query(query, (id,))
def process_link(text):
    """Extract the two brace-delimited parts of a ``{first}{second}`` pair.

    Args:
        text: String that may contain a ``{...}{...}`` pair.

    Returns:
        A two-element list ``[first, second]`` taken from the first match
        in ``text``, or ``['', '']`` when the text contains no such pair.
    """
    found = re.search(r'{(.+)}{(.+)}', text)
    return list(found.groups()) if found else ['', '']
def convert_genoset_tables_to_json(input_file, output_file):
    """Convert a genoset report table (CSV) into a JSON file.

    The first CSV record is treated as a header and skipped. For every
    remaining non-empty row, column 0 is used as the magnitude, column 1 is
    parsed with ``process_link`` as the identifier (link, name) and column 2
    is parsed with ``process_link`` as the description. The full description
    is fetched with ``get_genoset_full_description`` using the identifier
    name.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the JSON file to write (overwritten).

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
    """
    json_data = []
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends.
    with open(input_file, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header record; the default keeps an empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to convert.
                continue
            identifier_obj = process_link(row[1])
            description_obj = process_link(row[2])
            json_data.append({
                "magnitude": row[0],
                # The "identifer" spelling is part of the emitted JSON
                # schema and is kept as-is for existing consumers.
                "identifer": {
                    "link": identifier_obj[0],
                    "name": identifier_obj[1],
                },
                "description": {
                    "short": description_obj[1],
                    "full": get_genoset_full_description(identifier_obj[1]),
                },
                "db_links": [
                    {"name": "SNPedia", "link": identifier_obj[0]},
                ],
            })
    with open(output_file, 'w') as f:
        json.dump(json_data, f, sort_keys=True, indent=4)
def convert_trait_tables_to_json(input_file, output_file):
    """Convert a trait report table (CSV) into a JSON file.

    The first CSV record is treated as a header and skipped. For every
    remaining non-empty row:

    * column 0 is the magnitude,
    * column 1 is parsed with ``process_link`` as the identifier,
    * column 2 holds the genotype, from which the first ``X;Y`` pair of
      single characters (word characters or ``-``) is extracted,
    * column 3 is parsed with ``process_link`` as the description,
    * columns 4, 5 and 6 are parsed with ``process_link`` as the GnomAD,
      GetEvidence and ClinVar links.

    The full description is fetched with ``get_trait_full_description``
    using the identifier name and the two alleles.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the JSON file to write (overwritten).

    Raises:
        IndexError: If a non-empty row has fewer than seven columns, or its
            genotype column contains no ``X;Y`` pair.
    """
    json_data = []
    # Compiled once instead of being re-looked-up for every row.
    genotype_pattern = re.compile(r'([-\w]);([-\w])')
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends.
    with open(input_file, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header record; the default keeps an empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to convert.
                continue
            identifier_obj = process_link(row[1])
            description_obj = process_link(row[3])
            gnomad_obj = process_link(row[4])
            get_evidence_obj = process_link(row[5])
            clinvar_obj = process_link(row[6])
            # Only the first allele pair found in the genotype column is used.
            alleles = list(genotype_pattern.findall(row[2])[0])
            json_data.append({
                "magnitude": row[0],
                # The "identifer" spelling is part of the emitted JSON
                # schema and is kept as-is for existing consumers.
                "identifer": {
                    "link": identifier_obj[0],
                    "name": identifier_obj[1],
                },
                "genotype": alleles,
                "description": {
                    "short": description_obj[1],
                    "full": get_trait_full_description(
                        identifier_obj[1], alleles[0], alleles[1]),
                },
                "db_links": [
                    {"name": "SNPedia", "link": identifier_obj[0]},
                    {"name": "GnomAD", "link": gnomad_obj[0]},
                    {"name": "GetEvidence", "link": get_evidence_obj[0]},
                    {"name": "ClinVar", "link": clinvar_obj[0]},
                ],
            })
    with open(output_file, 'w') as f:
        json.dump(json_data, f, sort_keys=True, indent=4)
# Script entry: convert the report tables found in the directory given as the
# first command-line argument into JSON files under <data_dir>/react_data.

# Fail with a usage message instead of a bare IndexError when the data
# directory argument is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: {} <data_dir>".format(sys.argv[0]))

# SQLite database queried by execute_db_query() for full descriptions.
db = r"/data/pgp_data/snpedia.db"
DB_CONNECTION = sqlite3.connect(db)
# Decode TEXT values as latin1 rather than sqlite3's default UTF-8.
DB_CONNECTION.text_factory = lambda x: str(x, 'latin1')

try:
    data_dir = sys.argv[1]
    output_folder = os.path.join(data_dir, "react_data")
    # exist_ok lets the script be re-run on a directory it already processed.
    os.makedirs(output_folder, exist_ok=True)

    copyfile(os.path.join(data_dir, 'AncestryPlot.pdf'),
             os.path.join(output_folder, 'AncestryPlot.pdf'))

    convert_trait_tables_to_json(
        os.path.join(data_dir, 'latest.good.reportTable.csv'),
        os.path.join(output_folder, 'beneficial_data.json'))

    convert_trait_tables_to_json(
        os.path.join(data_dir, 'latest.bad.reportTable.csv'),
        os.path.join(output_folder, 'harmful_data.json'))

    convert_genoset_tables_to_json(
        os.path.join(data_dir, 'latest.genoset.reportTable.csv'),
        os.path.join(output_folder, 'genosets_data.json'))
finally:
    # Release the database handle whether or not the conversion succeeded.
    DB_CONNECTION.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment