Skip to content

Instantly share code, notes, and snippets.

@IsmailM
Last active July 26, 2021 14:49
Show Gist options
  • Save IsmailM/b4fc0b5593bf81f64f4d8fb0440217d1 to your computer and use it in GitHub Desktop.
Save IsmailM/b4fc0b5593bf81f64f4d8fb0440217d1 to your computer and use it in GitHub Desktop.
Produce JSON files from GenomeChronicler output
import os
import csv
import json
import re
import sqlite3
import sys
from shutil import copyfile
import html
def execute_db_query(query, data):
    """Run a single-value lookup against the module-level SQLite connection.

    Args:
        query: SQL statement using ``?`` placeholders.
        data: Sequence of values bound to the placeholders in ``query``.

    Returns:
        The first column of the matching row, stripped of surrounding
        whitespace, passed through ``str.capitalize`` and HTML-unescaped.
        An empty string is returned when the query does not match exactly
        one row, or when the matched value is NULL.
    """
    with DB_CONNECTION:
        cur = DB_CONNECTION.cursor()
        try:
            cur.execute(query, data)
            rows = cur.fetchall()
        finally:
            # Release the cursor even when the statement fails.
            cur.close()
    # Zero matches and ambiguous (multiple) matches both yield "no text".
    if len(rows) != 1:
        return ''
    value = rows[0][0]
    # A NULL column comes back as None; calling .strip() on it would raise.
    if value is None:
        return ''
    return html.unescape(value.strip().capitalize())
def get_trait_full_description(id, allele_1, allele_2):
    """Look up the ``summary`` text for one identifier/genotype combination.

    Args:
        id: Value matched against the ``id`` column of the ``data`` table.
            (The name shadows the builtin; it is kept so existing callers
            keep working.)
        allele_1: Value matched against the ``allele1`` column.
        allele_2: Value matched against the ``allele2`` column.

    Returns:
        Whatever ``execute_db_query`` returns for the lookup.
    """
    query = "SELECT summary FROM data where id = ? AND allele1 = ? and allele2 = ? "
    # The result must be returned; without it callers always receive None.
    return execute_db_query(query, (id, allele_1, allele_2))
def get_genoset_full_description(id):
    """Look up the ``summary`` text for one row of the ``genoset`` table.

    Args:
        id: Value matched against the ``id`` column of the ``genoset`` table.
            (The name shadows the builtin; it is kept so existing callers
            keep working.)

    Returns:
        Whatever ``execute_db_query`` returns for the lookup.
    """
    query = "SELECT summary FROM genoset where id = ?"
    # The result must be returned; without it callers always receive None.
    return execute_db_query(query, (id,))
def process_link(text):
    """Pull the two fields out of the first ``{...}{...}`` pair in ``text``.

    Args:
        text: String that may contain two adjacent brace-delimited groups.

    Returns:
        A two-item list ``[first, second]`` holding the contents of the two
        groups (callers in this file treat them as link and name), or
        ``['', '']`` when ``text`` contains no such pair. Matching is greedy,
        so each group extends as far as the pattern allows.
    """
    found = re.search(r'{(.+)}{(.+)}', text)
    return list(found.groups()) if found else ['', '']
def convert_genoset_tables_to_json(input_file, output_file):
    """Convert a genoset report CSV into a JSON file.

    The first CSV record is treated as a header and skipped. Each remaining
    non-empty row is read as: column 0 = magnitude, column 1 = identifier
    link, column 2 = description link (both link cells are parsed with
    ``process_link``).

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the JSON file to write (overwritten).

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
    """
    json_data = []
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for correct parsing of quoted fields.
    with open(input_file, newline='') as csvfile:
        data = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising.
        next(data, None)
        for row in data:
            # csv.reader yields [] for blank lines; there is nothing to convert.
            if not row:
                continue
            identifier_obj = process_link(row[1])
            description_obj = process_link(row[2])
            row_data = {
                "magnitude": row[0],
                # NOTE(review): the key spelling "identifer" is kept as-is;
                # consumers of the JSON presumably rely on it.
                "identifer": {
                    "link": identifier_obj[0],
                    "name": identifier_obj[1]
                },
                "description": {
                    "short": description_obj[1],
                    "full": get_genoset_full_description(identifier_obj[1])
                },
                "db_links": [
                    {"name": "SNPedia", "link": identifier_obj[0]},
                ]
            }
            json_data.append(row_data)
    with open(output_file, 'w') as f:
        json.dump(json_data, f, sort_keys=True, indent=4)
def convert_trait_tables_to_json(input_file, output_file):
    """Convert a trait report CSV into a JSON file.

    The first CSV record is treated as a header and skipped. Each remaining
    non-empty row is read as: column 0 = magnitude, column 1 = identifier
    link, column 2 = genotype, column 3 = description link, column 4 =
    GnomAD link, column 5 = GetEvidence link, column 6 = ClinVar link. Link
    cells are parsed with ``process_link``.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the JSON file to write (overwritten).

    Raises:
        IndexError: If a non-empty row has fewer than seven columns, or if
            its genotype cell contains no ``X;Y`` allele pair.
    """
    json_data = []
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for correct parsing of quoted fields.
    with open(input_file, newline='') as csvfile:
        data = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising.
        next(data, None)
        for row in data:
            # csv.reader yields [] for blank lines; there is nothing to convert.
            if not row:
                continue
            identifier_obj = process_link(row[1])
            description_obj = process_link(row[3])
            gnomad_obj = process_link(row[4])
            get_evidence_obj = process_link(row[5])
            clinvar_obj = process_link(row[6])
            # First pair of single characters (word character or '-')
            # separated by ';' in the genotype cell.
            alleles = list(re.findall(r'([-\w]);([-\w])', row[2])[0])
            row_data = {
                "magnitude": row[0],
                # NOTE(review): the key spelling "identifer" is kept as-is;
                # consumers of the JSON presumably rely on it.
                "identifer": {
                    "link": identifier_obj[0],
                    "name": identifier_obj[1]
                },
                "genotype": alleles,
                "description": {
                    "short": description_obj[1],
                    "full": get_trait_full_description(
                        identifier_obj[1], alleles[0], alleles[1])
                },
                "db_links": [
                    {"name": "SNPedia", "link": identifier_obj[0]},
                    {"name": "GnomAD", "link": gnomad_obj[0]},
                    {"name": "GetEvidence", "link": get_evidence_obj[0]},
                    {"name": "ClinVar", "link": clinvar_obj[0]}
                ]
            }
            json_data.append(row_data)
    with open(output_file, 'w') as f:
        json.dump(json_data, f, sort_keys=True, indent=4)
# Script body: reads the report tables found in the directory given as the
# first command-line argument and writes JSON files (plus a copy of the
# ancestry plot) into a "react_data" sub-folder of that directory.
if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: {} <data_dir>".format(sys.argv[0]))

db = r"/data/pgp_data/snpedia.db"
DB_CONNECTION = sqlite3.connect(db)
# Decode TEXT values coming out of the database as latin1.
DB_CONNECTION.text_factory = lambda x: str(x, 'latin1')

data_dir = sys.argv[1]
output_folder = os.path.join(data_dir, "react_data")
# Allow re-running on the same data directory: only create the folder when
# it is not already there (a plain os.mkdir raises FileExistsError).
if not os.path.isdir(output_folder):
    os.mkdir(output_folder)

try:
    copyfile(os.path.join(data_dir, 'AncestryPlot.pdf'),
             os.path.join(output_folder, 'AncestryPlot.pdf'))

    convert_trait_tables_to_json(
        os.path.join(data_dir, 'latest.good.reportTable.csv'),
        os.path.join(output_folder, 'beneficial_data.json'))

    convert_trait_tables_to_json(
        os.path.join(data_dir, 'latest.bad.reportTable.csv'),
        os.path.join(output_folder, 'harmful_data.json'))

    convert_genoset_tables_to_json(
        os.path.join(data_dir, 'latest.genoset.reportTable.csv'),
        os.path.join(output_folder, 'genosets_data.json'))
finally:
    # Release the database handle whether or not the conversion succeeded.
    DB_CONNECTION.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment