Last active
August 15, 2022 07:53
-
-
Save callahantiff/597ed54a684da2e7ac832c5bb57c162a to your computer and use it in GitHub Desktop.
Composite Patient Similarity Algorithm for Semi-Supervised Rare Disease Phenotyping. Additional details can be found here: https://mor.nlm.nih.gov/pubs/alum/2017-callahan.pdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######################################################################################################### | |
# 2017 NLM Summer Medical Informatics Internship | |
# Purpose: queries a Google BigQuery Database and returns a vector of values for a set of patients | |
# version 1.1.0 | |
# date: 08.15.2017 | |
######################################################################################################### | |
# import and load needed scripts | |
import dawg | |
import math | |
import matplotlib as mpl | |
import matplotlib.pyplot as pylab | |
import numpy as np | |
import os | |
import pandas as pd | |
import pickle | |
from progressbar import ProgressBar, FormatLabel, Percentage, Bar | |
import pydendroheatmap as pdh | |
import scipy.cluster.hierarchy as sch | |
import scipy.spatial.distance as dist | |
def GBQData(query): | |
""" | |
Function takes a string containing a user query to be run against the de-identified OMOP database and returns the | |
query results as a list. Each row of the list corresponds to a row of the results. Function is designed to | |
re-establish client connection for each query run against the database. | |
:param query: a string containing a user query | |
:return: list of query results | |
""" | |
print("Started running query from " + str() + "against the OMOP de-id database: " + str(datetime.datetime.now())) | |
# Authenticate and set client | |
client = bigquery.Client('sandbox-tc') | |
# get project datasets | |
# datasets = [dataset.name for dataset in client.list_datasets()] | |
# run query | |
user_query = client.run_sync_query(query) | |
# Use standard SQL syntax. | |
user_query.use_legacy_sql = False | |
# Google API request | |
user_query.run() | |
# convert query results to list | |
results = [] | |
while len(results) == 0: | |
results = [x for x in user_query.fetch_data()] | |
if len(results) == 0: | |
print('Query returned no results, trying again') | |
else: | |
break | |
# save results to working directory | |
# pickle.dump(results, open('omop_concepts.txt', 'wb')) | |
print("Finished processing query: " + str(datetime.datetime.now())) | |
print('Query returned: %d' % len(results) + " results") | |
return results | |
def AncestorQuery(vocab, code): | |
''' | |
Function takes a string containing the vocabulary type to filter on, and a code to search for ancestors of. The | |
function updates generic keywords using the input strings and returns a string containing the updated query. | |
:param vocab: a string containing the vocabulary type to filter on | |
:param code: a code to search for ancestors of | |
:return: a string containing the updated query | |
''' | |
input_file = 'Queries/concept_ancestor_query' | |
# CHECK - file has data | |
if os.stat(input_file).st_size == 0: | |
raise ValueError('input file: {} is empty'.format(input_file)) | |
else: | |
query = open(input_file).read() | |
# update query text | |
if vocab == 'cond': | |
query_update = query.replace('VOCAB', "'" + 'SNOMED' + "'") | |
query_update = query_update.replace('CODE', code) | |
return query_update | |
elif vocab == 'meas': | |
query_update = query.replace('VOCAB', "'" + 'LOINC' + "'") | |
query_update = query_update.replace('CODE', code) | |
return query_update | |
else: | |
query_update = query.replace('VOCAB', "'" + 'VA Class' + "'") | |
query_update = query_update.replace('CODE', code) | |
return query_update | |
def AncestorFinder(query): | |
''' | |
Function takes a string storing a SQL query as input and runs this string against the Google BQ API. Withe the | |
returned results the function creates a nested dictionary where the outer key is the OMOP descendant code and the | |
inner keys are the vocabulary, source ancestor code and label and the source descendant code and label. | |
:param query: a string storing a SQL query | |
:return: a nested dictionary where the outer key is the OMOP descendant code and the inner keys are the vocabulary, | |
source ancestor code and label and the source descendant code and label. | |
''' | |
ancestor_dict = {} | |
# return data from Google API | |
# ancestors = Google_API.GBQData(query) | |
# initialize progress bar progress bar | |
widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')] | |
pbar = ProgressBar(widgets=widgets, maxval=len(query)) | |
# create ancestor dictionary - keyed by omop descendant concept | |
for row in pbar(query): | |
key = row[1] | |
if key in ancestor_dict.keys(): | |
# ancestor_dict[key]['vocab']= str(row[2]) | |
ancestor_dict[key]['descendant_code'].add(str(row[5])) | |
ancestor_dict[key]['descendant_label'].add(str(row[6])) | |
ancestor_dict[key]['ancestor_code'].add(str(row[3])) | |
ancestor_dict[key]['ancestor_label'].add(str(row[4])) | |
else: | |
ancestor_dict[key] = {} | |
# ancestor_dict[key]['vocab'] = str(row[2]) | |
ancestor_dict[key]['descendant_code'] = set([str(row[5])]) | |
ancestor_dict[key]['descendant_label'] = set([str(row[6])]) | |
ancestor_dict[key]['ancestor_code'] = set([str(row[3])]) | |
ancestor_dict[key]['ancestor_label'] = set([str(row[4])]) | |
pbar.finish() | |
return ancestor_dict | |
def AncestorUpdate(ancestor_dict, patient_concepts): | |
''' | |
Function takes a dictionary of concept ancestor information and a list of lists representing patient concepts. | |
With this input the function checks that all patient concepts are included in the ancestor dictionary. Concepts | |
in the patient data that are not included in the ancestor dictionary (most likely those concepts without any | |
ancestors are added to the ancestor dictionary using themselves as their ancestor (this works because of the | |
definition of the current semantic similarity measure and would need to be re-evaluated should the measure | |
change). The function returns an updated ancestor dictionary. | |
:param ancestor_dict: a dictionary of concept ancestor information; keys are OMOP concepts and values are sets | |
of ancestor codes and labels as well as the concept code's mapped code | |
:param patient_concepts: a list of lists representing patient concepts resulting from running from GBQ | |
:return: updated ancestor dictionary | |
''' | |
for code in patient_concepts: | |
if code[1] not in ancestor_dict.keys() and code[1] != 0: | |
ancestor_dict.update({code[1]: {'ancestor_code':set([str(code[2])]), | |
'ancestor_label':set([str(code[3])]) , | |
'descendant_code':set([str(code[2])]), | |
'descendant_label':set([str(code[3])])}}) | |
return ancestor_dict | |
def SemanticSimilarity(data): | |
''' | |
Function takes a lists of lists as input and calculates the pairwise semantic similarity of the ancestors for all | |
descendant concept identifiers. The similarity scores are normalized to be between 0 and 1 and only if a concept | |
compared directly to itself is given a score of 1. The function returns a dictionary where the keys are pairs of | |
concepts and the values are the semantic similarity score for the pair of concepts. | |
:param data: a lists of lists storing the output of a google big query | |
:return: a dictionary where the keys are pairs of concepts and the values are the semantic similarity score for the | |
pair of concepts. | |
''' | |
semantic_sim = {} | |
# initialize progress bar progress bar | |
widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')] | |
pbar = ProgressBar(widgets=widgets, maxval=len(data.keys())) | |
# pairwise compare all elements in the list to each other | |
for i in pbar(range(0, len(data.keys()))): | |
for j in range(i, len(data.keys())): | |
# get ancestor sets | |
id1 = data[data.keys()[i]]['descendant_code'] | |
id1_anc = data[data.keys()[i]]['ancestor_code'].union(id1) | |
id2 = data[data.keys()[j]]['descendant_code'] | |
id2_anc = data[data.keys()[j]]['ancestor_code'].union(id2) | |
# semantic similarity | |
if id1 != id2: | |
numerator = len((id1_anc.union(id2_anc) - id1_anc.intersection(id2_anc))) | |
denominator = len(id1_anc.union(id2_anc)) | |
sim = abs(-math.log(numerator/float(denominator), 2)) | |
# append to dictionary | |
semantic_sim[tuple([list(id1)[0], list(id2)[0]])] = sim | |
semantic_sim[tuple([list(id2)[0], list(id1)[0]])] = sim | |
else: | |
# append to dictionary | |
semantic_sim[tuple([list(id1)[0], list(id2)[0]])] = 0.0 | |
# normalize scores by the max semantic similarity score | |
max_score = max(semantic_sim.values()) + 0.05 | |
semantic_sim = {comp[0]: comp[1]/max_score if comp[0][0] != comp[0][1] else 1.0 for comp in semantic_sim.items()} | |
# CHECK - make sure that the range is 1.0 | |
if max(semantic_sim.values()) != 1.0: | |
raise ValueError('Problem with semantic similarity score normalization ') | |
else: | |
pbar.finish() | |
return semantic_sim | |
def PatientDict(input_files): | |
''' | |
Function takes a list of labeled input files and creates a dictionary where the keys are person_ids and the value is | |
a list (0-age; 1-gender; 2-race; 3-condition concepts set; 4-measurements set; 5-medications set). Concept codes of | |
0 or "No matching concept" represent concepts that were unable to be mapped by OMOP to a standardized terminology, | |
are excluded. | |
:param input_files: a list of labeled input files | |
:return: a dictionary where the keys are person_ids and the value is a list (0-age; 1-gender; 2-race; 3-condition | |
concepts set; 4-measurements set; 5-medications set) | |
''' | |
patient_dict = {} | |
cond_count = set() | |
meas_count = set() | |
med_count = set() | |
for query in input_files: | |
# return data from Google API | |
data = Google_API.GBQData(open(query[1]).read()) | |
# initialize progress bar progress bar | |
widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')] | |
pbar = ProgressBar(widgets=widgets, maxval=len(data)) | |
# create patient dictionary - keyed by OMOP descendant concept | |
if query[0] == 'person': | |
for row in pbar(data): | |
patient_dict[str(row[0])] = [[] for _ in range(6)] | |
patient_dict[str(row[0])][0].append(str(row[4])) | |
patient_dict[str(row[0])][1].append(str(row[2])) | |
patient_dict[str(row[0])][2].append(str(row[3])) | |
if query[0] == 'cond': | |
for row in pbar(data): | |
if str(row[2]) == 'No matching concept': | |
cond_count.add(row[4]) | |
if str(row[2]) != 'No matching concept': | |
patient_dict[str(row[0])][3].append(str(row[2])) | |
if query[0] == 'meas': | |
for row in pbar(data): | |
if str(row[2]) == 'No matching concept': | |
meas_count.add(row[4]) | |
if str(row[2]) != 'No matching concept': | |
patient_dict[str(row[0])][4].append(str(row[2])) | |
if query[0] == 'meds': | |
for row in pbar(data): | |
if str(row[2]) == 'No matching concept': | |
med_count.add(row[4]) | |
if str(row[2]) != 'No matching concept': | |
patient_dict[str(row[0])][5].append(str(row[2])) | |
# print counts by table of un-mappable codes | |
print 'From the condition data there were ' + str(len(cond_count)) + ' conditions with no SNOMED Code' | |
print 'From the measurement data there were ' + str(len(meas_count)) + ' measurements with no LOINC Code' | |
print 'From the medication data there were ' + str(len(med_count)) + ' drugs with no RxNorm Code' | |
pbar.finish() | |
return patient_dict | |
def SetSim(sim_dict, list1, list2): | |
''' | |
Function takes a dictionary of pre-computed semantic similarity scores and two lists of concepts. The function | |
computes all pairwise comparisons of the scores find the max score for each concept in the list (from comparing it | |
to all other concepts in the other set). The function then takes the sum of each set's concept max and divides it | |
by the total number of items in each set. | |
:param sim_dict: pre-computed semantic similarity scores (keys: pairs of concepts; values: scores) | |
:param list1: list of concepts | |
:param list2: list of concepts | |
:return: integer representing the similarity score | |
''' | |
#https://mor.nlm.nih.gov/pubs/pdf/2005-ismb_bioontologies-fja.pdf | |
list1_max = [] | |
list2_max = [] | |
for i in list1: | |
scores = [] | |
for j in list2: | |
scores.append(sim_dict[(i, j)]) | |
list1_max.append(max(scores)) | |
for i in list2: | |
scores = [] | |
for j in list1: | |
scores.append(sim_dict[(i, j)]) | |
list2_max.append(max(scores)) | |
return (sum(list1_max) + sum(list2_max)) / float((len(list1) + len(list2))) | |
def DiseaseSetSim(sim_dicts, list1, list2): | |
''' | |
Function takes a list of dictionaries that have pre-computed pairwise similarity scores and two list of lists | |
representing sets of concepts for each individual. The function then calculates similarity for the set via the | |
SetSim function and returns a list of scores. | |
:param sim_dicts: a list of dictionaries that have pre-computed pairwise similarity scores | |
:param list1: list of lists representing a set of concepts for an individual | |
:param list2: list of lists representing a set of concepts for an individual | |
:return: a list of scores | |
''' | |
# conditions - calculate disease set sim for each person | |
cond_sim = SetSim(sim_dicts[0], list1[3], list2[3]) | |
# measurements - calculate disease set sim for each person | |
meas_sim = SetSim(sim_dicts[1], list1[4], list2[4]) | |
# medications - calculate disease set sim for each person | |
med_sim = SetSim(sim_dicts[2], list1[5], list2[5]) | |
return cond_sim, meas_sim, med_sim | |
def AgeSim(ages, age1, age2): | |
''' | |
Function takes a lists of lists and two numbers representing two ages and using the range of ages in the data set | |
calculates the difference between the two ages and returns a proportion representing the difference between the two | |
ages. | |
:param: list of lists representing person-level data | |
:param age1: number representing an age | |
:param age2: number representing an age | |
:return: a proportion representing the difference between the two ages | |
''' | |
age_range = max(ages) - min(ages) | |
# convert set to int | |
age1 = float(list(age1)[0]) | |
age2 = float(list(age2)[0]) | |
# calculate similarity | |
sim = 1 - (abs(age1 - age2) / float(age_range)) | |
return sim | |
def PatientSimilarity(patient_data, ages, sim_dicts, weight): | |
patient_sim = {} | |
# initialize progress bar progress bar | |
widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')] | |
pbar = ProgressBar(widgets=widgets, maxval=len(patient_data.keys())) | |
# pairwise compare all patients | |
for i in pbar(range(0, len(patient_data.items()))): | |
for j in range(i, len(patient_data.items())): | |
p1 = patient_data.keys()[i] | |
p2 = patient_data.keys()[j] | |
p1_data = patient_data[p1] | |
p2_data = patient_data[p2] | |
# age | |
age = AgeSim(ages, p1_data[0], p2_data[0]) | |
# gender | |
gender = [1 if p1_data[1] == p2_data[1] else 0][0] | |
# race | |
race = [1 if p1_data[2] == p2_data[2] else 0][0] | |
# conditions, measurements, medications | |
DiseaseSetSim(sim_dicts, p1_data, p2_data) | |
# calculate patient similarity score | |
sim = [age, gender, race] + list(DiseaseSetSim(sim_dicts, p1_data, p2_data)) | |
# apply weight | |
sim_weight = [(1+float(y))*x for x,y in zip(sim, weight)] | |
# apply weighting to similarity | |
if p1 != p2: | |
patient_sim[tuple([p1, p2])] = sum(sim_weight) | |
else: | |
patient_sim[tuple([p1, p2])] = 0.0 | |
# normalize scores by the max semantic similarity score | |
max_score = max(patient_sim.values()) + 0.05 | |
sem_sim = {comp[0]: comp[1]/max_score if comp[0][0] != comp[0][1] else 0.5 for comp in patient_sim.items()} | |
# CHECK - make sure that the range is 1.0 | |
if max(sem_sim.values()) > 1.0: | |
raise ValueError('Problem with semantic similarity score normalization ') | |
pbar.finish() | |
return sem_sim | |
def DendroHeat(grps, data, method, title, filename): | |
''' | |
Function takes a dictionary of pairwise patient similarities (keys: tuples(p1, p2); values: similarity) and | |
and converts the dictionary to a matrix of distances. With this matrix a dendrogram and heatmap are produced and | |
the output the plot is written to the working directory. | |
:param grps: dictionary of group type (key) and patient_id (value) | |
:param data: a dictionary of pairwise patient similarities (keys: tuples(p1, p2); values: similarity) | |
:param method: linkage method to use ('average', 'single', 'centroid', 'complete') | |
:param title: a plot title | |
:param filename: string with location for where to write file | |
:return: a dendrogram and heatmap are produced and written to the working directory | |
''' | |
# code: https://github.com/maximilianh/crisporPaper/blob/master/heatmap.py; | |
# https://stackoverflow.com/questions/38705359/how-to-give-sns-clustermap-a-precomputed-distance-matrix | |
# convert data to matrix | |
unq_keys, key_idx = np.unique(np.array(data.keys()), return_inverse=True) | |
key_idx = key_idx.reshape(-1, 2) | |
matrix = np.zeros((len(unq_keys), len(unq_keys)), dtype=np.array(data.values()).dtype) | |
matrix[key_idx[:, 0], key_idx[:, 1]] = np.array(data.values()) | |
matrix += matrix.T | |
# subtract from 1 to convert similarity to distance | |
dist_matrix = 1 - matrix | |
# calculate linkage and produce a dendrogram | |
linkage = sch.linkage(dist.squareform(dist_matrix), method=str(method)) #metric = 'euclidean' | |
dendro = sch.dendrogram(linkage) | |
idx1 = dendro['leaves'] | |
# re-order columns grouped by linkage to be together | |
dist_matrix = dist_matrix[idx1, :]; dist_matrix = dist_matrix[:, idx1] | |
# create heat map | |
heatmap = pdh.DendroHeatMap(heat_map_data = dist_matrix, top_dendrogram = linkage) | |
heatmap.title = str(title) | |
heatmap.colormap = pylab.cm.YlGnBu | |
# heatmap.cluster_cb_colors = mpl.colors.LinearSegmentedColormap.from_list(name = "custom", | |
# colors = ['red','lime'], N=2) | |
# color dendrogram | |
sch.set_link_color_palette(['blue']) | |
# add labels to heat map | |
heatmap.col_labels = [str(grps[unq_keys[dendro['leaves'][x]]]) + '-' + str(unq_keys[dendro['leaves'][x]]) for x in range(dist_matrix.shape[1])] | |
# show plot and write to file | |
heatmap.show() | |
pylab.savefig(str(filename) + '.jpg', dpi=600) | |
def main(): | |
#### PATIENT DATA #### | |
# cystic fibrosis patients | |
cf_person = 'Queries/CF_person' | |
cf_cond_query = 'Queries/CF_conditions_query' | |
cf_ancestor_cond_query = 'Queries/CF_conds_ancest' | |
cf_lab_query = 'Queries/CF_measurement_query' | |
cf_ancestor_lab_query = 'Queries/CF_meas_ancest' | |
cf_med_query = 'Queries/CF_medications_query' | |
cf_ancestor_med_query = 'Queries/CF_meds_ancest' | |
# huntingtons chorea patients | |
hc_person = 'Queries/HC_person' | |
hc_cond_query = 'Queries/HC_conditions_query' | |
hc_ancestor_cond_query = 'Queries/HC_conds_ancest' | |
hc_lab_query = 'Queries/HC_measurement_query' | |
hc_ancestor_lab_query = 'Queries/HC_meas_ancest' | |
hc_med_query = 'Queries/HC_medications_query' | |
hc_ancestor_med_query = 'Queries/HC_meds_ancest' | |
# person - for calculating age range | |
ages = [x[4] for x in GBQData(open(cf_person).read()) + GBQData(open(hc_person).read())] | |
# set group labels by id | |
grp1 = {str(x[0]):'CF' for x in GBQData(open(cf_person).read())} | |
grp2 = {str(x[0]):'HC' for x in GBQData(open(hc_person).read())} | |
grps = dict(grp1.items() + grp2.items()) | |
## Patient Vectors ## | |
CF_patients = PatientDict([['person',cf_person], ['cond', cf_cond_query], ['meas', cf_lab_query], ['meds', cf_med_query]]) | |
# unmappable codes = conds(0); meas(0); meds(151) | |
HC_patients = PatientDict([['person', hc_person], ['cond', hc_cond_query'], ['meas', hc_lab_query], ['meds', hc_med_query]]) | |
# unmappable codes = conds(0); meas(0); meds(45) | |
CF_red = {k: CF_patients.get(k, None) for k in ('350177','582315')} | |
HC_red = {k: HC_patients.get(k, None) for k in ('151130', '434388')} | |
patient_data = dict(CF_patients.items() + HC_patients.items()) | |
#### ANCESTORS #### | |
# conditions | |
# conds + ancestors | |
conds = GBQData(open(cf_ancestor_cond_query).read()) + GBQData(open(hc_ancestor_cond_query).read()) | |
cond_ancestor = AncestorFinder(conds) | |
len(cond_ancestor) # 637 | |
# patient conds | |
patient_conds = GBQData(open(cf_cond_query).read()) + GBQData(open(hc_cond_query).read()) | |
# make sure all patient concepts are included in semantic similarity calculations | |
cond_ancestor = AncestorUpdate(cond_ancestor, patient_conds) | |
len(cond_ancestor) # 637 | |
# calculate semantic similarity | |
cond_sim = SemanticSimilarity(cond_ancestor) | |
## measurements | |
meas = GBQData(open(cf_ancestor_lab_query).read()) + GBQData(open(hc_ancestor_lab_query).read()) | |
meas_ancestor = AncestorFinder(meas) | |
len(meas_ancestor) #137 | |
# patient meas | |
patient_meas = GBQData(open(cf_lab_query).read()) + GBQData(open(hc_lab_query).read()) | |
# make sure all patient concepts are included in semantic similarity calculations | |
meas_ancestor = AncestorUpdate(meas_ancestor, patient_meas) | |
len(meas_ancestor) #137 | |
# calculate semantic similarity | |
meas_sim = SemanticSimilarity(meas_ancestor) | |
## medications | |
meds = GBQData(open(cf_ancestor_med_query).read()) + GBQData(open(HC_ancestor_med_query).read()) | |
meds_ancestor = AncestorFinder(meds) | |
len(meds_ancestor) #432 | |
# patient meds | |
patient_meds = GBQData(open(cf_med_query).read()) + GBQData(open(hc_med_query).read()) | |
# make sure all patient concepts are included in semantic similarity calculations | |
meds_ancestor = AncestorUpdate(meds_ancestor, patient_meds) | |
len(meds_ancestor) #460 | |
# calculate semantic similarity | |
meds_sim = SemanticSimilarity(meds_ancestor) | |
# put all similarity dictionaries in a list | |
sim_dicts = [cond_sim, meas_sim, meds_sim] | |
#### PATIENT SIMILARITY #### | |
weight = [-1.0, -1.0, -1.0, 0.0, 0.0, 0.0] | |
patient_similarity = PatientSimilarity(patient_data, ages, sim_dicts, weight) | |
# export dictionary | |
output_loc = 'Data/HC_CF_patient_similarity_full' | |
with open(output_loc, 'wb') as handle: | |
pickle.dump(patient_similarity, handle, protocol=pickle.HIGHEST_PROTOCOL) | |
with open(output_loc, 'rb') as handle: | |
patient_similarity = pickle.load(handle) | |
DendroHeat(grps, | |
patient_similarity, | |
'complete',"Patient Similarity: Huntington's Chorea (333.4) vs. Cystic Fibrosis (277.0) - Clinical", | |
"HCCF_heatMap_clinical") |
HC Queries
File: Queries/HC_person.sql
SELECT
p.person_id AS pat_id,
c1.concept_name AS ethnicity, --OMOP normalized concept code name
c2.concept_name AS gender, --OMOP normalized concept code name
c3.concept_name AS race, --OMOP normalized concept code name
ROUND(DATEDIFF(CURRENT_DATE(), p.time_of_birth)/365.25) AS age
FROM db.person p
INNER JOIN db.concept c1
ON c1.concept_id = p.ethnicity_concept_id
INNER JOIN db.concept c2
ON c2.concept_id = p.gender_concept_id
INNER JOIN db.concept c3
ON c3.concept_id = p.race_concept_id
INNER JOIN db.HC_patients pi
ON pi.person_id = p.person_id
;
File: Queries/HC_conditions_query.sql
SELECT
co.person_id AS pat_id,
co.condition_concept_id AS omop_cond_id, --OMOP concept id
c1.concept_code AS source_cond_id,
c1.concept_name,
co.condition_source_value
FROM db.condition_occurrence co
INNER JOIN db.concept c1
ON c1.concept_id = co.condition_concept_id
INNER JOIN db.HC_patients p
ON p.person_id = co.person_id
-- GROUP BY pat_id
;
File: HC_conds_ancest.sql
SELECT
ca.ancestor_concept_id AS omop_ancestor_id,
ca.descendant_concept_id AS omop_descendant_id,
c2.concept_class_id AS ancestor_concept_class_id,
c2.concept_code AS ancestor_source_code,
c2.concept_name AS ancestor_code_name,
c1.concept_code AS descendant_source_code,
c1.concept_name AS descendant_code_name
FROM db.concept_ancestor_tc ca
INNER JOIN db.concept c2
ON ca.ancestor_concept_id = c2.concept_id
INNER JOIN db.concept c1
ON ca.descendant_concept_id = c1.concept_id
-- get concept ancestors (get the parents of the OMOP concept id)
WHERE c2.vocabulary_id = 'SNOMED' AND ca.descendant_concept_id IN
(SELECT
co.condition_concept_id AS omop_cond_id, --OMOP concept id
FROM db.condition_occurrence co
I. NNER JOIN db.HC_patients p
ON p.person_id = co.person_id)
;
File: Queries/HC_measurement_query.sql
SELECT
m.person_id AS pat_id,
m.measurement_concept_id, --OMOP concept id
c1.concept_code, --source mapped code
c1.concept_name,
m.measurement_source_value
FROM db.measurement m
INNER JOIN db.concept c1
ON c1.concept_id = m.measurement_concept_id
INNER JOIN db.HC_patients pi
ON pi.person_id = m.person_id
;
File: Queries/HC_meas_ancest.sql
SELECT
ca.ancestor_concept_id AS omop_ancestor_id,
ca.descendant_concept_id AS omop_descendant_id,
c2.concept_class_id AS ancestor_concept_class_id,
c2.concept_code AS ancestor_source_code,
c2.concept_name AS ancestor_code_name,
c1.concept_code AS descendant_source_code,
c1.concept_name AS descendant_code_name
FROM db.concept_ancestor_tc ca
INNER JOIN db.concept c2
ON ca.ancestor_concept_id = c2.concept_id
INNER JOIN db.concept c1
ON ca.descendant_concept_id = c1.concept_id
-- get concept ancestors (get the parents of the OMOP concept id)
WHERE
c2.concept_class_id = 'LOINC Class' AND ca.descendant_concept_id IN
(SELECT
m.measurement_concept_id, --OMOP concept id
FROM db.measurement m
INNER JOIN db.HC_patients pi
ON pi.person_id = m.person_id)
;
File: Queries/HC_medications_query.sql
SELECT
d.person_id AS pat_id,
d.drug_concept_id, --OMOP concept id
c1.concept_code,
c1.concept_name,
d.drug_source_value
FROM db.drug_exposure d
INNER JOIN db.concept c1
ON c1.concept_id = d.drug_concept_id
INNER JOIN db.HC_patients pi
ON pi.person_id = d.person_id
;
File: Queries/HC_meds_ancest.sql
SELECT
ca.ancestor_concept_id AS omop_ancestor_id,
ca.descendant_concept_id AS omop_descendant_id,
c2.concept_class_id AS ancestor_concept_class_id,
c2.concept_code AS ancestor_source_code,
c2.concept_name AS ancestor_code_name,
c1.concept_code AS descendant_source_code,
c1.concept_name AS descendant_code_name
FROM db.concept_ancestor_tc ca
INNER JOIN db.concept c2
ON ca.ancestor_concept_id = c2.concept_id
INNER JOIN db.concept c1
ON ca.descendant_concept_id = c1.concept_id
-- get concept ancestors (get the parents of the OMOP concept id)
WHERE c2.concept_class_id = 'VA Class' AND ca.descendant_concept_id IN
(SELECT
d.drug_concept_id, --OMOP concept id
FROM db.drug_exposure d
INNER JOIN db.HC_patients pi
ON pi.person_id = d.person_id)
;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
CF Queries
File:
Queries/CF_person.sql
File:
Queries/CF_conditions_query.sql
File:
CF_conds_ancest.sql
File:
Queries/CF_measurement_query.sql
File:
Queries/CF_meas_ancest.sql
File:
Queries/CF_medications_query.sql
File:
Queries/CF_meds_ancest.sql