Created
June 26, 2020 19:49
-
-
Save linuskohl/892ed937432e8b5497e860d4cf61c029 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
import os | |
import string | |
import csv | |
import xml | |
import re | |
import unicodedata | |
import itertools | |
import requests | |
from functools import partial | |
import multiprocessing as mp | |
from statistics import mean | |
from sklearn.preprocessing import minmax_scale | |
import pandas as pd | |
import numpy as np | |
from quickumls import QuickUMLS | |
# Initialize QuickUMLS
# UMLS data available in the quickumls directory
# overlapping_criteria="length" sets precedence for longest match
# threshold=0.7 is handed to QuickUMLS unchanged.
# NOTE(review): presumably the minimum similarity score a candidate string
# needs in order to count as a match - confirm against the QuickUMLS docs.
umls_matcher = QuickUMLS("./quickumls", overlapping_criteria="length", threshold=0.7)
# Helper functions | |
def load_csv_from_url(url, index_col, timeout=30):
    """Loads CSV file from url

    Args:
        url (str): The URL of the CSV file
        index_col (str): Name of the index column
        timeout (float): Seconds to wait for the server before giving up

    Returns:
        pandas.DataFrame: DataFrame containing the table

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status
        requests.Timeout: If the server does not answer within `timeout`
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors; otherwise the body of an error page would
    # be decoded and handed to the CSV parser as if it were the dataset.
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text), index_col=index_col)
def get_umls_terms(text):
    """Collects the UMLS concepts that QuickUMLS finds in a text

    Args:
        text (str): Text to extract terms from

    Returns:
        list: One dictionary per match, holding the matched 'term' and its 'cui'
    """
    match_groups = umls_matcher.match(text, best_match=True, ignore_syntax=False)
    # The matcher returns an iterable of groups, each holding candidate
    # dictionaries; flatten them and keep only the two fields we need.
    return [
        {'term': candidate['term'], 'cui': candidate['cui']}
        for group in match_groups
        for candidate in group
    ]
# Load BIOSSES dataset
biosses_texts_url = "https://gist.githubusercontent.com/linuskohl/5b6f82e9cd0b1ad50e5a57fa48210371/raw/46cab550499c8ffb7cc9e49f61639ae707028c13/biosses_texts.csv"
biosses_meta_url = "https://gist.githubusercontent.com/linuskohl/a037ea921af159f1f95a55ae82a21d43/raw/92b20bf4f898ca19d947d6f552098868f87f0b12/biosses_meta.csv"
biosses_texts = load_csv_from_url(biosses_texts_url, "Id")
biosses_meta = load_csv_from_url(biosses_meta_url, "Id")

# Scale the average rating to [0,1].
# minmax_scale maps the smallest and largest value found in the column to
# 0 and 1, so this equals a [0,4] -> [0,1] mapping only if both extremes
# of the rating scale actually occur in the data.
biosses_meta['Avg'] = minmax_scale(biosses_meta['Avg'])

# Extract terms from texts.
# Run the matcher once per text and assign each result column as a whole.
# Writing a list into a single cell via .loc on a float NaN column is
# fragile (pandas tries to align the list with the selection), and the
# np.NaN alias used for pre-filling no longer exists in NumPy 2.0.
extracted_terms = [get_umls_terms(text) for text in biosses_texts['Text']]
biosses_texts['UMLS_Terms'] = pd.Series(
    [[term['term'] for term in terms] for terms in extracted_terms],
    index=biosses_texts.index,
    dtype=object,
)
biosses_texts['UMLS_CUIs'] = pd.Series(
    [[term['cui'] for term in terms] for terms in extracted_terms],
    index=biosses_texts.index,
    dtype=object,
)

# Generate unique list of all CUIs that occur in the texts.
# explode() turns texts without any match (empty lists) into NaN, which
# dropna() removes; sorting makes the exported file reproducible across runs.
biosses_cuis = sorted(biosses_texts['UMLS_CUIs'].explode().dropna().unique())

# Create a dataframe of pairings that we need to calculate the distance for:
# every ordered pair of two different CUIs.
cui_pairings = pd.DataFrame(list(itertools.permutations(biosses_cuis, 2)))

# Export to CSV
cui_pairings.to_csv('./cui_pairings.csv', index=False, header=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment