linuskohl · June 26, 2020 19:49
diff --git a/umls_similarity_article_p1.py b/umls_similarity_article_p1.py
 import io
 import os
 import string
 import csv
 import xml
 import re
 import unicodedata
 import itertools
 import requests
 from functools import partial
 import multiprocessing as mp
 from statistics import mean 
 from sklearn.preprocessing import minmax_scale
 import pandas as pd
 import numpy as np
 from quickumls import QuickUMLS

 # Build the shared QuickUMLS matcher used by the helper functions below.
 # The UMLS index is read from the local "./quickumls" directory.
 # overlapping_criteria="length" gives precedence to the longest match
 # when candidate spans overlap; threshold=0.7 is forwarded to QuickUMLS
 # as its similarity threshold.
 umls_matcher = QuickUMLS(
     "./quickumls",
     overlapping_criteria="length",
     threshold=0.7,
 )

 # Helper functions
 def load_csv_from_url(url, index_col):
     """Download a CSV file over HTTP and parse it into a DataFrame.

     Args:
         url (str): The URL of the CSV file
         index_col (str): Name of the index column
     Returns:
         pandas.DataFrame: DataFrame containing the table
     Raises:
         requests.HTTPError: If the server responds with a 4xx/5xx status
         requests.Timeout: If the server does not answer within the timeout
     """
     # A timeout keeps the script from hanging forever on a stalled connection
     response = requests.get(url, timeout=30)
     # Fail loudly instead of feeding an HTML error page to the CSV parser
     response.raise_for_status()
     csv_text = response.content.decode('utf-8')
     return pd.read_csv(io.StringIO(csv_text), index_col=index_col)

 def get_umls_terms(text):
     """Collect the UMLS concepts that the module-level matcher finds in a text.

     Args:
         text (str): Text to extract terms from
     Returns:
         list: One dictionary per matched candidate, holding its 'term'
             and 'cui' values
     """
     matches = umls_matcher.match(text, best_match=True, ignore_syntax=False)
     # Each match is itself a collection of candidates; flatten them all
     return [
         {'term': candidate['term'], 'cui': candidate['cui']}
         for match in matches
         for candidate in match
     ]

 # Load BIOSSES dataset (sentence texts and rating metadata, both indexed by "Id")
 biosses_texts_url = "https://gist.githubusercontent.com/linuskohl/5b6f82e9cd0b1ad50e5a57fa48210371/raw/46cab550499c8ffb7cc9e49f61639ae707028c13/biosses_texts.csv"
 biosses_meta_url = "https://gist.githubusercontent.com/linuskohl/a037ea921af159f1f95a55ae82a21d43/raw/92b20bf4f898ca19d947d6f552098868f87f0b12/biosses_meta.csv"
 biosses_texts = load_csv_from_url(biosses_texts_url, "Id")
 biosses_meta = load_csv_from_url(biosses_meta_url, "Id")
 # Scale the average rating from [0,4] to [0,1].
 # NOTE: minmax_scale normalises by the observed min/max of the column, so
 # this equals the [0,4] -> [0,1] mapping only if both extremes occur in the data.
 biosses_meta['Avg'] = minmax_scale(biosses_meta['Avg'])

 # Extract terms from texts.
 # Whole columns are assigned at once: writing a list into a single cell with
 # .loc on a NaN-initialised float column is fragile (pandas may treat the list
 # as values to spread over the selection and raise ValueError), and the
 # np.NaN alias used for that initialisation no longer exists in NumPy 2.
 extracted_terms = biosses_texts['Text'].map(get_umls_terms)
 biosses_texts['UMLS_Terms'] = extracted_terms.map(lambda terms: [term['term'] for term in terms])
 biosses_texts['UMLS_CUIs'] = extracted_terms.map(lambda terms: [term['cui'] for term in terms])

 # Generate unique list of all CUIs that occur in the texts.
 # explode() yields NaN for texts without any match, which dropna() removes;
 # unique() keeps first-appearance order, so the export is reproducible
 # (iterating a set of strings is not stable across interpreter runs).
 biosses_cuis = biosses_texts['UMLS_CUIs'].explode().dropna().unique().tolist()

 # Create a dataframe of pairings that we need to calculate the distance for:
 # every ordered pair of two distinct CUIs
 cui_pairings = pd.DataFrame(list(itertools.permutations(biosses_cuis, 2)))

 # Export to CSV
 cui_pairings.to_csv('./cui_pairings.csv', index=False, header=False)
	import io
	import os
	import string
	import csv
	import xml
	import re
	import unicodedata
	import itertools
	import requests
	from functools import partial
	import multiprocessing as mp
	from statistics import mean
	from sklearn.preprocessing import minmax_scale
	import pandas as pd
	import numpy as np
	from quickumls import QuickUMLS

	# Build the shared QuickUMLS matcher used by the helper functions below.
	# The UMLS index is read from the local "./quickumls" directory.
	# overlapping_criteria="length" gives precedence to the longest match
	# when candidate spans overlap; threshold=0.7 is forwarded to QuickUMLS
	# as its similarity threshold.
	umls_matcher = QuickUMLS(
	    "./quickumls",
	    overlapping_criteria="length",
	    threshold=0.7,
	)

	# Helper functions
	def load_csv_from_url(url, index_col):
	    """Download a CSV file over HTTP and parse it into a DataFrame.

	    Args:
	        url (str): The URL of the CSV file
	        index_col (str): Name of the index column
	    Returns:
	        pandas.DataFrame: DataFrame containing the table
	    Raises:
	        requests.HTTPError: If the server responds with a 4xx/5xx status
	        requests.Timeout: If the server does not answer within the timeout
	    """
	    # A timeout keeps the script from hanging forever on a stalled connection
	    response = requests.get(url, timeout=30)
	    # Fail loudly instead of feeding an HTML error page to the CSV parser
	    response.raise_for_status()
	    csv_text = response.content.decode('utf-8')
	    return pd.read_csv(io.StringIO(csv_text), index_col=index_col)

	def get_umls_terms(text):
	    """Extract UMLS terms from a text using the module-level matcher.

	    Args:
	        text (str): Text to extract terms from
	    Returns:
	        list: List of dictionaries, each containing a 'term' and its 'cui'
	    """
	    terms = []
	    results = umls_matcher.match(text, best_match=True, ignore_syntax=False)
	    # Each result is a collection of candidates; keep term and CUI of each
	    for result in results:
	        for x in result:
	            terms.append({'term': x['term'], 'cui': x['cui']})
	    return terms

	# Load BIOSSES dataset (sentence texts and rating metadata, both indexed by "Id")
	biosses_texts_url = "https://gist.githubusercontent.com/linuskohl/5b6f82e9cd0b1ad50e5a57fa48210371/raw/46cab550499c8ffb7cc9e49f61639ae707028c13/biosses_texts.csv"
	biosses_meta_url = "https://gist.githubusercontent.com/linuskohl/a037ea921af159f1f95a55ae82a21d43/raw/92b20bf4f898ca19d947d6f552098868f87f0b12/biosses_meta.csv"
	biosses_texts = load_csv_from_url(biosses_texts_url, "Id")
	biosses_meta = load_csv_from_url(biosses_meta_url, "Id")
	# Scale the average rating from [0,4] to [0,1].
	# NOTE: minmax_scale normalises by the observed min/max of the column, so
	# this equals the [0,4] -> [0,1] mapping only if both extremes occur in the data.
	biosses_meta['Avg'] = minmax_scale(biosses_meta['Avg'])

	# Extract terms from texts.
	# Whole columns are assigned at once: writing a list into a single cell with
	# .loc on a NaN-initialised float column is fragile (pandas may treat the list
	# as values to spread over the selection and raise ValueError), and the
	# np.NaN alias used for that initialisation no longer exists in NumPy 2.
	extracted_terms = biosses_texts['Text'].map(get_umls_terms)
	biosses_texts['UMLS_Terms'] = extracted_terms.map(lambda terms: [term['term'] for term in terms])
	biosses_texts['UMLS_CUIs'] = extracted_terms.map(lambda terms: [term['cui'] for term in terms])

	# Generate unique list of all CUIs that occur in the texts.
	# explode() yields NaN for texts without any match, which dropna() removes;
	# unique() keeps first-appearance order, so the export is reproducible
	# (iterating a set of strings is not stable across interpreter runs).
	biosses_cuis = biosses_texts['UMLS_CUIs'].explode().dropna().unique().tolist()

	# Create a dataframe of pairings that we need to calculate the distance for:
	# every ordered pair of two distinct CUIs
	cui_pairings = pd.DataFrame(list(itertools.permutations(biosses_cuis, 2)))

	# Export to CSV
	cui_pairings.to_csv('./cui_pairings.csv', index=False, header=False)