Skip to content

Instantly share code, notes, and snippets.

@linuskohl
Created June 26, 2020 20:18
Show Gist options
  • Save linuskohl/886e3fe9398209fb4c182a880cbda9c6 to your computer and use it in GitHub Desktop.
Save linuskohl/886e3fe9398209fb4c182a880cbda9c6 to your computer and use it in GitHub Desktop.
# Build the evaluation DataFrame: BIOSSES sentence pairings plus the metadata
# needed for scoring (expert average rating 'Avg' and its variance 'Var').
evaluation = pd.DataFrame(biosses_meta.loc[:, ['Text1', 'Text2', 'Avg', 'Var']])

# Attach the UMLS annotations (CUIs and terms) for both sentences of each pairing,
# looked up by sentence identifier in biosses_texts.
for source_col, suffix in (('UMLS_CUIs', '_CUIs'), ('UMLS_Terms', '_UMLS_TERMS')):
    for side in ('Text1', 'Text2'):
        evaluation[side + suffix] = evaluation[side].map(
            lambda key, col=source_col: biosses_texts.loc[key, col]
        )

# Finally replace the sentence identifiers with the raw texts (done last,
# because the annotation lookups above key on the identifiers).
for side in ('Text1', 'Text2'):
    evaluation[side] = evaluation[side].map(lambda key: biosses_texts.loc[key, 'Text'])
# Helper function for multiprocessing
def split_process(function, dataframe, params, num_processes=1):
    """Split a DataFrame into chunks and process them in parallel worker processes.

    Args:
        function (callable): Called as ``function(params, chunk)`` for each chunk;
            must return a DataFrame and be picklable when ``num_processes > 1``.
        dataframe (pandas.DataFrame): DataFrame to be chunked and processed.
        params (dict): Settings forwarded to ``function``.
        num_processes (int): Number of worker processes.

    Returns:
        pandas.DataFrame: Concatenation of the per-chunk results, in input order.
    """
    # Run in-process when no parallelism is requested — avoids pool startup
    # and pickling overhead, and gives the same result for per-row functions.
    if num_processes <= 1:
        return function(params, dataframe)
    # Guard against a zero chunk size when there are fewer rows than processes;
    # a step of 0 would make range() raise ValueError.
    num_rows = max(1, len(dataframe) // num_processes)
    # Position-based slicing (iloc) is correct even with duplicate index labels,
    # unlike label-based .loc on a slice of the index.
    chunks = [dataframe.iloc[i:i + num_rows] for i in range(0, len(dataframe), num_rows)]
    func = partial(function, params)
    # Context manager terminates the worker processes even if map() raises.
    with mp.Pool(processes=num_processes) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)
# Text similarity function
def calc_cui_similarities(params, data):
    """Calculate the average similarity over all CUI pairings of two texts.

    For each row, takes the cross product of the CUI lists in the two
    configured columns, looks each pairing's similarity up in the global
    ``cui_table``, and writes the mean and the pairing count back to the row.

    Args:
        params (dict): Settings — 'col_0_name'/'col_1_name' (CUI list columns),
            'similarity' (column of ``cui_table`` to read), 'output_name'
            (prefix for the output columns).
        data (DataFrame): DataFrame chunk to be processed (mutated in place).

    Returns:
        pandas.DataFrame: The same DataFrame with '<output_name>' (mean
        similarity, 0.0 if no pairing resolved) and '<output_name>_cnt'
        (number of resolved pairings) filled in.
    """
    for idx, row in data.iterrows():
        similarities = []
        pairings = list(itertools.product(row[params['col_0_name']], row[params['col_1_name']]))
        for pairing in pairings:
            if pairing[0] == pairing[1]:
                # Identical CUIs are maximally similar by definition.
                similarities.append(1)
            else:
                try:
                    similarities.append(cui_table.at[pairing, params['similarity']])
                except KeyError:
                    # Pairing not in the precomputed table — skip it rather
                    # than fail; only this specific miss is tolerated.
                    pass
        if len(similarities) > 0:
            data.at[idx, "{}_cnt".format(params['output_name'])] = len(similarities)
            data.at[idx, params['output_name']] = mean(similarities)
        else:
            # No resolvable pairing: record an explicit zero instead of NaN.
            data.at[idx, "{}_cnt".format(params['output_name'])] = 0
            data.at[idx, params['output_name']] = 0.0
    return data
# For each UMLS similarity measure, compute the text-pair similarity and the
# error against the expert rating, defined as | expert rating - similarity |.
for measure in ('path', 'lch', 'wup'):
    sim_col = f"{measure}_similarity"
    evaluation = split_process(
        calc_cui_similarities,
        evaluation,
        {
            'col_0_name': 'Text1_CUIs',
            'col_1_name': 'Text2_CUIs',
            'similarity': measure,
            'output_name': sim_col,
        },
        num_processes=10,
    )
    # Absolute deviation from the human gold-standard average rating.
    evaluation[f"{measure}_error"] = abs(evaluation['Avg'] - evaluation[sim_col])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment