Created
June 26, 2020 20:18
-
-
Save linuskohl/886e3fe9398209fb4c182a880cbda9c6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the evaluation DataFrame: BIOSSES sentence pairings plus the expert
# rating average/variance needed to score the similarity metrics later.
evaluation = pd.DataFrame(biosses_meta.loc[:, ['Text1', 'Text2', 'Avg', 'Var']])
# Attach the UMLS CUI lists for each side of the pairing.
for text_col in ('Text1', 'Text2'):
    evaluation[text_col + '_CUIs'] = evaluation[text_col].map(lambda key: biosses_texts.loc[key, 'UMLS_CUIs'])
# Attach the human-readable UMLS terms for each side.
for text_col in ('Text1', 'Text2'):
    evaluation[text_col + '_UMLS_TERMS'] = evaluation[text_col].map(lambda key: biosses_texts.loc[key, 'UMLS_Terms'])
# Replace the text identifiers with the raw sentences last, so the lookups
# above could still use the identifiers as keys into biosses_texts.
for text_col in ('Text1', 'Text2'):
    evaluation[text_col] = evaluation[text_col].map(lambda key: biosses_texts.loc[key, 'Text'])
# Helper function for multiprocessing
def split_process(function, dataframe, params, num_processes=1):
    """Splits DataFrame into multiple chunks and processes them in multiple processes

    Args:
        function (function): Function to be called as ``function(params, chunk)``
        dataframe (DataFrame): DataFrame to be chunked and processed
        params (dict): Dictionary containing settings for function
        num_processes (int): Number of processes

    Returns:
        pandas.DataFrame: DataFrame containing the result
    """
    # Guard against num_processes > len(dataframe): the original computed a
    # chunk size of 0, which made range(..., 0) raise ValueError.
    chunk_size = max(1, dataframe.shape[0] // max(1, num_processes))
    # Positional slicing (iloc) instead of label-based .loc over index labels:
    # label lookups can return extra rows when the index has duplicates.
    chunks = [dataframe.iloc[i:i + chunk_size] for i in range(0, dataframe.shape[0], chunk_size)]
    func = partial(function, params)
    # Context manager terminates and joins the workers; the original leaked
    # the pool (it was never closed).
    with mp.Pool(processes=num_processes) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)
# Text similarity function
def calc_cui_similarities(params, data):
    """Calculates average of similarities between all CUIs in a text

    For every row, builds the cross product of the CUI lists of both texts
    and averages the pairwise similarities looked up in the module-level
    ``cui_table``. Identical CUIs count as similarity 1; pairings missing
    from the table are skipped.

    Args:
        params (dict): Settings; expects keys 'col_0_name', 'col_1_name',
            'similarity' (column name in ``cui_table``) and 'output_name'
        data (DataFrame): DataFrame to be processed (also modified in place)

    Returns:
        pandas.DataFrame: DataFrame containing the result
    """
    # Hoist invariant dict lookups out of the row loop.
    col_0 = params['col_0_name']
    col_1 = params['col_1_name']
    sim_col = params['similarity']
    out_col = params['output_name']
    cnt_col = "{}_cnt".format(out_col)
    for idx, row in data.iterrows():
        similarities = []
        for cui_a, cui_b in itertools.product(row[col_0], row[col_1]):
            if cui_a == cui_b:
                # Identical concepts are maximally similar by definition.
                similarities.append(1)
            else:
                try:
                    similarities.append(cui_table.at[(cui_a, cui_b), sim_col])
                except KeyError:
                    # Pairing absent from the similarity table — skip it.
                    # The original bare `except` also hid real bugs (typos,
                    # bad columns); only the expected missing key is
                    # swallowed now.
                    pass
        if similarities:
            data.at[idx, cnt_col] = len(similarities)
            data.at[idx, out_col] = mean(similarities)
        else:
            # No comparable pairing at all: record an explicit zero so the
            # output columns exist for every row.
            data.at[idx, cnt_col] = 0
            data.at[idx, out_col] = 0.0
    return data
# Now calculate similarity and the error, defined as | expert rating - similarity | for each pair of texts in the BIOSSES dataset.
for metric in ('path', 'lch', 'wup'):
    sim_col = f"{metric}_similarity"
    err_col = f"{metric}_error"
    settings = {
        'col_0_name': 'Text1_CUIs',
        'col_1_name': 'Text2_CUIs',
        'similarity': metric,
        'output_name': sim_col,
    }
    # Fan the per-row similarity computation out over 10 worker processes.
    evaluation = split_process(calc_cui_similarities, evaluation, settings, num_processes=10)
    # Absolute deviation between the expert rating and the computed similarity.
    evaluation[err_col] = (evaluation['Avg'] - evaluation[sim_col]).abs()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment