Assign keywords to papers through named entity recognition
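# Setup sketch (the version pin and release URL below are assumptions; check
# the scispacy releases page for the current model download link):
#   pip install requests spacy scispacy
#   pip install https://s3-us-west-2.amazonaws.com/ai2-public-datasets/scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz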
from datetime import datetime, timedelta
import requests
import spacy
import scispacy
from scispacy.linking import EntityLinker

# Load the NLP model. Smaller, therefore faster, but potentially at the expense of accuracy
nlp = spacy.load("en_core_sci_sm")

# Add the EntityLinker - map entities to those found in MeSH.
nlp.add_pipe("scispacy_linker", config={
    "resolve_abbreviations": True,
    "linker_name": "mesh",
    "max_entities_per_mention": 2  # Speeds things up
})
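
# With the linker attached, each recognised entity exposes its candidate MeSH
# matches via `ent._.kb_ents`: a list of (concept_id, similarity_score) tuples,
# best match first, capped here at two per mention. The linker's knowledge base
# maps a concept ID to its full record, e.g.
#   linker.kb.cui_to_entity[concept_id].canonical_name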
def abstract_processing(abstract, score_threshold=0.90):
    """
    Extract MeSH-linked entities from an abstract.

    Parameters:
    abstract (str): The abstract from the paper.
    score_threshold (float): The similarity score an entity must exceed to make the shortlist of keywords.

    Returns:
    keywords (list): A list of the top 5 MeSH entities associated with the abstract.
    """
    doc = nlp(abstract)
    linker = nlp.get_pipe("scispacy_linker")
    possible_keywords = []
    ranked_keywords = []
    seen_ids = set()
    for ent in doc.ents:
        for cui, score in ent._.kb_ents:
            if score > score_threshold and cui not in seen_ids:
                entity = linker.kb.cui_to_entity[cui]
                possible_keywords.append((entity.canonical_name, score))
                seen_ids.add(cui)
    # print(possible_keywords)  # Uncomment to inspect all candidate keywords
    # Sort found entities by score (highest first), then keep the top 5
    for name, score in sorted(possible_keywords, key=lambda x: x[1], reverse=True):
        ranked_keywords.append(name)
        if len(ranked_keywords) == 5:
            break
    return ranked_keywords
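
# Example usage (the abstract text is illustrative; the keywords returned
# depend on the installed model and MeSH knowledge base, so none are shown):
#   kws = abstract_processing(
#       "Gold nanoparticles acquire a protein corona upon exposure to serum.",
#       score_threshold=0.90,
#   )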
def get_chemarxiv_papers(term, search_from, limit):
    """
    Fetch papers from ChemRxiv based on a query.

    Parameters:
    term (str): The search term.
    search_from (str): Start date for the search (YYYY-MM-DD).
    limit (int): Maximum number of results to fetch.

    Returns:
    list: A list of papers matching the query, or an empty list on error.
    """
    # Define endpoint and parameters
    base_url = "https://chemrxiv.org/engage/chemrxiv/public-api"
    endpoint = "/v1/items"
    url = f"{base_url}{endpoint}"
    parameters = {
        "term": term,
        "searchDateFrom": search_from,
        "limit": limit
    }
    try:
        # Make the API request
        response = requests.get(url, params=parameters)
        response.raise_for_status()  # Raise an error for bad status codes
        # Parse the JSON response
        data = response.json()
        # print(data)  # Uncomment to inspect the raw response
        return data.get("itemHits", [])
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.RequestException as err:
        print(f"Other error occurred: {err}")
    return []  # On any request error, return an empty list so callers can still iterate
if __name__ == "__main__":
    # Define parameters for the ChemRxiv search query
    term = "'protein corona'"
    search_from = (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d")
    limit = 5
    # Fetch papers
    chemarxiv_papers = get_chemarxiv_papers(term, search_from, limit)
    total_count = len(chemarxiv_papers)  # Total number of papers retrieved
    print(f"Total number of papers retrieved from ChemRxiv: {total_count}")
    # Define the score threshold used when NER parses keywords from the abstract
    score_threshold = 0.90
    for paper_hit in chemarxiv_papers:
        paper = paper_hit.get("item", {})
        title = paper.get('title', 'No Title')
        abstract = paper.get('abstract', 'N/A')
        authors = paper.get('authors', [])
        author_names = ", ".join(
            [f"{author.get('firstName', '')} {author.get('lastName', '')}" for author in authors]
        )
        keywords = paper.get('keywords', [])
        # Fall back to NER-derived keywords when the paper supplies none
        if not keywords:
            keywords = abstract_processing(abstract, score_threshold)
        print(f"Title: {title}")
        print(f"Authors: {author_names}")
        print(f"Abstract: {abstract}")
        print(f"Keywords: {keywords}")
        print(f"DOI: {paper.get('doi', 'N/A')}")
        print(f"Published Date: {paper.get('publishedDate', 'N/A')}")