Skip to content

Instantly share code, notes, and snippets.

@kiri-thornalley
Created June 24, 2025 12:43
Show Gist options
  • Save kiri-thornalley/f58f69b85baee5df1d6c6deb601e18a2 to your computer and use it in GitHub Desktop.
Save kiri-thornalley/f58f69b85baee5df1d6c6deb601e18a2 to your computer and use it in GitHub Desktop.
Assign keywords for papers through named entity recognition
from datetime import datetime, timedelta
import requests
import spacy
import scispacy
from scispacy.linking import EntityLinker
# Load the small scispaCy model: quicker to run, possibly less accurate than
# the larger alternatives.
nlp = spacy.load("en_core_sci_sm")

# Attach the entity linker so recognised mentions are mapped to MeSH entries.
nlp.add_pipe(
    "scispacy_linker",
    config=dict(
        resolve_abbreviations=True,
        linker_name="mesh",
        max_entities_per_mention=2,  # fewer candidates per mention speeds things up
    ),
)
def abstract_processing(abstract, score_threshold=0.90, max_keywords=5):
    """
    Extract linked entities from an abstract and return the best-scoring ones.

    The abstract is run through the module-level ``nlp`` pipeline; every
    candidate produced by its ``scispacy_linker`` pipe whose score exceeds
    ``score_threshold`` is collected once per concept ID (CUI).

    Parameters:
        abstract (str): The abstract from the paper. ``None``, an empty string
            or whitespace-only text returns an empty list without invoking
            the NLP pipeline.
        score_threshold (float): The similarity score the entity must exceed
            for it to make the shortlist of keywords.
        max_keywords (int): Maximum number of keywords to return (default 5).
            Values below zero are treated as zero.

    Returns:
        keywords (list): Canonical names of at most ``max_keywords`` linked
            entities, ordered from highest to lowest score. Entities with
            equal scores keep the order in which they were first seen.
    """
    # Nothing to analyse: avoid handing None/blank text to the pipeline.
    if not abstract or not abstract.strip():
        return []

    doc = nlp(abstract)
    linker = nlp.get_pipe("scispacy_linker")

    possible_keywords = []
    seen_ids = set()  # CUIs already shortlisted, so each concept appears once
    for ent in doc.ents:
        for cui, score in ent._.kb_ents:
            if score > score_threshold and cui not in seen_ids:
                entity = linker.kb.cui_to_entity[cui]
                possible_keywords.append((entity.canonical_name, score))
                seen_ids.add(cui)

    # Stable sort by score (descending), then keep only the top entries.
    possible_keywords.sort(key=lambda item: item[1], reverse=True)
    return [name for name, _ in possible_keywords[:max(0, max_keywords)]]
def get_chemarxiv_papers(term, search_from, limit):
    """
    Fetch papers from ChemRxiv based on a search term.

    Parameters:
        term (str): The search term.
        search_from (str): Start date for the search (YYYY-MM-DD).
        limit (int): Maximum number of results to fetch.

    Returns:
        list: The entries found under "itemHits" in the API response. An empty
            list is returned when the request fails, the response is not valid
            JSON, or the response contains no hits; the error is printed.
    """
    # Define endpoint and parameters
    base_url = "https://chemrxiv.org/engage/chemrxiv/public-api"
    endpoint = "/v1/items"
    url = f"{base_url}{endpoint}"
    parameters = {
        "term": term,
        "searchDateFrom": search_from,
        "limit": limit,
    }

    try:
        # A timeout keeps the script from hanging forever on a stalled connection.
        response = requests.get(url, params=parameters, timeout=30)
        response.raise_for_status()  # Raise an error for bad status codes
        data = response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return []
    except (requests.exceptions.RequestException, ValueError) as err:
        # ValueError covers JSON decoding failures on requests versions where
        # the decode error is not a RequestException subclass.
        print(f"Other error occurred: {err}")
        return []

    # Always hand back a list so callers can take len() and iterate safely.
    return data.get("itemHits") or []
if __name__ == "__main__":
    # Define parameters for search queries - ChemArXiv
    term = "'protein corona'"
    search_from = (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d")
    limit = 5

    # Fetch papers and report how many were retrieved.
    chemarxiv_papers = get_chemarxiv_papers(term, search_from, limit)
    total_count = len(chemarxiv_papers)
    print(f"Total number of papers retrieved from ChemArXiv: {total_count}")

    # Score threshold used when keywords have to be derived from the abstract.
    score_threshold = 0.90

    for paper_hit in chemarxiv_papers:
        paper = paper_hit.get("item", {})
        title = paper.get('title', 'No Title')
        abstract = paper.get('abstract')
        authors = paper.get('authors', [])
        author_names = ", ".join(
            f"{author.get('firstName', '')} {author.get('lastName', '')}"
            for author in authors
        )

        # Missing or empty keywords fall back to NER on the abstract. The
        # fallback is skipped when there is no abstract to analyse, so the
        # 'N/A' placeholder is never fed to the NLP pipeline.
        keywords = paper.get('keywords') or []
        if not keywords and abstract:
            keywords = abstract_processing(abstract, score_threshold)

        print(f"Title: {title}")
        print(f"Authors: {author_names}")
        print(f"Abstract: {abstract or 'N/A'}")
        print(f"Keywords: {keywords}")
        print(f"DOI: {paper.get('doi', 'N/A')}")
        print(f"Published Date: {paper.get('publishedDate', 'N/A')}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment