Assign keywords to papers through named entity recognition
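# Setup sketch (the version pin and release URL below are assumptions; check
# the scispacy releases page for the current model download link):
#   pip install requests spacy scispacy
#   pip install https://s3-us-west-2.amazonaws.com/ai2-public-datasets/scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz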
from datetime import datetime, timedelta
import requests
import spacy
import scispacy
from scispacy.linking import EntityLinker

# Load the NLP model. Smaller, therefore faster, but potentially at the expense of accuracy
nlp = spacy.load("en_core_sci_sm")

# Add the EntityLinker - map entities to those found in MeSH.
nlp.add_pipe("scispacy_linker", config={
    "resolve_abbreviations": True,
    "linker_name": "mesh",
    "max_entities_per_mention": 2  # Speeds things up
})
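
# With the linker attached, each recognised entity exposes its candidate MeSH
# matches via `ent._.kb_ents`: a list of (concept_id, similarity_score) tuples,
# best match first, capped here at two per mention. The linker's knowledge base
# maps a concept ID to its full record, e.g.
#   linker.kb.cui_to_entity[concept_id].canonical_name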
def abstract_processing(abstract, score_threshold=0.90):
    """
    Extract MeSH-linked entities from an abstract.

    Parameters:
    abstract (str): The abstract from the paper.
    score_threshold (float): The similarity score an entity must exceed to make the shortlist of keywords.

    Returns:
    keywords (list): A list of the top 5 MeSH entities associated with the abstract.
    """
    doc = nlp(abstract)
    linker = nlp.get_pipe("scispacy_linker")
    possible_keywords = []
    ranked_keywords = []
    seen_ids = set()
    for ent in doc.ents:
        for cui, score in ent._.kb_ents:
            if score > score_threshold and cui not in seen_ids:
                entity = linker.kb.cui_to_entity[cui]
                possible_keywords.append((entity.canonical_name, score))
                seen_ids.add(cui)
    # print(possible_keywords)  # Uncomment to inspect all candidate keywords
    # Sort found entities by score (highest first), then keep the top 5
    for name, score in sorted(possible_keywords, key=lambda x: x[1], reverse=True):
        ranked_keywords.append(name)
        if len(ranked_keywords) == 5:
            break
    return ranked_keywords
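
# Example usage (the abstract text is illustrative; the keywords returned
# depend on the installed model and MeSH knowledge base, so none are shown):
#   kws = abstract_processing(
#       "Gold nanoparticles acquire a protein corona upon exposure to serum.",
#       score_threshold=0.90,
#   )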
def get_chemarxiv_papers(term, search_from, limit):
    """
    Fetch papers from ChemRxiv based on a query.

    Parameters:
    term (str): The search term.
    search_from (str): Start date for the search (YYYY-MM-DD).
    limit (int): Maximum number of results to fetch.

    Returns:
    list: A list of papers matching the query, or an empty list on error.
    """
    # Define endpoint and parameters
    base_url = "https://chemrxiv.org/engage/chemrxiv/public-api"
    endpoint = "/v1/items"
    url = f"{base_url}{endpoint}"
    parameters = {
        "term": term,
        "searchDateFrom": search_from,
        "limit": limit
    }
    try:
        # Make the API request
        response = requests.get(url, params=parameters)
        response.raise_for_status()  # Raise an error for bad status codes
        # Parse the JSON response
        data = response.json()
        # print(data)  # Uncomment to inspect the raw response
        return data.get("itemHits", [])
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.RequestException as err:
        print(f"Other error occurred: {err}")
    return []  # On any request error, return an empty list so callers can still iterate
if __name__ == "__main__":
    # Define parameters for the ChemRxiv search query
    term = "'protein corona'"
    search_from = (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d")
    limit = 5
    # Fetch papers
    chemarxiv_papers = get_chemarxiv_papers(term, search_from, limit)
    total_count = len(chemarxiv_papers)  # Total number of papers retrieved
    print(f"Total number of papers retrieved from ChemRxiv: {total_count}")
    # Define the score threshold used when NER parses keywords from the abstract
    score_threshold = 0.90
    for paper_hit in chemarxiv_papers:
        paper = paper_hit.get("item", {})
        title = paper.get('title', 'No Title')
        abstract = paper.get('abstract', 'N/A')
        authors = paper.get('authors', [])
        author_names = ", ".join(
            [f"{author.get('firstName', '')} {author.get('lastName', '')}" for author in authors]
        )
        keywords = paper.get('keywords', [])
        # Fall back to NER-derived keywords when the paper supplies none
        if not keywords:
            keywords = abstract_processing(abstract, score_threshold)
        print(f"Title: {title}")
        print(f"Authors: {author_names}")
        print(f"Abstract: {abstract}")
        print(f"Keywords: {keywords}")
        print(f"DOI: {paper.get('doi', 'N/A')}")
        print(f"Published Date: {paper.get('publishedDate', 'N/A')}")