Skip to content

Instantly share code, notes, and snippets.

@victormurcia
Last active August 4, 2023 13:48
Show Gist options
  • Save victormurcia/22a5aabd07892fee06529ec4c55d3516 to your computer and use it in GitHub Desktop.
Save victormurcia/22a5aabd07892fee06529ec4c55d3516 to your computer and use it in GitHub Desktop.
Get information for a given search term using PubMed
from Bio import Entrez
from Bio.Medline import parse
from io import StringIO
import pandas as pd
def fetch_pubmed_data(search_term, email, retmax=100):
    """
    Fetch PubMed entries matching a search term and return them as a DataFrame.

    The function runs an ``esearch`` query to obtain the matching PMIDs, then
    an ``efetch`` request for those PMIDs in MEDLINE text format, and finally
    converts every parsed MEDLINE record into one DataFrame row.

    Parameters:
        search_term (str): The term to search for in the PubMed database.
        email (str): The email address to be used for accessing PubMed's API.
            It is assigned to the module-wide ``Entrez.email`` setting, so it
            stays in effect for later Entrez calls as well.
        retmax (int, optional): The maximum number of results to retrieve.
            Defaults to 100.

    Returns:
        pandas.DataFrame: One row per PubMed entry with the columns PMID,
        Title, Authors, Abstract, Publication Date, Journal, Volume, Issue,
        Pages, Affiliation, Article ID, E-Publication Date, Place of
        Publication, Journal Abbreviation, Language, Publication Type and
        MeSH Terms. Fields missing from a record are filled with "N/A".
        When the search matches nothing, an empty DataFrame with the same
        columns is returned.
    """
    columns = [
        "PMID",
        "Title",
        "Authors",
        "Abstract",
        "Publication Date",
        "Journal",
        "Volume",
        "Issue",
        "Pages",
        "Affiliation",
        "Article ID",
        "E-Publication Date",
        "Place of Publication",
        "Journal Abbreviation",
        "Language",
        "Publication Type",
        "MeSH Terms",
    ]

    Entrez.email = email

    # Step 1: look up the PMIDs that match the search term.
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=retmax)
    try:
        search_result = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    idlist = search_result["IdList"]

    # Nothing matched: skip the efetch request (it would be sent without any
    # IDs) and hand back an empty frame that still has the expected columns.
    if not idlist:
        return pd.DataFrame(columns=columns)

    # Step 2: download the matching records as MEDLINE-formatted text.
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    try:
        medline_text = handle.read()
    finally:
        handle.close()

    # Step 3: turn every MEDLINE record into a row. The rows are collected in
    # a list and the DataFrame is built once at the end; concatenating onto a
    # DataFrame inside the loop copies all previous rows on every iteration.
    rows = []
    for entry in parse(StringIO(medline_text)):
        rows.append(
            {
                "PMID": entry.get("PMID", "N/A"),
                "Title": entry.get("TI", "N/A"),
                # Multi-valued fields are flattened to a comma-separated string.
                "Authors": ", ".join(entry.get("AU", ["N/A"])),
                "Abstract": entry.get("AB", "N/A"),
                "Publication Date": entry.get("DP", "N/A"),
                "Journal": entry.get("JT", "N/A"),
                "Volume": entry.get("VI", "N/A"),
                "Issue": entry.get("IP", "N/A"),
                "Pages": entry.get("PG", "N/A"),
                # NOTE(review): stored exactly as the parser returns it; some
                # Biopython versions may yield a list here -- confirm.
                "Affiliation": entry.get("AD", "N/A"),
                "Article ID": ", ".join(entry.get("AID", ["N/A"])),
                "E-Publication Date": entry.get("DEP", "N/A"),
                "Place of Publication": entry.get("PL", "N/A"),
                "Journal Abbreviation": entry.get("TA", "N/A"),
                "Language": ", ".join(entry.get("LA", ["N/A"])),
                "Publication Type": ", ".join(entry.get("PT", ["N/A"])),
                "MeSH Terms": ", ".join(entry.get("MH", ["N/A"])),
            }
        )

    # Passing ``columns`` fixes the column order and keeps the columns present
    # even if no record could be parsed.
    return pd.DataFrame(rows, columns=columns)
# Example usage.
# Guarded so that importing this module does not trigger a network request;
# the example only runs when the file is executed as a script.
if __name__ == "__main__":
    search_term = "Chronic Inflammatory Demyelinating Polyneuropathy (CIDP)"
    # Placeholder address -- replace it with your own contact email before
    # running the example.
    email = "your.email@example.com"
    df = fetch_pubmed_data(search_term, email)
    print(df.head())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment