Created
September 26, 2023 02:24
-
-
Save corneliusroemer/0a8fd4417ca61c479907b4601d550069 to your computer and use it in GitHub Desktop.
Automatically generate faculty profiles using PubMed abstracts and GPT-4, filtering papers for relevance based on author position.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
from math import e | |
from Bio import Entrez | |
import openai | |
def search_latest_papers(faculty_name, source="Pubmed", num_papers=5):
    """Fetch abstracts and author names for papers by the given author.

    Runs a PubMed author search for ``faculty_name`` and then fetches each
    hit individually. Records without an abstract or an author list are
    skipped.

    :param faculty_name: Name used in the ``[Author]`` search term.
    :param source: Only ``"Pubmed"`` is handled; any other value yields ``[]``.
    :param num_papers: Maximum number of search hits to request (``retmax``).
    :return: List of dicts with keys ``'abstract'`` (all abstract sections
        joined into one string) and ``'authors'`` (names in byline order).
    """
    papers_info = []
    if source != "Pubmed":
        return papers_info

    # NOTE(review): this address looks like a redacted placeholder; NCBI
    # expects a real contact e-mail here. Replace before running.
    Entrez.email = "[email protected]"
    # NOTE(review): despite the function name, hits are ordered by relevance,
    # not by date. Confirm whether that is intended.
    handle = Entrez.esearch(
        db="pubmed",
        term=f"{faculty_name}[Author]",
        retmax=num_papers,
        sort="relevance",
        retmode="xml",
    )
    try:
        results = Entrez.read(handle)
    finally:
        # Close the connection even if parsing the response fails.
        handle.close()

    for paper_id in results["IdList"]:
        handle = Entrez.efetch(db="pubmed", id=paper_id, retmode="xml")
        try:
            papers = Entrez.read(handle)
        finally:
            handle.close()

        try:
            article = papers["PubmedArticle"][0]["MedlineCitation"]["Article"]
            sections = article["Abstract"]["AbstractText"]
            authors = article["AuthorList"]
        except (IndexError, KeyError):
            # Not a journal article, or no abstract / author list: skip it.
            continue
        if not sections:
            continue

        # Structured abstracts arrive as several sections; keep all of them
        # rather than only the first.
        abstract = " ".join(str(section) for section in sections)

        # Group/consortium authors have no LastName. Fall back to their
        # CollectiveName (or an empty string) so byline positions are
        # preserved and one such entry does not discard the whole paper.
        author_names = [
            str(author.get("LastName") or author.get("CollectiveName", ""))
            for author in authors
        ]
        papers_info.append({'abstract': abstract, 'authors': author_names})

    return papers_info
def generate_summary(abstracts, name, api_key, model):
    """Ask an OpenAI model to write a short faculty profile from abstracts.

    :param abstracts: Abstract texts; interpolated into the prompt as-is.
    :param name: Full name of the faculty member the profile is about.
    :param api_key: OpenAI API key. When falsy, whatever key is already
        configured on the ``openai`` module is left untouched.
    :param model: Model name. ``"gpt-3.5-turbo-instruct"`` is sent to the
        completions endpoint; any other value is sent to the chat
        completions endpoint under that name.
    :return: The raw API response object (not the extracted text). The text
        lives under ``choices[0]["text"]`` for the instruct model and under
        ``choices[0]["message"]["content"]`` for chat models.
    """
    # Only override the module-level key when one was actually supplied, so a
    # key configured elsewhere is not replaced by None.
    if api_key:
        openai.api_key = api_key

    if model == "gpt-3.5-turbo-instruct":
        return openai.Completion.create(
            prompt=f"You are the head of a university department. Please produce a faculty profile of around 150 words for {name}. It shouldn't be too technical. Useful for general educated audience. These are some recent abstracts {abstracts}. Only use these abstracts for background. It's not important what the exact research is. More about the field and general topics. Make sure you don't assume someone is a Dr. or Professor. And don't be too positive. Understatement is better, be very conservative in calling someone 'prominent'. If a paper doesn't fit the others, assume it's a different researcher and ignore. Please start now:",
            model="gpt-3.5-turbo-instruct",
            max_tokens=500,
        )

    system_message = {
        "role": "system",
        "content": "You are the head of a university department. If you don't know something, don't guess. Just say you don't know. You tend to be conservative, you don't exaggerate. You are not a salesman. You write succinctly.",
    }
    user_message = {
        "role": "user",
        "content": f"Please produce a faculty profile of around 100 words for {name}. It shouldn't be too technical. Useful for general educated audience. These are some recent abstracts {abstracts}. Only use these abstracts for background. It's not important what the exact research is. More about the field and general topics. Make sure you don't assume someone is a Dr. or Professor. Just refer to them by name. And don't be too positive. Understatement is better, be very conservative in calling someone 'prominent'. If a paper doesn't fit the others, assume it's a different researcher and ignore. Don't go into too much detail as research areas often change. Everyone knows that this is a faculty profile, definitely do no say things like '{name} is a faculty member'. Start with '{name} specializes in' or '{name} is an expert in'.",
    }
    # Honour the requested model instead of always using a hard-coded one.
    return openai.ChatCompletion.create(
        model=model,
        messages=[system_message, user_message],
        max_tokens=500,
    )
def _is_relevant_author(authors, last_name, n_first, n_last):
    """Return True if last_name is among the first n_first or last n_last authors."""
    # Guard the counts explicitly: authors[-0:] would select the whole list.
    leading = authors[:n_first] if n_first > 0 else []
    trailing = authors[-n_last:] if n_last > 0 else []
    return last_name in leading or last_name in trailing


def _completion_text(completion):
    """Pull the generated text out of either a chat or a plain completion response."""
    choice = completion["choices"][0]
    if "message" in choice:
        return choice["message"]["content"]
    return choice["text"]


def main():
    """Command-line entry point: fetch abstracts, filter them, print a profile.

    Parses the command line, retrieves papers for the named faculty member,
    keeps only those where the faculty member's last name appears among the
    leading or trailing author positions, and prints the model-generated
    profile. Exits with status 1 if no abstract passes the filter.
    """
    parser = argparse.ArgumentParser(
        description="Fetch the latest papers' abstracts for a faculty from PubMed."
    )
    parser.add_argument("first", help="First Name of the faculty member.")
    parser.add_argument("last", help="Last Name of the faculty member.")
    parser.add_argument(
        "--source",
        choices=["Pubmed"],
        default="Pubmed",
        help="Source to fetch papers from. Currently supports only 'Pubmed'.",
    )
    parser.add_argument(
        "--num-papers",
        type=int,
        default=5,
        help="Number of papers to retrieve. Default is 5.",
    )
    parser.add_argument("--api-key", help="OpenAI API key.")
    parser.add_argument("--verbose", action="store_true", help="Print verbose output.")
    parser.add_argument("--model", help="OpenAI model to use.", default="gpt-3.5-turbo")
    # type=int is required: these values are used as slice bounds, and a
    # user-supplied value would otherwise arrive as a string.
    parser.add_argument(
        "--relevant-first",
        type=int,
        default=2,
        help="Number of first author positions that are relevant.",
    )
    parser.add_argument(
        "--relevant-last",
        type=int,
        default=2,
        help="Number of last author positions that are relevant.",
    )
    args = parser.parse_args()

    full_name = f"{args.first} {args.last}"
    papers = search_latest_papers(full_name, args.source, args.num_papers)

    # Keep only abstracts where the faculty member holds a leading or
    # trailing author position.
    relevant_abstracts = []
    for paper in papers:
        keep = _is_relevant_author(
            paper['authors'], args.last, args.relevant_first, args.relevant_last
        )
        if keep:
            relevant_abstracts.append(paper['abstract'])
        if args.verbose:
            label = "Relevant" if keep else "Ignoring"
            print(f"{label} abstract: {paper['abstract']}\n")

    if not relevant_abstracts:
        # Without any source material the model could only guess; stop here
        # rather than spend an API call on an ungrounded profile.
        parser.exit(1, f"No relevant abstracts found for {full_name}.\n")

    summary = generate_summary(
        relevant_abstracts, full_name, args.api_key, model=args.model
    )
    # The response shape differs between chat and plain completion models.
    print(_completion_text(summary))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment