Python script that calls an OpenAI model to clean up a references file (keeping the original reference if the call fails). Using `gpt-4o-mini` costs about $0.05 for cleaning a .bib file with ~400 references.
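As a rough sanity check on the cost figure above, a back-of-the-envelope estimate (the per-token prices and token counts below are illustrative assumptions, not quoted OpenAI pricing):

```python
# Back-of-the-envelope cost estimate. The per-token prices and token counts
# below are illustrative assumptions, not quoted OpenAI pricing.
n_entries = 400
input_tokens_per_entry = 450    # assumed: prompt template + one bibtex entry
output_tokens_per_entry = 150   # matches the max_tokens used in the script
price_in_per_1m_tokens = 0.15   # assumed USD per 1M input tokens
price_out_per_1m_tokens = 0.60  # assumed USD per 1M output tokens

cost = n_entries * (input_tokens_per_entry * price_in_per_1m_tokens +
                    output_tokens_per_entry * price_out_per_1m_tokens) / 1_000_000
print(f"~${cost:.2f}")  # roughly $0.06, in line with the ~$0.05 figure above
```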
import re
import argparse

from tqdm import tqdm
import openai
from openai import OpenAI

openai.api_key = "YOUR API KEY"
client = OpenAI(api_key=openai.api_key)

MODEL_ID = "gpt-4o-mini"
CLEANING_PROMPT = """
Here is a bibtex entry:
```
{reference}
```

Clean it by following these guidelines:
- Every field should be on a new line and indented by two spaces
- Always follow the order of fields: author, title, year, journal, followed by any other fields needed
- If it's a book, keep the publisher's information
- If it's an article or in proceedings but not a book, remove unnecessary information such as issue numbers, volumes, months, publisher, ISSN, ISBN, or article URLs, and any empty fields
- If it's a website, keep the URL
- If it's on arXiv, make sure the type is article and use the journal name as "arXiv pre-print arxiv:{{reference}}"
- Make sure the full name of the journal is mentioned and remove any abbreviation, edition (e.g., "Forty-fourth"), or "Proceedings of the" from the name. For example, NeurIPS should be replaced with Advances in Neural Information Processing Systems, ICML with International Conference on Machine Learning, and ICLR with International Conference on Learning Representations
- Make sure that nouns (e.g. "ChatGPT" or "Go") are put in {{}} to remain capitalized

Do not provide any other text in the response, only the cleaned bibtex entry.
"""

def load_bib_file(file_path: str) -> str:
    """Load the .bib file and return its contents as a string."""
    with open(file_path, 'r') as file:
        return file.read()


def split_bib_entries(bib_content: str) -> list:
    """Split the .bib content into individual entries, ignoring comments."""
    # Remove comments that start with % and any leading/trailing whitespace
    cleaned_content = re.sub(r'%.*', '', bib_content).strip()
    # Use regex to split based on @article, @inproceedings, etc.
    entries = re.split(r'(?=@\w+)', cleaned_content)
    return [entry.strip() for entry in entries if entry.strip()]

def prepare_for_api_call(entries: list) -> list:
    """Prepare the bibliography entries for OpenAI API call."""
    prepared_entries = []
    for entry in entries:
        # Format each entry for the API request
        prepared_entry = {
            "model": MODEL_ID,  # Change to the desired model
            "messages": [{
                "role": "user",
                "content": CLEANING_PROMPT.format(reference=entry)
            }],
            "temperature": 0.9,  # Adjust as necessary
            "max_tokens": 150,  # Adjust based on expected output
            "entry": entry
        }
        prepared_entries.append(prepared_entry)
    return prepared_entries

def call_openai_api(prepared_entries: list, verbose: bool = True):
    """Send requests to OpenAI API and retrieve completions."""
    cleaned_entries = []
    for entry in tqdm(prepared_entries):
        try:
            # Make the API call
            response = client.chat.completions.create(
                model=entry['model'],
                messages=entry['messages'],
                temperature=entry['temperature'],
                max_tokens=entry['max_tokens']
            )
            # Extract the cleaned response from the API result
            cleaned_text = response.choices[0].message.content
            # Remove the ``` fences and an optional leading "bibtex" language tag
            cleaned_text = cleaned_text.strip().strip('`').strip()
            if cleaned_text.lower().startswith('bibtex'):
                cleaned_text = cleaned_text[len('bibtex'):].strip()
            extracted_entry = split_bib_entries(cleaned_text)
            if len(extracted_entry) != 1:
                # Fall back to the original entry if the model returned
                # nothing usable or more than one entry
                cleaned_entries.append(entry["entry"])
            else:
                cleaned_entries.append(extracted_entry[0])
        except Exception as e:
            print(f"Error calling OpenAI API: {e}")
            cleaned_entries.append(entry["entry"])
        if verbose:
            print("Original entry:", entry["entry"])
            print("Cleaned entry:", cleaned_entries[-1])
    return cleaned_entries

def main():
    parser = argparse.ArgumentParser(description='Clean bibliography entries using gpt-4o-mini.')
    parser.add_argument(
        '--input-file',
        type=str,
        help='Path to the original .bib file',
        required=True
    )
    parser.add_argument(
        '--output-file',
        type=str,
        default='cleaned_refs.bib',
        help='Path to save the cleaned .bib file'
    )
    args = parser.parse_args()

    # Load the .bib file and split it into individual entries
    bib_content = load_bib_file(args.input_file)
    entries = split_bib_entries(bib_content)

    # Clean each entry via the OpenAI API
    prepared_entries = prepare_for_api_call(entries)
    cleaned_entries = call_openai_api(prepared_entries)

    # Save the cleaned entries to a new file
    with open(args.output_file, 'w') as file:
        for entry in cleaned_entries:
            file.write(entry + '\n\n')


if __name__ == "__main__":
    main()
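To run the script on a bibliography, save the gist locally (the filename `clean_bib.py` below is an assumption) and invoke it as `python clean_bib.py --input-file refs.bib --output-file cleaned_refs.bib`. Before spending tokens on a full run, a quick dry run of the splitting step can confirm that entries are segmented as expected; a minimal sketch, assuming the script is importable as `clean_bib` and using made-up sample entries:

```python
# Minimal sketch (no API calls): check that split_bib_entries() segments a
# small .bib string correctly. The module name `clean_bib` and the sample
# entries below are assumptions for illustration.
from clean_bib import split_bib_entries

sample_bib = """
% a comment line that should be stripped
@article{doe2023example,
  author = {Doe, Jane},
  title = {An Example Entry},
  year = {2023},
}

@inproceedings{roe2022sample,
  author = {Roe, Richard},
  title = {A Sample Conference Paper},
  year = {2022},
}
"""

entries = split_bib_entries(sample_bib)
print(len(entries))                # expected: 2
print(entries[0].splitlines()[0])  # expected: @article{doe2023example,
```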