Python script that calls an OpenAI model to clean up a .bib references file, keeping the original entry whenever cleaning fails. Using `gpt-4o-mini`, cleaning a .bib file with ~400 references costs about $0.05.
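To use it, save the script locally (the filename `clean_bib.py` used here is just an example), replace `YOUR API KEY` with a valid OpenAI API key, and run `python clean_bib.py --input-file references.bib --output-file cleaned_refs.bib`; `--output-file` defaults to `cleaned_refs.bib` if omitted.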
import re
import argparse
from tqdm import tqdm
import openai
from openai import OpenAI
openai.api_key = "YOUR API KEY"
client = OpenAI(api_key=openai.api_key)
MODEL_ID = "gpt-4o-mini"
CLEANING_PROMPT = """
Here is a bibtex entry:
```
{reference}
```
Clean it by following these guidelines:
- Every field should be on a new line and indented by two spaces
- Always follow the order of fields: author, title, year, journal, followed by any other needed
- If it's a book, keep the publisher's information
- If it's an article or in proceedings but not a book, remove unnecessary information such as issue numbers, volumes, months, publisher, ISSN, ISBN, or article URLs, and any empty fields
- If it's a website, keep the URL
- If it's on arxiv, make sure the type is article and use the journal name as "arXiv pre-print arxiv:{{reference}}"
- Make sure the full name of the journal is mentioned and remove any abbreviation, edition (e.g., "Forty-fourth"), or "Proceedings of the" from the name. For example, NeurIPS should be replaced with Advances in Neural Information Processing Systems, ICML with International Conference on Machine Learning and ICLR with International Conference on Learning Representations
- Make sure that nouns (e.g. "ChatGPT" or "Go") are put in {{}} to remain capitalized
Do not provide any other text in the response, only the cleaned bibtex entry.
"""


def load_bib_file(file_path: str) -> str:
    """Load the .bib file and return its contents as a string."""
    with open(file_path, 'r') as file:
        return file.read()


def split_bib_entries(bib_content: str) -> list:
    """Split the .bib content into individual entries, ignoring comments."""
    # Remove comments that start with % and any leading/trailing whitespace
    cleaned_content = re.sub(r'%.*', '', bib_content).strip()
    # Use regex to split based on @article, @inproceedings, etc.
    entries = re.split(r'(?=@\w+)', cleaned_content)
    return [entry.strip() for entry in entries if entry.strip()]


def prepare_for_api_call(entries: list) -> list:
    """Prepare the bibliography entries for OpenAI API call."""
    prepared_entries = []
    for entry in entries:
        # Format each entry for the API request
        prepared_entry = {
            "model": MODEL_ID,  # Change to the desired model
            "messages": [{
                "role": "user",
                "content": CLEANING_PROMPT.format(reference=entry)
            }],
            "temperature": 0.9,  # Adjust as necessary
            "max_tokens": 150,  # Adjust based on expected output
            "entry": entry
        }
        prepared_entries.append(prepared_entry)
    return prepared_entries


def call_openai_api(prepared_entries: list, verbose: bool = True):
    """Send requests to OpenAI API and retrieve completions."""
    cleaned_entries = []
    for entry in tqdm(prepared_entries):
        try:
            # Make the API call
            response = client.chat.completions.create(
                model=entry['model'],
                messages=entry['messages'],
                temperature=entry['temperature'],
                max_tokens=entry['max_tokens']
            )
            # Extract the cleaned response from the API result
            cleaned_text = response.choices[0].message.content
            # Strip the markdown code fences (``` and the optional "bibtex" tag) from the response
            cleaned_text = cleaned_text.strip('`').strip('bibtex').strip()
            extracted_entry = split_bib_entries(cleaned_text)
            if len(extracted_entry) != 1:
                # Fall back to the original entry if the response is empty or malformed
                cleaned_entries.append(entry["entry"])
            else:
                cleaned_entries.append(extracted_entry[0])
        except Exception as e:
            print(f"Error calling OpenAI API: {e}")
            cleaned_entries.append(entry["entry"])
        if verbose:
            print("Original entry:", entry["entry"])
            print("Cleaned entry:", cleaned_entries[-1])
    return cleaned_entries


def main():
    parser = argparse.ArgumentParser(description='Clean bibliography entries using gpt-4o-mini.')
    parser.add_argument(
        '--input-file',
        type=str,
        help='Path to the original .bib file',
        required=True
    )
    parser.add_argument(
        '--output-file',
        type=str,
        default='cleaned_refs.bib',
        help='Path to save the cleaned .bib file'
    )
    args = parser.parse_args()

    # Load the .bib file and split it into entries
    bib_content = load_bib_file(args.input_file)
    entries = split_bib_entries(bib_content)
    prepared_entries = prepare_for_api_call(entries)
    cleaned_entries = call_openai_api(prepared_entries)

    # Save the cleaned entries to a new file
    with open(args.output_file, 'w') as file:
        for entry in cleaned_entries:
            file.write(entry + '\n\n')


if __name__ == "__main__":
    main()
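Since each API call costs money, it can help to dry-run the splitting step before cleaning a real file. The snippet below is a minimal sketch (the two entries in `sample` are made up for illustration) that checks `split_bib_entries` drops `%` comments and returns one string per entry; it assumes it is run in the same file, or in a REPL where the function is defined.

```
# Minimal sketch: sanity-check split_bib_entries on a made-up sample before calling the API
sample = """% references used only for this illustration
@article{doe2024example,
  author = {Jane Doe},
  title = {An {Example} Article},
  year = {2024}
}
@book{roe2020sample,
  author = {Richard Roe},
  title = {A Sample Book},
  year = {2020},
  publisher = {Acme Press}
}
"""

entries = split_bib_entries(sample)
print(len(entries))                # expected: 2
print(entries[0].splitlines()[0])  # expected: @article{doe2024example,
```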