Python script that calls an OpenAI model to clean up a references file (keeping the original reference if the call fails). Using `gpt-4o-mini` costs about $0.05 for cleaning a .bib file with ~400 references.
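As a rough sanity check on the cost figure above, a back-of-the-envelope estimate (the per-token prices and token counts below are illustrative assumptions, not quoted OpenAI pricing):

```python
# Back-of-the-envelope cost estimate. The per-token prices and token counts
# below are illustrative assumptions, not quoted OpenAI pricing.
n_entries = 400
input_tokens_per_entry = 450    # assumed: prompt template + one bibtex entry
output_tokens_per_entry = 150   # matches the max_tokens used in the script
price_in_per_1m_tokens = 0.15   # assumed USD per 1M input tokens
price_out_per_1m_tokens = 0.60  # assumed USD per 1M output tokens

cost = n_entries * (input_tokens_per_entry * price_in_per_1m_tokens +
                    output_tokens_per_entry * price_out_per_1m_tokens) / 1_000_000
print(f"~${cost:.2f}")  # roughly $0.06, in line with the ~$0.05 figure above
```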
import re
import argparse

from tqdm import tqdm
import openai
from openai import OpenAI

openai.api_key = "YOUR API KEY"
client = OpenAI(api_key=openai.api_key)

MODEL_ID = "gpt-4o-mini"
CLEANING_PROMPT = """
Here is a bibtex entry:
```
{reference}
```

Clean it by following these guidelines:
- Every field should be on a new line and indented by two spaces
- Always follow the order of fields: author, title, year, journal, followed by any other fields needed
- If it's a book, keep the publisher's information
- If it's an article or in proceedings but not a book, remove unnecessary information such as issue numbers, volumes, months, publisher, ISSN, ISBN, or article URLs, and any empty fields
- If it's a website, keep the URL
- If it's on arXiv, make sure the type is article and use the journal name as "arXiv pre-print arxiv:{{reference}}"
- Make sure the full name of the journal is mentioned and remove any abbreviation, edition (e.g., "Forty-fourth"), or "Proceedings of the" from the name. For example, NeurIPS should be replaced with Advances in Neural Information Processing Systems, ICML with International Conference on Machine Learning, and ICLR with International Conference on Learning Representations
- Make sure that nouns (e.g. "ChatGPT" or "Go") are put in {{}} to remain capitalized

Do not provide any other text in the response, only the cleaned bibtex entry.
"""

def load_bib_file(file_path: str) -> str:
    """Load the .bib file and return its contents as a string."""
    with open(file_path, 'r') as file:
        return file.read()


def split_bib_entries(bib_content: str) -> list:
    """Split the .bib content into individual entries, ignoring comments."""
    # Remove comments that start with % and any leading/trailing whitespace
    cleaned_content = re.sub(r'%.*', '', bib_content).strip()
    # Use regex to split based on @article, @inproceedings, etc.
    entries = re.split(r'(?=@\w+)', cleaned_content)
    return [entry.strip() for entry in entries if entry.strip()]

def prepare_for_api_call(entries: list) -> list:
    """Prepare the bibliography entries for OpenAI API call."""
    prepared_entries = []
    for entry in entries:
        # Format each entry for the API request
        prepared_entry = {
            "model": MODEL_ID,  # Change to the desired model
            "messages": [{
                "role": "user",
                "content": CLEANING_PROMPT.format(reference=entry)
            }],
            "temperature": 0.9,  # Adjust as necessary
            "max_tokens": 150,  # Adjust based on expected output
            "entry": entry
        }
        prepared_entries.append(prepared_entry)
    return prepared_entries

def call_openai_api(prepared_entries: list, verbose: bool = True):
    """Send requests to OpenAI API and retrieve completions."""
    cleaned_entries = []
    for entry in tqdm(prepared_entries):
        try:
            # Make the API call
            response = client.chat.completions.create(
                model=entry['model'],
                messages=entry['messages'],
                temperature=entry['temperature'],
                max_tokens=entry['max_tokens']
            )
            # Extract the cleaned response from the API result
            cleaned_text = response.choices[0].message.content
            # Remove the ``` fences and an optional leading "bibtex" language tag
            cleaned_text = cleaned_text.strip().strip('`').strip()
            if cleaned_text.lower().startswith('bibtex'):
                cleaned_text = cleaned_text[len('bibtex'):].strip()
            extracted_entry = split_bib_entries(cleaned_text)
            if len(extracted_entry) != 1:
                # Fall back to the original entry if the model returned
                # nothing usable or more than one entry
                cleaned_entries.append(entry["entry"])
            else:
                cleaned_entries.append(extracted_entry[0])
        except Exception as e:
            print(f"Error calling OpenAI API: {e}")
            cleaned_entries.append(entry["entry"])
        if verbose:
            print("Original entry:", entry["entry"])
            print("Cleaned entry:", cleaned_entries[-1])
    return cleaned_entries

def main():
    parser = argparse.ArgumentParser(description='Clean bibliography entries using gpt-4o-mini.')
    parser.add_argument(
        '--input-file',
        type=str,
        help='Path to the original .bib file',
        required=True
    )
    parser.add_argument(
        '--output-file',
        type=str,
        default='cleaned_refs.bib',
        help='Path to save the cleaned .bib file'
    )
    args = parser.parse_args()

    # Load the .bib file and split it into individual entries
    bib_content = load_bib_file(args.input_file)
    entries = split_bib_entries(bib_content)

    # Clean each entry via the OpenAI API
    prepared_entries = prepare_for_api_call(entries)
    cleaned_entries = call_openai_api(prepared_entries)

    # Save the cleaned entries to a new file
    with open(args.output_file, 'w') as file:
        for entry in cleaned_entries:
            file.write(entry + '\n\n')


if __name__ == "__main__":
    main()
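To run the script on a bibliography, save the gist locally (the filename `clean_bib.py` below is an assumption) and invoke it as `python clean_bib.py --input-file refs.bib --output-file cleaned_refs.bib`. Before spending tokens on a full run, a quick dry run of the splitting step can confirm that entries are segmented as expected; a minimal sketch, assuming the script is importable as `clean_bib` and using made-up sample entries:

```python
# Minimal sketch (no API calls): check that split_bib_entries() segments a
# small .bib string correctly. The module name `clean_bib` and the sample
# entries below are assumptions for illustration.
from clean_bib import split_bib_entries

sample_bib = """
% a comment line that should be stripped
@article{doe2023example,
  author = {Doe, Jane},
  title = {An Example Entry},
  year = {2023},
}

@inproceedings{roe2022sample,
  author = {Roe, Richard},
  title = {A Sample Conference Paper},
  year = {2022},
}
"""

entries = split_bib_entries(sample_bib)
print(len(entries))                # expected: 2
print(entries[0].splitlines()[0])  # expected: @article{doe2023example,
```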