Skip to content

Instantly share code, notes, and snippets.

@Sentient07
Created February 3, 2025 19:01
Show Gist options
  • Save Sentient07/4e2b16e3852aa3f6c2a28cc0ec512040 to your computer and use it in GitHub Desktop.
Save Sentient07/4e2b16e3852aa3f6c2a28cc0ec512040 to your computer and use it in GitHub Desktop.
Removes entries with duplicate titles from a BibTeX (.bib) file
import re
# Matches the start of a ``title`` field; the word boundaries keep it from
# matching inside longer field names such as ``booktitle``.
TITLE_FIELD = re.compile(r'\btitle\b\s*=\s*', re.IGNORECASE)


def split_entries(lines):
    """Group the lines of a .bib file into one string per entry.

    A new entry starts at every line whose first non-blank character is
    ``@``. Any text before the first such line is kept as its own leading
    chunk so that nothing from the input is lost.

    Args:
        lines: Iterable of lines, each still carrying its line ending.

    Returns:
        A list of strings, one per chunk, in input order.
    """
    entries = []
    current = []
    for line in lines:
        if line.lstrip().startswith("@") and current:
            entries.append("".join(current))
            current = []
        current.append(line)
    if current:
        entries.append("".join(current))
    return entries


def extract_title(entry):
    """Return the raw value of the ``title`` field of *entry*, or ``None``.

    Handles both ``title = {...}`` and ``title = "..."``. Braces are
    matched by depth, so a title with nested braces such as
    ``{The {B}ayesian Way}`` is returned whole instead of being cut at the
    first closing brace.

    Args:
        entry: Text of a single bibliography entry.

    Returns:
        The text between the field delimiters (delimiters excluded), or
        ``None`` when no well-formed delimited title is found.
    """
    for match in TITLE_FIELD.finditer(entry):
        start = match.end()
        if start >= len(entry) or entry[start] not in '{"':
            continue
        opener = entry[start]
        depth = 0
        for pos in range(start + 1, len(entry)):
            char = entry[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                if depth == 0:
                    if opener == "{":
                        return entry[start + 1:pos]
                    # A stray "}" inside a quoted value: malformed, give up
                    # on this occurrence and try the next one.
                    break
                depth -= 1
            elif char == '"' and opener == '"' and depth == 0:
                return entry[start + 1:pos]
    return None


def dedupe_entries(entries):
    """Drop entries whose title was already seen, keeping the first one.

    Titles are compared after collapsing every run of whitespace to a
    single space, so line-wrapped titles compare equal. Entries without a
    recognisable title are always kept.

    Args:
        entries: Iterable of entry strings.

    Returns:
        A list of the retained entries in their original order.
    """
    seen_titles = set()
    unique_entries = []
    for entry in entries:
        title = extract_title(entry)
        if title is not None:
            norm_title = " ".join(title.split())
            if norm_title in seen_titles:
                continue
            seen_titles.add(norm_title)
        unique_entries.append(entry)
    return unique_entries


def main(source="bibliography.bib", target="merged.bib"):
    """Read *source*, remove duplicate-title entries, and write *target*.

    Args:
        source: Path of the .bib file to read (UTF-8).
        target: Path of the de-duplicated .bib file to write (UTF-8).
    """
    with open(source, "r", encoding="utf-8") as f:
        lines = f.readlines()
    unique_entries = dedupe_entries(split_entries(lines))
    with open(target, "w", encoding="utf-8") as f:
        f.write("\n".join(unique_entries))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment