A gist to rename Markdown files from their Zotero PDF filenames to their citekey (and embed some metadata)
import bibtexparser
import pandas as pd
from pathlib import Path
import os
import re
from datetime import datetime
import yaml
def load_bibtex(bib_path):
    """Load bibtex file and convert to DataFrame."""
    with open(bib_path, 'r', encoding='utf-8') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    # Convert to DataFrame
    df_bib = pd.DataFrame(bib_database.entries)
    # Set ID as index
    df_bib.set_index('ID', inplace=True)
    return df_bib
def clean_author_names(author_name_string):
    """Convert author names from 'Last, First' format to 'First Last' format.

    Args:
        author_name_string (str): String of author names in format 'Last, First; Last, First'

    Returns:
        str: Author names in format 'First Last, First Last'
    """
    if not isinstance(author_name_string, str):
        print("bad author name string", author_name_string)
        return ""
    # Split multiple authors
    authors = author_name_string.split(';')
    # Process each author
    cleaned_names = []
    for author in authors:
        # Split into last and first name
        name_parts = author.strip().split(',')
        if len(name_parts) == 2:
            last_name = name_parts[0].strip()
            first_name = name_parts[1].strip()
            cleaned_names.append(f"{first_name} {last_name}")
    # Join all authors with commas
    return ', '.join(cleaned_names)
def create_merged_dataset(bib_path='library.bib', csv_path='library.csv'):
    """Create a merged dataset from bibtex and csv files."""
    # Load bibtex
    df_bib = load_bibtex(bib_path)
    df_bib['citekey'] = df_bib.index
    # Load CSV
    df_csv = pd.read_csv(csv_path)
    # Extract just the filename from the full paths
    df_bib['filename'] = df_bib['file'].str.extract(r'/([^/]+)$')
    df_csv['filename'] = df_csv['File Attachments'].str.extract(r'/([^/]+)$')
    # Drop rows with no filename
    df_bib = df_bib[df_bib['filename'].notna()]
    df_csv = df_csv[df_csv['filename'].notna()]
    # Merge on the extracted filenames
    merged_data = df_bib.merge(df_csv, on='filename', how='inner')
    clean_data = []
    for _, entry in merged_data.iterrows():
        entry_data = {
            'citekey': entry['citekey'],
            'title': entry['title'].replace('{', '').replace('}', ''),
            'year': entry['year'],
            'authors_raw': entry['Author'],
            'authors': clean_author_names(entry['Author']),
            'type': entry['Item Type'],
            'abstract': entry['abstract'],
            'file': entry['file'],
            'downloadURI': f"zotero://select/library/items/{entry['Key']}",
            'tags': ', '.join(entry['Manual Tags'].split(';')) if pd.notna(entry['Manual Tags']) else ''
        }
        clean_data.append(entry_data)
    return pd.DataFrame(clean_data)
def create_markdown_content(entry_data):
    """Create markdown content from template."""
    frontmatter = {
        'year': entry_data['year'],
        'authors': entry_data['authors_raw'],
        'type': entry_data['type']
    }
    # Create the content
    content = f"""---
{yaml.dump(frontmatter, sort_keys=False)}---
## {entry_data['title']}
{entry_data['authors']}
{entry_data['year']}
[Open in Zotero]({entry_data['downloadURI']})
> [!info]
> - **Abstract:** {entry_data['abstract']}
"""
    if entry_data['tags']:
        content += f"\n> - **Tags:** {entry_data['tags']}"
    content += """
>[!Personal Notes]
>
## Annotations
"""
    return content
def process_files(merged_data, limit=None):
    """Main function to process files.

    Args:
        merged_data: DataFrame of entries to process
        limit: Optional int, maximum number of entries to process
    """
    processed = 0
    # Process each entry
    for _, entry_data in merged_data.iterrows():
        if limit and processed >= limit:
            print("hit the limit")
            break
        # Extract original filename from file field
        original_filename = entry_data['file'].split('/')[-1].replace('.pdf', '')
        original_md_path = f"{original_filename}.md"
        new_md_path = f"@{entry_data['citekey']}.md"
        # Check if original markdown file exists
        if not os.path.exists(original_md_path):
            print(f"Warning: No markdown file found for {original_filename}")
            continue
        # Create new content
        new_content = create_markdown_content(entry_data)
        # Read existing content (might want to preserve some of it)
        with open(original_md_path, 'r', encoding='utf-8') as f:
            existing_content = f.read()
        # Write new file, appending the existing notes after the generated header
        with open(new_md_path, 'w', encoding='utf-8') as f:
            f.write(new_content)
            if existing_content:
                f.write(existing_content)
        # Remove old file if it's different from the new one
        if original_md_path != new_md_path and os.path.exists(original_md_path):
            os.remove(original_md_path)
        print(f"Processed: {original_md_path} → {new_md_path}")
        processed += 1
if __name__ == "__main__":
    merged_data = create_merged_dataset()
    merged_data.to_csv('library-merged.csv', index=False)
    # Process all entries
    process_files(merged_data)
    # Or process only the first entry for testing:
    # process_files(merged_data, limit=1)
I used this to update my Obsidian collection of literature notes. My files were named `Author Year - Title.md`; this renames them to `@citekey.md`.

To use it, export your Zotero library twice: once in BibTeX format using the Better BibTeX plugin, and once as CSV using File --> "Export Library...". Rename those exports to `library.bib` and `library.csv` and put them in the directory with this script and with all the markdown files. Create a backup of the folder before you do anything else. Add the `limit=1` parameter to the `process_files` call at the end to make sure you like the result before you do the mass move.
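For example, a minimal test run (assuming the exports already sit in the working directory under the names above) could look like:

```python
# Build the merged table, save it for inspection, and rename only the first matching note.
merged_data = create_merged_dataset(bib_path='library.bib', csv_path='library.csv')
merged_data.to_csv('library-merged.csv', index=False)
process_files(merged_data, limit=1)
```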