Skip to content

Instantly share code, notes, and snippets.

@brandones
Created December 12, 2024 17:13
Show Gist options
  • Save brandones/7b16459be935a9b9f838d54117f64977 to your computer and use it in GitHub Desktop.
Save brandones/7b16459be935a9b9f838d54117f64977 to your computer and use it in GitHub Desktop.
A gist to rename Markdown files from their Zotero PDF filenames to their citekey (and embed some metadata)
import bibtexparser
import pandas as pd
from pathlib import Path
import os
import re
from datetime import datetime
import yaml
def load_bibtex(bib_path):
    """Parse a BibTeX file into a pandas DataFrame indexed by citekey.

    Args:
        bib_path: Path to the .bib file exported from Zotero.

    Returns:
        pd.DataFrame: One row per bibliography entry, indexed by the
        entry's BibTeX ID (its citekey).
    """
    with open(bib_path, 'r', encoding='utf-8') as fh:
        database = bibtexparser.load(fh)
    # Entries come back as a list of dicts; index the frame by BibTeX ID.
    entries = pd.DataFrame(database.entries)
    return entries.set_index('ID')
def clean_author_names(author_name_string):
    """Convert 'Last, First; Last, First' into 'First Last, First Last'.

    Args:
        author_name_string (str): Semicolon-separated author names, each in
            'Last, First' order (the Zotero CSV "Author" column format).

    Returns:
        str: Comma-separated names in 'First Last' order. Names without
        exactly one comma (e.g. institutional authors) are kept verbatim.
        Returns "" for non-string input (e.g. NaN from pandas).
    """
    if not isinstance(author_name_string, str):
        # NaN/None from pandas lands here; warn and produce an empty string.
        print("bad author name string", author_name_string)
        return ""
    cleaned_names = []
    for author in author_name_string.split(';'):
        name_parts = author.strip().split(',')
        if len(name_parts) == 2:
            last_name = name_parts[0].strip()
            first_name = name_parts[1].strip()
            cleaned_names.append(f"{first_name} {last_name}")
        elif author.strip():
            # Bug fix: names without exactly one comma (institutions,
            # 'Last, First, Jr.' suffixes) were previously dropped
            # silently. Keep them as written instead.
            cleaned_names.append(author.strip())
    return ', '.join(cleaned_names)
def create_merged_dataset(bib_path='library.bib', csv_path='library.csv'):
    """Join the BibTeX and CSV exports of a Zotero library on PDF filename.

    Args:
        bib_path: Path to the BibTeX export (default 'library.bib').
        csv_path: Path to the Zotero CSV export (default 'library.csv').

    Returns:
        pd.DataFrame: One cleaned record per entry present in BOTH exports,
        with citekey, title, year, raw and cleaned authors, item type,
        abstract, file path, a zotero:// select URI, and comma-joined tags.
    """
    # Both exports embed full attachment paths; the basename is the join key.
    basename_pattern = r'/([^/]+)$'

    bib = load_bibtex(bib_path)
    bib['citekey'] = bib.index
    bib['filename'] = bib['file'].str.extract(basename_pattern)
    bib = bib[bib['filename'].notna()]

    csv = pd.read_csv(csv_path)
    csv['filename'] = csv['File Attachments'].str.extract(basename_pattern)
    csv = csv[csv['filename'].notna()]

    merged = bib.merge(csv, on='filename', how='inner')

    def tidy(row):
        # Flatten one merged row into the record shape used downstream.
        if pd.notna(row['Manual Tags']):
            tags = ', '.join(row['Manual Tags'].split(';'))
        else:
            tags = ''
        return {
            'citekey': row['citekey'],
            'title': row['title'].replace('{', '').replace('}', ''),
            'year': row['year'],
            'authors_raw': row['Author'],
            'authors': clean_author_names(row['Author']),
            'type': row['Item Type'],
            'abstract': row['abstract'],
            'file': row['file'],
            'downloadURI': f"zotero://select/library/items/{row['Key']}",
            'tags': tags,
        }

    return pd.DataFrame([tidy(row) for _, row in merged.iterrows()])
def create_markdown_content(entry_data):
    """Render one literature note as Markdown with YAML frontmatter.

    Args:
        entry_data: Mapping with keys 'year', 'authors_raw', 'type',
            'title', 'authors', 'downloadURI', 'abstract', 'tags'
            (see create_merged_dataset).

    Returns:
        str: The note header: frontmatter, citation heading, Zotero link,
        an info callout with abstract (plus tags when present), and empty
        personal-notes / annotations stubs.
    """
    frontmatter = yaml.dump(
        {
            'year': entry_data['year'],
            'authors': entry_data['authors_raw'],
            'type': entry_data['type'],
        },
        sort_keys=False,
    )
    pieces = [f"""---
{frontmatter}---
## {entry_data['title']}
{entry_data['authors']}
{entry_data['year']}
[Open in Zotero]({entry_data['downloadURI']})
> [!info]
> - **Abstract:** {entry_data['abstract']}
"""]
    # Tags line is only emitted when the entry actually has tags.
    if entry_data['tags']:
        pieces.append(f"\n> - **Tags:** {entry_data['tags']}")
    pieces.append("""
>[!Personal Notes]
>
## Annotations
""")
    return ''.join(pieces)
def process_files(merged_data, limit=None):
    """Rename per-PDF markdown notes to @citekey.md, prepending metadata.

    For each entry, looks for '<pdf basename>.md' in the current directory,
    writes '@<citekey>.md' containing the generated header followed by the
    old file's content, then deletes the old file.

    Args:
        merged_data: DataFrame of entries to process
            (see create_merged_dataset).
        limit: Optional int, maximum number of entries to process.
    """
    processed = 0
    for _, entry_data in merged_data.iterrows():
        # Bug fix: `if limit and ...` treated limit=0 as "no limit";
        # compare against None explicitly.
        if limit is not None and processed >= limit:
            print("hit the limit")
            break
        # Bug fix: str.replace('.pdf', '') also mangled '.pdf' occurring
        # mid-name; strip only a trailing extension.
        basename = entry_data['file'].split('/')[-1]
        original_filename = re.sub(r'\.pdf$', '', basename)
        original_md_path = f"{original_filename}.md"
        new_md_path = f"@{entry_data['citekey']}.md"
        if not os.path.exists(original_md_path):
            print(f"Warning: No markdown file found for {original_filename}")
            continue
        new_content = create_markdown_content(entry_data)
        # Preserve the existing note body below the generated header.
        with open(original_md_path, 'r', encoding='utf-8') as f:
            existing_content = f.read()
        with open(new_md_path, 'w', encoding='utf-8') as f:
            f.write(new_content)
            if existing_content:
                f.write(existing_content)
        # Remove the old file only when the rename actually moved it.
        if original_md_path != new_md_path and os.path.exists(original_md_path):
            os.remove(original_md_path)
        print(f"Processed: {original_md_path} → {new_md_path}")
        processed += 1
# Bug fix: the entry-point guard was commented out, so merely importing
# this module would run the mass rename. Restore the guard.
if __name__ == "__main__":
    merged_data = create_merged_dataset()
    merged_data.to_csv('library-merged.csv', index=False)
    # Sanity-check first: run with limit=1 and inspect the result before
    # letting it touch every file.
    # process_files(merged_data, limit=1)
    process_files(merged_data)
@brandones
Copy link
Author

I used this to update my Obsidian collection of literature notes. Files were named Author Year - Title.md. This renames them to @citekey.md.

In order to use this, you must export your Zotero library both in BibTeX format using the Better BibTeX plugin, and in CSV using File --> "Export Library...". Rename those files to library.bib and library.csv respectively and put them in the directory with this script and with all the markdown files. Create a backup of the folder before you do anything else. Add the limit=1 parameter to the process_files call at the end to make sure you like the result before you do the mass move.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment