A gist to rename Markdown files from their Zotero PDF filenames to their citekey (and embed some metadata)
import bibtexparser
import pandas as pd
from pathlib import Path
import os
import re
from datetime import datetime
import yaml
def load_bibtex(bib_path):
    """Load bibtex file and convert to DataFrame."""
    with open(bib_path, 'r', encoding='utf-8') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    # Convert to DataFrame
    df_bib = pd.DataFrame(bib_database.entries)
    # Set ID as index
    df_bib.set_index('ID', inplace=True)
    return df_bib
def clean_author_names(author_name_string):
    """Convert author names from 'Last, First' format to 'First Last' format.

    Args:
        author_name_string (str): String of author names in format 'Last, First; Last, First'

    Returns:
        str: Author names in format 'First Last, First Last'
    """
    if not isinstance(author_name_string, str):
        print("bad author name string", author_name_string)
        return ""
    # Split multiple authors
    authors = author_name_string.split(';')
    # Process each author
    cleaned_names = []
    for author in authors:
        # Split into last and first name
        name_parts = author.strip().split(',')
        if len(name_parts) == 2:
            last_name = name_parts[0].strip()
            first_name = name_parts[1].strip()
            cleaned_names.append(f"{first_name} {last_name}")
    # Join all authors with commas
    return ', '.join(cleaned_names)
def create_merged_dataset(bib_path='library.bib', csv_path='library.csv'):
    """Create a merged dataset from bibtex and csv files."""
    # Load bibtex
    df_bib = load_bibtex(bib_path)
    df_bib['citekey'] = df_bib.index
    # Load CSV
    df_csv = pd.read_csv(csv_path)
    # Extract just the filename from the full paths
    df_bib['filename'] = df_bib['file'].str.extract(r'/([^/]+)$')
    df_csv['filename'] = df_csv['File Attachments'].str.extract(r'/([^/]+)$')
    # Drop rows with no filename
    df_bib = df_bib[df_bib['filename'].notna()]
    df_csv = df_csv[df_csv['filename'].notna()]
    # Merge on the extracted filenames
    merged_data = df_bib.merge(df_csv, on='filename', how='inner')
    clean_data = []
    for _, entry in merged_data.iterrows():
        entry_data = {
            'citekey': entry['citekey'],
            'title': entry['title'].replace('{', '').replace('}', ''),
            'year': entry['year'],
            'authors_raw': entry['Author'],
            'authors': clean_author_names(entry['Author']),
            'type': entry['Item Type'],
            'abstract': entry['abstract'],
            'file': entry['file'],
            'downloadURI': f"zotero://select/library/items/{entry['Key']}",
            'tags': ', '.join(entry['Manual Tags'].split(';')) if pd.notna(entry['Manual Tags']) else ''
        }
        clean_data.append(entry_data)
    return pd.DataFrame(clean_data)
def create_markdown_content(entry_data):
    """Create markdown content from template."""
    frontmatter = {
        'year': entry_data['year'],
        'authors': entry_data['authors_raw'],
        'type': entry_data['type']
    }
    # Create the content
    content = f"""---
{yaml.dump(frontmatter, sort_keys=False)}---
## {entry_data['title']}
{entry_data['authors']}
{entry_data['year']}
[Open in Zotero]({entry_data['downloadURI']})
> [!info]
> - **Abstract:** {entry_data['abstract']}
"""
    if entry_data['tags']:
        content += f"\n> - **Tags:** {entry_data['tags']}"
    content += """
>[!Personal Notes]
>
## Annotations
"""
    return content
def process_files(merged_data, limit=None):
    """Main function to process files.

    Args:
        merged_data: DataFrame of entries to process
        limit: Optional int, maximum number of entries to process
    """
    processed = 0
    # Process each entry
    for _, entry_data in merged_data.iterrows():
        if limit and processed >= limit:
            print("hit the limit")
            break
        # Extract original filename from file field
        original_filename = entry_data['file'].split('/')[-1].replace('.pdf', '')
        original_md_path = f"{original_filename}.md"
        new_md_path = f"@{entry_data['citekey']}.md"
        # Check if original markdown file exists
        if not os.path.exists(original_md_path):
            print(f"Warning: No markdown file found for {original_filename}")
            continue
        # Create new content
        new_content = create_markdown_content(entry_data)
        # Read existing content (might want to preserve some of it)
        with open(original_md_path, 'r', encoding='utf-8') as f:
            existing_content = f.read()
        # Write new file, appending the existing notes after the generated header
        with open(new_md_path, 'w', encoding='utf-8') as f:
            f.write(new_content)
            if existing_content:
                f.write(existing_content)
        # Remove old file if it's different from the new one
        if original_md_path != new_md_path and os.path.exists(original_md_path):
            os.remove(original_md_path)
        print(f"Processed: {original_md_path} → {new_md_path}")
        processed += 1
if __name__ == "__main__":
    merged_data = create_merged_dataset()
    merged_data.to_csv('library-merged.csv', index=False)
    # Process all entries
    process_files(merged_data)
    # Or process only the first entry for testing:
    # process_files(merged_data, limit=1)
I used this to update my Obsidian collection of literature notes. My files were named `Author Year - Title.md`; this renames them to `@citekey.md`.

To use it, export your Zotero library twice: once in BibTeX format using the Better BibTeX plugin, and once as CSV using File --> "Export Library...". Rename those exports to `library.bib` and `library.csv` and put them in the directory with this script and with all the markdown files. Create a backup of the folder before you do anything else. Add the `limit=1` parameter to the `process_files` call at the end to make sure you like the result before you do the mass move.
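For example, a minimal test run (assuming the exports already sit in the working directory under the names above) could look like:

```python
# Build the merged table, save it for inspection, and rename only the first matching note.
merged_data = create_merged_dataset(bib_path='library.bib', csv_path='library.csv')
merged_data.to_csv('library-merged.csv', index=False)
process_files(merged_data, limit=1)
```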