Last active
January 21, 2025 17:59
-
-
Save mthomason/ee5d9753e61cca287422c80e62b521a3 to your computer and use it in GitHub Desktop.
Create an `Index.md` file for an Obsidian vault. Edit script to add path to your vault.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Author: Michael Thomason <[email protected]> | |
# Copyright (C) 2025 Michael Thomason, All rights reserved. | |
# License: MIT | |
# Obsidian Vault Index Generator | |
# | |
# This script generates an index for an Obsidian vault by scanning all markdown files | |
# in a specified directory. It organizes H1 and H2 headings in a nested structure. | |
# | |
# Usage: | |
# 1. Set the `VAULT_PATH` variable to the root directory of your Obsidian vault. | |
# 2. Run this script. It will create or update an `Index.md` file in the vault. | |
# 3. Open `Index.md` in Obsidian to view the generated index. | |
# 4. Set the `PRINT_WARNING` variable to False, unless you have a warning file.
# | |
# License: | |
# This code is free to use under the MIT License. See the LICENSE file for details. | |
import os | |
import re | |
import logging | |
from datetime import datetime | |
from calendar import month_name | |
# Configuration Constants
VAULT_PATH: str = '/Users/michael/Projects/scratch/notes/Notes/'  # Root directory of the Obsidian vault (edit this)
INDEX_FILE_NAME: str = 'Index.md'  # Name of the generated index file, written into the vault root
FILE_EXTENSION: str = '.md'  # File extension to process
DAILY_NOTES_FOLDER: str = 'DailyNotes'  # Name of daily notes folder (grouped by month in the index)
FOLDER_THRESHOLD: int = 50  # Threshold for applying custom groupings (folders above it are grouped A-Z)
LOG_LEVEL: str = 'INFO'  # Logging level: DEBUG, INFO, WARNING, ERROR
PRINT_WARNING: bool = True  # Outputs details from a warning file.
EXCLUDE_FOLDERS: list[str] = ['Templates', 'Archive', 'Images']  # Folders to exclude from indexing
INCLUDE_STATISTICS: bool = False  # Flag to enable/disable document statistics
SKIP_ROOT: bool = True  # When True, files directly in the vault root are not indexed
EMOJI_MAPPING: dict[str, str] = {
    'DailyNotes': '📅',
    'Projects': '📂',
    'Work': '💼',
    'Personal': '👤',
    'Ideas': '💡',
    'Gpt': '🤖',
    'Images': '❓',
    'Random': '💡',
    '3rdParty': '🏢'
}  # Emoji mapping for folder icons

# Configure the root logger once at import time; the level comes from LOG_LEVEL above.
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL),
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
def get_emoji_for_folder(folder_name: str) -> str:
    """Return the emoji icon for a folder path.

    Each path component of *folder_name* is checked against EMOJI_MAPPING
    and the first component with a mapping wins.  The path is normalized to
    '/' separators first: folder names come from os.path.relpath/dirname,
    which use os.sep, so splitting only on '/' would never match a nested
    folder on Windows.
    """
    for part in folder_name.replace(os.sep, '/').split('/'):
        if part in EMOJI_MAPPING:
            return EMOJI_MAPPING[part]
    return '📄'  # Default emoji if no component matches
def find_markdown_files(vault_path: str, extension: str = FILE_EXTENSION) -> list[str]:
    """Recursively find all markdown files in the vault.

    Honours EXCLUDE_FOLDERS (pruned from the walk) and, when SKIP_ROOT is
    True, ignores files that live directly in the vault root (such as the
    generated Index.md itself).

    Bug fix: the directory pruning now runs *before* the root-folder skip.
    Previously the `continue` for the root iteration executed first, so
    EXCLUDE_FOLDERS entries located directly under the vault root were
    never removed from `dirs` and their contents were still indexed.

    Returns the paths sorted by bare filename in descending order.
    """
    logging.info("Scanning for markdown files in the vault...")
    markdown_files: list[str] = []
    for root, dirs, files in os.walk(vault_path):
        # Prune excluded folders in place so os.walk never descends into
        # them — this must happen even for the (possibly skipped) root.
        dirs[:] = [d for d in dirs if d not in EXCLUDE_FOLDERS]
        if SKIP_ROOT and root == vault_path:
            logging.debug(f"Skipping files in the root folder: {root}")
            continue  # Skip processing files in the root folder
        for file in files:
            if file.endswith(extension):
                markdown_files.append(os.path.join(root, file))
    logging.info(f"Found {len(markdown_files)} markdown files.")
    return sorted(markdown_files, key=os.path.basename, reverse=True)
def extract_headings(file_path: str) -> dict[str, list[str | tuple[str, str]]]: | |
"""Extract H1 and H2 headings from a markdown file.""" | |
logging.debug(f"Extracting headings from {file_path}...") | |
headings: dict[str, list[str | tuple[str, str]]] = {"h1": [], "h2": []} | |
h1_pattern = re.compile(r'^#\s+(.*)') | |
h2_pattern = re.compile(r'^##\s+(.*)') | |
current_h1 = None | |
try: | |
with open(file_path, 'r', encoding='utf-8') as file: | |
for line in file: | |
h1_match = h1_pattern.match(line) | |
h2_match = h2_pattern.match(line) | |
if h1_match: | |
current_h1 = h1_match.group(1).strip() | |
headings["h1"].append(current_h1) | |
elif h2_match and current_h1 is not None: | |
headings["h2"].append((current_h1, h2_match.group(1).strip())) | |
except Exception as e: | |
logging.error(f"Error reading file {file_path}: {e}") | |
return headings | |
def group_files_by_folder(files: list[str]) -> dict[str, list[str]]:
    """Group file paths by their folder relative to the vault root.

    Files living directly in the vault root are dropped.  Entries in the
    DailyNotes folder are additionally sorted by filename, newest first.
    """
    grouped: dict[str, list[str]] = {}
    for path in files:
        folder = os.path.dirname(os.path.relpath(path, VAULT_PATH))
        if folder == '.':
            continue  # root-level file: not grouped
        grouped.setdefault(folder, []).append(path)
    daily = grouped.get(DAILY_NOTES_FOLDER)
    if daily is not None:
        # Daily notes carry date-stamped names, so a reverse filename sort
        # puts the most recent note first.
        daily.sort(key=os.path.basename, reverse=True)
    return grouped
def get_file_metadata(file_path: str) -> tuple[int, str]:
    """Return (size in bytes, last-modified timestamp as 'YYYY-MM-DD HH:MM:SS')."""
    info = os.stat(file_path)
    modified = datetime.fromtimestamp(info.st_mtime).strftime('%Y-%m-%d %H:%M:%S')
    return info.st_size, modified
def pretty_print_month(year_month: str) -> str:
    """Render a 'YYYY-MM' key as a readable month, e.g. '2023-07' -> 'July 2023'."""
    parts = year_month.split('-')
    month_index = int(parts[1])
    return f"{month_name[month_index]} {parts[0]}"
def write_grouped_index(index_file, folder_name: str, files: list[str], headings_extractor, grouping_func, pretty_print: bool = False) -> None:
    """Write one folder's index section as '### <group>' sub-sections.

    Args:
        index_file: Open writable text file the markdown is appended to.
        folder_name: Folder being rendered. Kept for interface compatibility;
            the current output format does not reference it.
        files: Paths of the markdown files to list.
        headings_extractor: Callable(path) -> {"h1": [...], "h2": [...]}.
        grouping_func: Callable(files) -> sorted list of (key, files) tuples.
        pretty_print: When True, group keys are 'YYYY-MM' strings rendered
            via pretty_print_month (e.g. 'July 2023').

    Bug fix: the display name is now produced with str.removesuffix instead
    of str.rstrip.  rstrip(".md") strips a *character set*, so filenames
    ending in '.', 'm' or 'd' (e.g. 'command.md' -> 'comman') were mangled.
    """
    grouped = grouping_func(files)
    for group_key, group_files in grouped:
        pretty_key = pretty_print_month(group_key) if pretty_print else group_key
        index_file.write(f'### {pretty_key}\n\n')
        for file_path in group_files:
            file_name = os.path.basename(file_path)
            display_name = file_name.removesuffix('.md')
            index_file.write(f'* [[{file_name}|{display_name}]]\n')
            if INCLUDE_STATISTICS:
                file_size, last_modified = get_file_metadata(file_path)
                index_file.write(f' *Size: {file_size / 1024:.2f} KB, Last Modified: {last_modified}*\n')
            headings = headings_extractor(file_path)
            for h1 in headings["h1"]:
                index_file.write(f' * [[{file_name}#{h1}|{h1}]]\n')
                # Only H2s recorded under this particular H1.
                for h2_tuple in filter(lambda x: x[0] == h1, headings["h2"]):
                    index_file.write(f' * [[{file_name}#{h2_tuple[1]}|{h2_tuple[1]}]]\n')
        index_file.write('\n')
def group_by_month(file_paths: list[str]) -> list[tuple[str, list[str]]]:
    """Bucket files by the 'YYYY-MM' stamp in their filename, newest month first.

    Files whose basename carries no 'YYYY-MM' pattern are silently dropped.
    Returns a list of (year_month, files) tuples sorted descending by month.
    """
    date_re = re.compile(r'(\d{4})-(\d{2})')
    buckets: dict[str, list[str]] = {}
    for path in file_paths:
        stamp = date_re.search(os.path.basename(path))
        if stamp is None:
            continue
        key = f"{stamp.group(1)}-{stamp.group(2)}"
        buckets.setdefault(key, []).append(path)
    # Parse the key back to a datetime so the sort is chronological.
    return sorted(
        buckets.items(),
        key=lambda item: datetime.strptime(item[0], "%Y-%m"),
        reverse=True,
    )
def group_by_alphabet(file_paths: list[str]) -> list[tuple[str, list[str]]]:
    """Bucket files by the upper-cased first character of their filename.

    Returns (letter, files) tuples sorted ascending by letter; files keep
    their input order within each bucket.
    """
    buckets: dict[str, list[str]] = {}
    for path in file_paths:
        letter = os.path.basename(path)[0].upper()
        buckets.setdefault(letter, []).append(path)
    # Keys are unique, so sorting the items sorts by letter alone.
    return sorted(buckets.items())
def create_index(vault_path: str, index_file_path: str) -> None:
    """Create the vault's index file.

    Scans *vault_path* for markdown files, groups them by folder, and
    writes to *index_file_path*: an optional warning banner, a folder
    summary with in-page anchor links, then one '## <emoji> <folder>'
    section per folder.  The DailyNotes folder is grouped by month,
    folders with more than FOLDER_THRESHOLD files alphabetically, and all
    other folders list each file with its H1/H2 headings.

    Bug fix: the file display name is produced with str.removesuffix
    instead of str.rstrip — rstrip(".md") strips a *character set* and
    mangled filenames ending in '.', 'm' or 'd' (e.g. 'command.md').
    """
    markdown_files = find_markdown_files(vault_path)
    grouped_files = group_files_by_folder(markdown_files)
    with open(index_file_path, 'w', encoding='utf-8') as index_file:
        # Optional transcluded warning note at the very top of the index.
        if PRINT_WARNING:
            index_file.write("![[Random/Warning|Warning]]\n\n---\n")
        index_file.write('# Vault Index\n\n')
        index_file.write('_This file is autogenerated. Last updated: '
                         f'{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}_\n\n')
        # Folder summary: anchor links must match the exact section
        # headings written below, including the emoji prefix.
        index_file.write('## Folders\n\n')
        for folder_name, files in sorted(grouped_files.items()):
            emoji = get_emoji_for_folder(folder_name)
            anchor_name = f"{emoji} {folder_name}"
            index_file.write(f'* [[#{anchor_name}|{anchor_name}]] ({len(files)} files)\n')
        index_file.write('\n---\n\n')
        # Per-folder detail sections, plus running totals for the summary.
        total_files = 0
        total_size = 0
        for folder_name, file_paths in sorted(grouped_files.items()):
            total_files += len(file_paths)
            total_size += sum(os.path.getsize(file_path) for file_path in file_paths)
            emoji = get_emoji_for_folder(folder_name)
            # Heading text is identical in all three branches, so write it once.
            index_file.write(f'## {emoji} {folder_name}\n\n')
            if folder_name == DAILY_NOTES_FOLDER:
                # Daily notes: grouped by 'YYYY-MM', newest month first.
                write_grouped_index(index_file, folder_name, file_paths, extract_headings, group_by_month, pretty_print=True)
            elif len(file_paths) > FOLDER_THRESHOLD:
                # Large folders: grouped A-Z by first letter.
                write_grouped_index(index_file, folder_name, file_paths, extract_headings, group_by_alphabet)
            else:
                # Standard output: one '###' entry per file with its headings.
                for file_path in file_paths:
                    file_name = os.path.basename(file_path)
                    index_file.write(f'### [[{file_name}|{file_name.removesuffix(".md")}]]\n')
                    if INCLUDE_STATISTICS:
                        file_size, last_modified = get_file_metadata(file_path)
                        index_file.write(f'*Size: {file_size / 1024:.2f} KB, Last Modified: {last_modified}*\n')
                    headings = extract_headings(file_path)
                    for h1 in headings["h1"]:
                        index_file.write(f'* [[{file_name}#{h1}|{h1}]]\n')
                        for h2_tuple in filter(lambda x: x[0] == h1, headings["h2"]):
                            index_file.write(f' * [[{file_name}#{h2_tuple[1]}|{h2_tuple[1]}]]\n')
                    index_file.write('\n')
        # Optional statistics summary at the bottom of the index.
        if INCLUDE_STATISTICS:
            index_file.write('---\n\n')
            index_file.write('## Summary\n\n')
            index_file.write(f'- Total folders: {len(grouped_files)}\n')
            index_file.write(f'- Total files: {total_files}\n')
            index_file.write(f'- Total size: {total_size / (1024 * 1024):.2f} MB\n')
    logging.info(f"Index file created successfully at {index_file_path}")
# Script entry point: regenerate Index.md inside the configured vault.
if __name__ == '__main__':
    index_file_path = os.path.join(VAULT_PATH, INDEX_FILE_NAME)
    create_index(VAULT_PATH, index_file_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment