Last active
January 21, 2025 17:59
-
-
Save mthomason/ee5d9753e61cca287422c80e62b521a3 to your computer and use it in GitHub Desktop.
Create an `Index.md` file for an Obsidian vault. Edit script to add path to your vault.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Author: Michael Thomason <[email protected]> | |
# Copyright (C) 2025 Michael Thomason, All rights reserved. | |
# License: MIT | |
# Obsidian Vault Index Generator | |
# | |
# This script generates an index for an Obsidian vault by scanning all markdown files | |
# in a specified directory. It organizes H1 and H2 headings in a nested structure. | |
# | |
# Usage: | |
# 1. Set the `VAULT_PATH` variable to the root directory of your Obsidian vault. | |
# 2. Run this script. It will create or update an `Index.md` file in the vault. | |
# 3. Open `Index.md` in Obsidian to view the generated index. | |
# 4. Set the `PRINT_WARNING` variable to False, unless you have a warning file.
# | |
# License: | |
# This code is free to use under the MIT License. See the LICENSE file for details. | |
import os | |
import re | |
import logging | |
from datetime import datetime | |
from calendar import month_name | |
# Configuration Constants
VAULT_PATH: str = '/Users/michael/Projects/scratch/notes/Notes/'  # Root directory of the Obsidian vault (edit this)
INDEX_FILE_NAME: str = 'Index.md'  # Name of the generated index file, written into the vault root
FILE_EXTENSION: str = '.md'  # File extension to process
DAILY_NOTES_FOLDER: str = 'DailyNotes'  # Name of daily notes folder (grouped by month in the index)
FOLDER_THRESHOLD: int = 50  # Threshold for applying custom groupings (folders above it are grouped A-Z)
LOG_LEVEL: str = 'INFO'  # Logging level: DEBUG, INFO, WARNING, ERROR
PRINT_WARNING: bool = True  # Outputs details from a warning file.
EXCLUDE_FOLDERS: list[str] = ['Templates', 'Archive', 'Images']  # Folders to exclude from indexing
INCLUDE_STATISTICS: bool = False  # Flag to enable/disable document statistics
SKIP_ROOT: bool = True  # When True, files directly in the vault root are not indexed
EMOJI_MAPPING: dict[str, str] = {
    'DailyNotes': '📅',
    'Projects': '📂',
    'Work': '💼',
    'Personal': '👤',
    'Ideas': '💡',
    'Gpt': '🤖',
    'Images': '❓',
    'Random': '💡',
    '3rdParty': '🏢'
}  # Emoji mapping for folder icons

# Configure the root logger once at import time; the level comes from LOG_LEVEL above.
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL),
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
def get_emoji_for_folder(folder_name: str) -> str:
    """Return the emoji icon for a folder path.

    Each path component of *folder_name* is checked against EMOJI_MAPPING
    and the first component with a mapping wins.  The path is normalized to
    '/' separators first: folder names come from os.path.relpath/dirname,
    which use os.sep, so splitting only on '/' would never match a nested
    folder on Windows.
    """
    for part in folder_name.replace(os.sep, '/').split('/'):
        if part in EMOJI_MAPPING:
            return EMOJI_MAPPING[part]
    return '📄'  # Default emoji if no component matches
def find_markdown_files(vault_path: str, extension: str = FILE_EXTENSION) -> list[str]:
    """Recursively find all markdown files in the vault.

    Honours EXCLUDE_FOLDERS (pruned from the walk) and, when SKIP_ROOT is
    True, ignores files that live directly in the vault root (such as the
    generated Index.md itself).

    Bug fix: the directory pruning now runs *before* the root-folder skip.
    Previously the `continue` for the root iteration executed first, so
    EXCLUDE_FOLDERS entries located directly under the vault root were
    never removed from `dirs` and their contents were still indexed.

    Returns the paths sorted by bare filename in descending order.
    """
    logging.info("Scanning for markdown files in the vault...")
    markdown_files: list[str] = []
    for root, dirs, files in os.walk(vault_path):
        # Prune excluded folders in place so os.walk never descends into
        # them — this must happen even for the (possibly skipped) root.
        dirs[:] = [d for d in dirs if d not in EXCLUDE_FOLDERS]
        if SKIP_ROOT and root == vault_path:
            logging.debug(f"Skipping files in the root folder: {root}")
            continue  # Skip processing files in the root folder
        for file in files:
            if file.endswith(extension):
                markdown_files.append(os.path.join(root, file))
    logging.info(f"Found {len(markdown_files)} markdown files.")
    return sorted(markdown_files, key=os.path.basename, reverse=True)
def extract_headings(file_path: str) -> dict[str, list[str | tuple[str, str]]]: | |
"""Extract H1 and H2 headings from a markdown file.""" | |
logging.debug(f"Extracting headings from {file_path}...") | |
headings: dict[str, list[str | tuple[str, str]]] = {"h1": [], "h2": []} | |
h1_pattern = re.compile(r'^#\s+(.*)') | |
h2_pattern = re.compile(r'^##\s+(.*)') | |
current_h1 = None | |
try: | |
with open(file_path, 'r', encoding='utf-8') as file: | |
for line in file: | |
h1_match = h1_pattern.match(line) | |
h2_match = h2_pattern.match(line) | |
if h1_match: | |
current_h1 = h1_match.group(1).strip() | |
headings["h1"].append(current_h1) | |
elif h2_match and current_h1 is not None: | |
headings["h2"].append((current_h1, h2_match.group(1).strip())) | |
except Exception as e: | |
logging.error(f"Error reading file {file_path}: {e}") | |
return headings | |
def group_files_by_folder(files: list[str]) -> dict[str, list[str]]:
    """Group file paths by their folder relative to the vault root.

    Files living directly in the vault root are dropped.  Entries in the
    DailyNotes folder are additionally sorted by filename, newest first.
    """
    grouped: dict[str, list[str]] = {}
    for path in files:
        folder = os.path.dirname(os.path.relpath(path, VAULT_PATH))
        if folder == '.':
            continue  # root-level file: not grouped
        grouped.setdefault(folder, []).append(path)
    daily = grouped.get(DAILY_NOTES_FOLDER)
    if daily is not None:
        # Daily notes carry date-stamped names, so a reverse filename sort
        # puts the most recent note first.
        daily.sort(key=os.path.basename, reverse=True)
    return grouped
def get_file_metadata(file_path: str) -> tuple[int, str]:
    """Return (size in bytes, last-modified timestamp as 'YYYY-MM-DD HH:MM:SS')."""
    info = os.stat(file_path)
    modified = datetime.fromtimestamp(info.st_mtime).strftime('%Y-%m-%d %H:%M:%S')
    return info.st_size, modified
def pretty_print_month(year_month: str) -> str:
    """Render a 'YYYY-MM' key as a readable month, e.g. '2023-07' -> 'July 2023'."""
    parts = year_month.split('-')
    month_index = int(parts[1])
    return f"{month_name[month_index]} {parts[0]}"
def write_grouped_index(index_file, folder_name: str, files: list[str], headings_extractor, grouping_func, pretty_print: bool = False) -> None:
    """Write one folder's index section as '### <group>' sub-sections.

    Args:
        index_file: Open writable text file the markdown is appended to.
        folder_name: Folder being rendered. Kept for interface compatibility;
            the current output format does not reference it.
        files: Paths of the markdown files to list.
        headings_extractor: Callable(path) -> {"h1": [...], "h2": [...]}.
        grouping_func: Callable(files) -> sorted list of (key, files) tuples.
        pretty_print: When True, group keys are 'YYYY-MM' strings rendered
            via pretty_print_month (e.g. 'July 2023').

    Bug fix: the display name is now produced with str.removesuffix instead
    of str.rstrip.  rstrip(".md") strips a *character set*, so filenames
    ending in '.', 'm' or 'd' (e.g. 'command.md' -> 'comman') were mangled.
    """
    grouped = grouping_func(files)
    for group_key, group_files in grouped:
        pretty_key = pretty_print_month(group_key) if pretty_print else group_key
        index_file.write(f'### {pretty_key}\n\n')
        for file_path in group_files:
            file_name = os.path.basename(file_path)
            display_name = file_name.removesuffix('.md')
            index_file.write(f'* [[{file_name}|{display_name}]]\n')
            if INCLUDE_STATISTICS:
                file_size, last_modified = get_file_metadata(file_path)
                index_file.write(f' *Size: {file_size / 1024:.2f} KB, Last Modified: {last_modified}*\n')
            headings = headings_extractor(file_path)
            for h1 in headings["h1"]:
                index_file.write(f' * [[{file_name}#{h1}|{h1}]]\n')
                # Only H2s recorded under this particular H1.
                for h2_tuple in filter(lambda x: x[0] == h1, headings["h2"]):
                    index_file.write(f' * [[{file_name}#{h2_tuple[1]}|{h2_tuple[1]}]]\n')
        index_file.write('\n')
def group_by_month(file_paths: list[str]) -> list[tuple[str, list[str]]]:
    """Bucket files by the 'YYYY-MM' stamp in their filename, newest month first.

    Files whose basename carries no 'YYYY-MM' pattern are silently dropped.
    Returns a list of (year_month, files) tuples sorted descending by month.
    """
    date_re = re.compile(r'(\d{4})-(\d{2})')
    buckets: dict[str, list[str]] = {}
    for path in file_paths:
        stamp = date_re.search(os.path.basename(path))
        if stamp is None:
            continue
        key = f"{stamp.group(1)}-{stamp.group(2)}"
        buckets.setdefault(key, []).append(path)
    # Parse the key back to a datetime so the sort is chronological.
    return sorted(
        buckets.items(),
        key=lambda item: datetime.strptime(item[0], "%Y-%m"),
        reverse=True,
    )
def group_by_alphabet(file_paths: list[str]) -> list[tuple[str, list[str]]]:
    """Bucket files by the upper-cased first character of their filename.

    Returns (letter, files) tuples sorted ascending by letter; files keep
    their input order within each bucket.
    """
    buckets: dict[str, list[str]] = {}
    for path in file_paths:
        letter = os.path.basename(path)[0].upper()
        buckets.setdefault(letter, []).append(path)
    # Keys are unique, so sorting the items sorts by letter alone.
    return sorted(buckets.items())
def create_index(vault_path: str, index_file_path: str) -> None:
    """Create the vault's index file.

    Scans *vault_path* for markdown files, groups them by folder, and
    writes to *index_file_path*: an optional warning banner, a folder
    summary with in-page anchor links, then one '## <emoji> <folder>'
    section per folder.  The DailyNotes folder is grouped by month,
    folders with more than FOLDER_THRESHOLD files alphabetically, and all
    other folders list each file with its H1/H2 headings.

    Bug fix: the file display name is produced with str.removesuffix
    instead of str.rstrip — rstrip(".md") strips a *character set* and
    mangled filenames ending in '.', 'm' or 'd' (e.g. 'command.md').
    """
    markdown_files = find_markdown_files(vault_path)
    grouped_files = group_files_by_folder(markdown_files)
    with open(index_file_path, 'w', encoding='utf-8') as index_file:
        # Optional transcluded warning note at the very top of the index.
        if PRINT_WARNING:
            index_file.write("![[Random/Warning|Warning]]\n\n---\n")
        index_file.write('# Vault Index\n\n')
        index_file.write('_This file is autogenerated. Last updated: '
                         f'{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}_\n\n')
        # Folder summary: anchor links must match the exact section
        # headings written below, including the emoji prefix.
        index_file.write('## Folders\n\n')
        for folder_name, files in sorted(grouped_files.items()):
            emoji = get_emoji_for_folder(folder_name)
            anchor_name = f"{emoji} {folder_name}"
            index_file.write(f'* [[#{anchor_name}|{anchor_name}]] ({len(files)} files)\n')
        index_file.write('\n---\n\n')
        # Per-folder detail sections, plus running totals for the summary.
        total_files = 0
        total_size = 0
        for folder_name, file_paths in sorted(grouped_files.items()):
            total_files += len(file_paths)
            total_size += sum(os.path.getsize(file_path) for file_path in file_paths)
            emoji = get_emoji_for_folder(folder_name)
            # Heading text is identical in all three branches, so write it once.
            index_file.write(f'## {emoji} {folder_name}\n\n')
            if folder_name == DAILY_NOTES_FOLDER:
                # Daily notes: grouped by 'YYYY-MM', newest month first.
                write_grouped_index(index_file, folder_name, file_paths, extract_headings, group_by_month, pretty_print=True)
            elif len(file_paths) > FOLDER_THRESHOLD:
                # Large folders: grouped A-Z by first letter.
                write_grouped_index(index_file, folder_name, file_paths, extract_headings, group_by_alphabet)
            else:
                # Standard output: one '###' entry per file with its headings.
                for file_path in file_paths:
                    file_name = os.path.basename(file_path)
                    index_file.write(f'### [[{file_name}|{file_name.removesuffix(".md")}]]\n')
                    if INCLUDE_STATISTICS:
                        file_size, last_modified = get_file_metadata(file_path)
                        index_file.write(f'*Size: {file_size / 1024:.2f} KB, Last Modified: {last_modified}*\n')
                    headings = extract_headings(file_path)
                    for h1 in headings["h1"]:
                        index_file.write(f'* [[{file_name}#{h1}|{h1}]]\n')
                        for h2_tuple in filter(lambda x: x[0] == h1, headings["h2"]):
                            index_file.write(f' * [[{file_name}#{h2_tuple[1]}|{h2_tuple[1]}]]\n')
                    index_file.write('\n')
        # Optional statistics summary at the bottom of the index.
        if INCLUDE_STATISTICS:
            index_file.write('---\n\n')
            index_file.write('## Summary\n\n')
            index_file.write(f'- Total folders: {len(grouped_files)}\n')
            index_file.write(f'- Total files: {total_files}\n')
            index_file.write(f'- Total size: {total_size / (1024 * 1024):.2f} MB\n')
    logging.info(f"Index file created successfully at {index_file_path}")
# Script entry point: regenerate Index.md inside the configured vault.
if __name__ == '__main__':
    index_file_path = os.path.join(VAULT_PATH, INDEX_FILE_NAME)
    create_index(VAULT_PATH, index_file_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment