Created
May 4, 2025 02:20
-
-
Save shawnyeager/9c1f4054e95d71b1498de87165251f0c to your computer and use it in GitHub Desktop.
Convert Jekyll-formatted markdown files to prose.sh format.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Convert Jekyll-formatted markdown files to prose.sh format. | |
This script transforms Jekyll blog posts to prose.sh format with the following enhancements: | |
1. Processes all .md files in the current directory | |
2. Creates a 'prose.sh' subdirectory for output files | |
3. Transforms front matter: | |
- Removes Jekyll-specific fields like layout | |
- Preserves title, date, description (with markdown removed) | |
- Flattens image paths to just filenames | |
- Combines categories and tags into a single tags array | |
- Converts redirect_from to aliases | |
4. Content transformations: | |
- Embeds hero images at the top of posts when available | |
- Converts typography: -- to en-dashes, --- to em-dashes | |
- Preserves HTML comments while converting surrounding content | |
5. Strips date prefix from filenames (YYYY-MM-DD-name.md -> name.md) | |
Usage: | |
python convert_jekyll_to_prose.py | |
Author: Shawn Yeager | |
""" | |
import os | |
import re | |
import yaml | |
import sys | |
from pathlib import Path | |
def strip_markdown(text):
    """
    Strip markdown formatting from text while preserving content.

    Images are removed entirely, links are reduced to their link text, and
    emphasis markers, inline-code backticks, ATX header prefixes and HTML tags
    are dropped.

    Args:
        text (str): The markdown-formatted text

    Returns:
        str: Plain text with markdown formatting removed. Falsy or non-string
        input (e.g. None, "", or a number parsed from YAML) is returned
        unchanged.
    """
    if not text or not isinstance(text, str):
        return text

    # Remove inline code backticks
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Remove bold/italic markers (double markers first so they are not
    # half-consumed by the single-marker patterns)
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    text = re.sub(r'__([^_]+)__', r'\1', text)
    text = re.sub(r'_([^_]+)_', r'\1', text)

    # Remove images BEFORE links: an image is a link preceded by "!", so the
    # link pattern would otherwise rewrite "![alt](src)" to "!alt" and the
    # image would never be removed.
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', '', text)

    # Remove links but keep the text
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Remove header markers, only at the start of a line so that a "#" in
    # running text (e.g. "C# is great") is left alone
    text = re.sub(r'^[ ]{0,3}#{1,6}[ \t]+(.+)', r'\1', text, flags=re.MULTILINE)

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    return text.strip()
def mark_html_comments(text):
    """
    Swap every HTML comment for a placeholder so later text rewrites skip it.

    Comments are numbered in order of appearance and replaced by
    ``__HTML_COMMENT_<n>__``. Replacement is by exact text, so identical
    comments all collapse onto the first placeholder assigned to that text.

    Args:
        text (str): Text that may contain HTML comments

    Returns:
        tuple: (text with comments replaced by markers, dictionary of markers to original comments)
    """
    # Collect all comments up front (DOTALL lets a comment span several lines).
    found = re.findall(r'<!--.*?-->', text, flags=re.DOTALL)
    markers = {
        f"__HTML_COMMENT_{index}__": comment
        for index, comment in enumerate(found)
    }

    # Substitute in order of appearance.
    for placeholder, comment in markers.items():
        text = text.replace(comment, placeholder)

    return text, markers
def restore_html_comments(text, markers):
    """
    Put the original HTML comments back in place of their placeholders.

    Args:
        text (str): Text with comment markers
        markers (dict): Dictionary mapping markers to original comments

    Returns:
        str: Text with every occurrence of each marker replaced by its comment
    """
    restored = text
    for placeholder in markers:
        restored = restored.replace(placeholder, markers[placeholder])
    return restored
def format_tags_flow_style(dumper, data):
    """
    Represent a list as an inline (flow-style) YAML sequence: [tag1, tag2, tag3].

    Args:
        dumper: YAML dumper instance; its ``represent_sequence`` method is used
        data: The list of tags

    Returns:
        The node produced by ``dumper.represent_sequence`` with flow style on
    """
    sequence_tag = 'tag:yaml.org,2002:seq'
    node = dumper.represent_sequence(sequence_tag, data, flow_style=True)
    return node
# Code spans whose hyphens must never be rewritten: fenced code blocks
# (``` or ~~~, optionally indented) and single-line inline code.
_CODE_SPAN_RE = re.compile(
    r'^[ \t]*(?P<fence>`{3,}|~{3,})[^\n]*\n.*?^[ \t]*(?P=fence)[`~]*[ \t]*$'
    r'|`[^`\n]+`',
    re.DOTALL | re.MULTILINE,
)

# Lines made only of hyphens, pipes, colons and blanks (optionally inside a
# blockquote): horizontal rules, table separator rows, setext underlines.
_STRUCTURAL_LINE_RE = re.compile(r'^[ \t>]*[-|:][-|: \t]*$')


def _convert_typography(text):
    """Turn '---' into em-dashes and '--' into en-dashes in prose only.

    HTML comments, fenced code blocks and inline code are swapped for
    hyphen-free placeholders before the rewrite and restored afterwards.
    Structural markdown lines are left untouched.
    """
    # 1. Protect HTML comments from conversion
    text, comment_markers = mark_html_comments(text)

    # 2. Protect code, where "--" is usually a CLI flag or an operator
    code_markers = {}

    def _stash_code(match):
        marker = f"__CODE_SPAN_{len(code_markers)}__"
        code_markers[marker] = match.group(0)
        return marker

    text = _CODE_SPAN_RE.sub(_stash_code, text)

    # 3. Convert dashes line by line; triple hyphens first so they are not
    #    consumed as a double hyphen plus a stray "-"
    text = '\n'.join(
        line if _STRUCTURAL_LINE_RE.match(line)
        else line.replace('---', '—').replace('--', '–')
        for line in text.split('\n')
    )

    # 4. Restore code first (it may contain comment markers), then comments
    for marker, code in code_markers.items():
        text = text.replace(marker, code)
    return restore_html_comments(text, comment_markers)


def convert_jekyll_to_prose(jekyll_file_path, output_dir):
    """
    Convert a Jekyll markdown file to prose.sh format.

    Reads the file, rewrites its YAML front matter (title, date, description,
    image, tags, aliases), converts dash typography in the body, prepends a
    hero image when the front matter names one, and writes the result to
    ``output_dir`` under the original filename minus any YYYY-MM-DD- prefix.

    Files without front matter, with unparsable front matter, or whose front
    matter is empty or not a mapping are reported on stdout and skipped.

    Args:
        jekyll_file_path (str): Path to the Jekyll markdown file
        output_dir (str): Directory to write the converted file
    """
    with open(jekyll_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Extract front matter
    front_matter_match = re.match(r'^---\n(.*?)\n---\n', content, re.DOTALL)
    if not front_matter_match:
        print(f"No front matter found in {jekyll_file_path}")
        return

    front_matter_text = front_matter_match.group(1)
    body_text = content[front_matter_match.end():]

    try:
        front_matter = yaml.safe_load(front_matter_text)
    except yaml.YAMLError as e:
        print(f"Error parsing front matter in {jekyll_file_path}: {e}")
        return

    # safe_load returns None for an empty document and may return a scalar or
    # list; the key lookups below need a mapping.
    if not isinstance(front_matter, dict):
        print(f"Front matter in {jekyll_file_path} is empty or not a mapping; skipping")
        return

    # Create new front matter for prose.sh
    prose_front_matter = {}

    # --- Process frontmatter fields ---

    # Copy title
    if 'title' in front_matter:
        prose_front_matter['title'] = front_matter['title']

    # Convert date format: keep only the YYYY-MM-DD part
    if 'date' in front_matter:
        date_value = front_matter['date']
        if isinstance(date_value, str):
            date_match = re.match(r"'?(\d{4}-\d{2}-\d{2}).*'?", date_value)
            prose_front_matter['date'] = date_match.group(1) if date_match else date_value
        elif hasattr(date_value, 'strftime'):
            # date/datetime objects produced by the YAML parser
            prose_front_matter['date'] = date_value.strftime("%Y-%m-%d")
        else:
            prose_front_matter['date'] = date_value

    # Copy description and strip any markdown formatting
    if 'description' in front_matter:
        prose_front_matter['description'] = strip_markdown(front_matter['description'])

    # Flatten image path to just the filename
    if 'image' in front_matter and front_matter['image']:
        prose_front_matter['image'] = os.path.basename(front_matter['image'])

    # Combine tags and categories (in that order) into a single list
    tags = []
    for key in ('tags', 'categories'):
        value = front_matter.get(key)
        if not value:
            continue
        if isinstance(value, list):
            tags.extend(value)
        else:
            tags.append(value)
    if tags:
        prose_front_matter['tags'] = tags

    # Convert redirects to aliases
    if 'redirect_from' in front_matter and front_matter['redirect_from']:
        prose_front_matter['aliases'] = front_matter['redirect_from']

    # --- Format and generate frontmatter ---

    # Per-call dumper subclass so the list representer override does not leak
    # into yaml.SafeDumper itself
    class CustomDumper(yaml.SafeDumper):
        pass

    original_representer = yaml.representer.SafeRepresenter.represent_list
    tags_value = prose_front_matter.get('tags')

    def represent_list(dumper, data):
        # Lists equal to the tags list are emitted inline; all others keep
        # the default block style.
        if tags_value is not None and data == tags_value:
            return format_tags_flow_style(dumper, data)
        return original_representer(dumper, data)

    yaml.add_representer(list, represent_list, Dumper=CustomDumper)

    new_front_matter = yaml.dump(prose_front_matter, Dumper=CustomDumper, default_flow_style=False)

    # The date is dumped as a quoted string; drop the quotes. The pattern is
    # anchored to the whole "date:" line so it cannot match inside other values.
    if 'date' in prose_front_matter:
        new_front_matter = re.sub(
            r"^(date: )'(.*)'$", r"\1\2", new_front_matter, flags=re.MULTILINE
        )

    # Fallback: if the dump still holds a block-style tags list, collapse it
    # to the inline form: tags: [tag1, tag2, tag3]
    if prose_front_matter.get('tags'):
        tags_section = re.search(r'tags:(\s*-.*$\n)+', new_front_matter, re.MULTILINE)
        if tags_section:
            tag_matches = re.findall(r'- (.+)$', tags_section.group(0), re.MULTILINE)
            formatted_tags = f"tags: [{', '.join(tag_matches)}]\n"
            new_front_matter = new_front_matter.replace(tags_section.group(0), formatted_tags)

    # --- Process content ---

    # Drop leading blank lines; exactly one is re-added after the front matter
    body_text_trimmed = body_text.lstrip('\n')

    # Process typography (prose only; code, comments and rules are preserved)
    body_text_trimmed = _convert_typography(body_text_trimmed)

    # Add hero image if available. Done after the typography pass so hyphens
    # in the filename are never rewritten.
    image_filename = prose_front_matter.get('image')
    if image_filename:
        # Alt text: the post title when present, otherwise the filename;
        # square brackets are removed so they cannot break the image syntax.
        alt_text = str(prose_front_matter.get('title') or image_filename)
        alt_text = alt_text.replace('[', '').replace(']', '')
        hero_image_markdown = f"![{alt_text}]({image_filename})\n\n"
        body_text_trimmed = hero_image_markdown + body_text_trimmed

    # --- Create final content ---
    new_content = f'---\n{new_front_matter}---\n\n{body_text_trimmed}'

    # Strip date prefix from filename (YYYY-MM-DD-name.md -> name.md)
    original_filename = os.path.basename(jekyll_file_path)
    new_filename = re.sub(r'^\d{4}-\d{2}-\d{2}-', '', original_filename)

    # Write the converted file
    output_path = os.path.join(output_dir, new_filename)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(new_content)

    print(f"Converted {jekyll_file_path} -> {output_path}")
def main():
    """
    Convert every eligible markdown file in the current working directory.

    Ensures a 'prose.sh' output directory exists, then converts each '*.md'
    file except README.md and files whose name starts with an underscore, and
    prints how many files were handed to the converter.
    """
    cwd = os.getcwd()

    # Create output directory
    output_dir = os.path.join(cwd, 'prose.sh')
    os.makedirs(output_dir, exist_ok=True)

    # Process all markdown files in current directory
    processed = 0
    for candidate in Path(cwd).glob('*.md'):
        name = candidate.name
        if name == 'README.md' or name.startswith('_'):
            continue
        convert_jekyll_to_prose(candidate, output_dir)
        processed += 1

    print(f"Conversion complete. Processed {processed} files.")
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment