Skip to content

Instantly share code, notes, and snippets.

@shawnyeager
Created May 4, 2025 02:20
Show Gist options
  • Save shawnyeager/9c1f4054e95d71b1498de87165251f0c to your computer and use it in GitHub Desktop.
Save shawnyeager/9c1f4054e95d71b1498de87165251f0c to your computer and use it in GitHub Desktop.
Convert Jekyll-formatted markdown files to prose.sh format.
#!/usr/bin/env python3
"""
Convert Jekyll-formatted markdown files to prose.sh format.

This script transforms Jekyll blog posts to prose.sh format with the following enhancements:

1. Processes all .md files in the current directory
   (except README.md and files whose names start with '_')
2. Creates a 'prose.sh' subdirectory for output files
3. Transforms front matter:
   - Removes Jekyll-specific fields like layout
   - Preserves title, date, description (with markdown removed)
   - Flattens image paths to just filenames
   - Combines categories and tags into a single tags array
   - Converts redirect_from to aliases
4. Content transformations:
   - Embeds hero images at the top of posts when available
   - Converts typography: -- to en-dashes, --- to em-dashes
   - Preserves HTML comments while converting surrounding content
5. Strips date prefix from filenames (YYYY-MM-DD-name.md -> name.md)

Usage:
    python convert_jekyll_to_prose.py

Author: Shawn Yeager
"""
import os
import re
import yaml
import sys
from pathlib import Path
def strip_markdown(text):
    """
    Strip markdown formatting from text while preserving content.

    Args:
        text (str): The markdown-formatted text

    Returns:
        str: Plain text with markdown formatting removed and surrounding
        whitespace trimmed. Falsy input (``None`` or ``""``) is returned
        unchanged.
    """
    if not text:
        return text
    # Remove inline code backticks
    text = re.sub(r'`([^`]+)`', r'\1', text)
    # Remove bold/italic markers
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    # Underscore emphasis only counts at word boundaries, so identifiers
    # such as snake_case_name keep their underscores.
    text = re.sub(r'(?<!\w)__([^_]+)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_([^_]+)_(?!\w)', r'\1', text)
    # Remove images BEFORE links: the link pattern also matches the
    # "[alt](url)" tail of an image and would leave "!alt" behind.
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', '', text)
    # Remove links but keep the text
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    # Remove header markers; anchored to the start of a line so a '#'
    # in the middle of a sentence (e.g. "C# is great") is left alone.
    text = re.sub(r'^[ \t]{0,3}#{1,6}[ \t]+(.+)$', r'\1', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    return text.strip()
def mark_html_comments(text):
    """
    Find HTML comments and replace them with unique markers to protect them from conversion.

    Each comment is replaced at the exact position where it was matched, in a
    single pass over the text, so repeated identical comments each get their
    own marker.

    Args:
        text (str): Text that may contain HTML comments

    Returns:
        tuple: (text with comments replaced by markers, dictionary of markers to original comments)
    """
    markers = {}

    def _protect(match):
        # Markers are numbered in order of appearance.
        marker = f"__HTML_COMMENT_{len(markers)}__"
        markers[marker] = match.group(0)
        return marker

    # DOTALL lets a comment span several lines; the lazy quantifier stops
    # at the first closing "-->".
    protected = re.sub(r'<!--.*?-->', _protect, text, flags=re.DOTALL)
    return protected, markers
def restore_html_comments(text, markers):
    """
    Put the original HTML comments back in place of their placeholder markers.

    Args:
        text (str): Text in which comments were swapped for markers
        markers (dict): Maps each marker string to the comment it stands for

    Returns:
        str: The text with every marker occurrence replaced by its comment
    """
    restored = text
    for placeholder in markers:
        restored = restored.replace(placeholder, markers[placeholder])
    return restored
def format_tags_flow_style(dumper, data):
    """
    Represent a list as a flow-style YAML sequence, i.e. [tag1, tag2, tag3].

    Args:
        dumper: YAML dumper instance; its represent_sequence method is used
        data: The list of tags

    Returns:
        The sequence node produced by the dumper with flow style enabled
    """
    sequence_tag = 'tag:yaml.org,2002:seq'
    node = dumper.represent_sequence(sequence_tag, data, flow_style=True)
    return node
# Opening/closing line of a fenced code block (``` or ~~~).
_FENCE_RE = re.compile(r'^\s*(`{3,}|~{3,})')
# Lines made only of hyphens, pipes, colons and whitespace: horizontal rules,
# setext heading underlines and table delimiter rows. These are markdown
# structure, not prose, so their hyphens must not become dashes.
_STRUCTURAL_LINE_RE = re.compile(r'[\s|:\-]+')
# Either an inline code span (kept verbatim) or a run of 2-3 hyphens.
_CODE_SPAN_OR_DASHES_RE = re.compile(r'(`+).+?\1|-{2,3}')


def _as_list(value):
    """Return value unchanged if it is a list, otherwise wrap it in one."""
    return value if isinstance(value, list) else [value]


def _build_prose_front_matter(front_matter):
    """Map parsed Jekyll front matter (a dict) onto the prose.sh fields."""
    prose_front_matter = {}

    # Copy title
    if 'title' in front_matter:
        prose_front_matter['title'] = front_matter['title']

    # Keep only the YYYY-MM-DD part of the date
    if 'date' in front_matter:
        date_value = front_matter['date']
        if isinstance(date_value, str):
            date_match = re.match(r"'?(\d{4}-\d{2}-\d{2})", date_value)
            prose_front_matter['date'] = date_match.group(1) if date_match else date_value
        elif hasattr(date_value, 'strftime'):
            # YAML timestamps are parsed into date/datetime objects
            prose_front_matter['date'] = date_value.strftime("%Y-%m-%d")
        else:
            prose_front_matter['date'] = date_value

    # Copy description and strip any markdown formatting. YAML may type a
    # scalar as a number/bool; only strings can carry markdown.
    if 'description' in front_matter:
        description = front_matter['description']
        if isinstance(description, str):
            description = strip_markdown(description)
        prose_front_matter['description'] = description

    # Flatten image path to just the filename
    if front_matter.get('image'):
        prose_front_matter['image'] = os.path.basename(front_matter['image'])

    # Combine tags and categories (in that order) into a single list
    tags = []
    for key in ('tags', 'categories'):
        if front_matter.get(key):
            tags.extend(_as_list(front_matter[key]))
    if tags:
        prose_front_matter['tags'] = tags

    # Convert redirects to aliases
    if front_matter.get('redirect_from'):
        prose_front_matter['aliases'] = front_matter['redirect_from']

    return prose_front_matter


def _dump_front_matter(prose_front_matter):
    """Serialize the prose.sh front matter to YAML with flow-style tags and an unquoted date."""
    tags = prose_front_matter.get('tags')

    class CustomDumper(yaml.SafeDumper):
        pass

    def _represent_list(dumper, data):
        # Only the tags list is written as [tag1, tag2, tag3]; any other
        # list keeps the default block style.
        if tags is not None and data == tags:
            return format_tags_flow_style(dumper, data)
        return yaml.representer.SafeRepresenter.represent_list(dumper, data)

    yaml.add_representer(list, _represent_list, Dumper=CustomDumper)
    dumped = yaml.dump(prose_front_matter, Dumper=CustomDumper, default_flow_style=False)

    # The dumper quotes date-like strings; remove the quotes. The pattern is
    # anchored to a whole "date:" line so "date: '" inside another value
    # (e.g. a description containing "update: '...'") is never touched.
    if 'date' in prose_front_matter:
        dumped = re.sub(r"^(date: )'(.*)'$", r"\1\2", dumped, flags=re.MULTILINE)
    return dumped


def _replace_dashes(match):
    """Map '---' to an em-dash and '--' to an en-dash; leave code spans as they are."""
    token = match.group(0)
    if token == '---':
        return '—'
    if token == '--':
        return '–'
    return token


def _convert_typography(body):
    """Convert '---'/'--' to em/en-dashes in prose, skipping code and markdown structure."""
    converted = []
    open_fence = None  # fence character ('`' or '~') while inside a fenced block
    for line in body.split('\n'):
        fence_match = _FENCE_RE.match(line)
        if fence_match:
            fence_char = fence_match.group(1)[0]
            if open_fence is None:
                open_fence = fence_char
            elif open_fence == fence_char:
                open_fence = None
            converted.append(line)
        elif open_fence is not None or _STRUCTURAL_LINE_RE.fullmatch(line):
            converted.append(line)
        else:
            converted.append(_CODE_SPAN_OR_DASHES_RE.sub(_replace_dashes, line))
    return '\n'.join(converted)


def convert_jekyll_to_prose(jekyll_file_path, output_dir):
    """
    Convert a Jekyll markdown file to prose.sh format.

    Reads the file, rewrites its front matter and body, and writes the result
    into output_dir under the original filename with any leading
    YYYY-MM-DD- prefix removed. Files without parseable front matter are
    reported on stdout and skipped.

    Args:
        jekyll_file_path (str): Path to the Jekyll markdown file
        output_dir (str): Directory to write the converted file

    Returns:
        None
    """
    # utf-8-sig reads plain UTF-8 too, but drops a leading BOM that would
    # otherwise stop the front matter pattern from matching at position 0.
    with open(jekyll_file_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()

    # Extract front matter (tolerating trailing blanks on the delimiters and
    # a closing delimiter that ends the file)
    front_matter_match = re.match(r'^---[ \t]*\n(.*?)\n---[ \t]*(?:\n|$)', content, re.DOTALL)
    if not front_matter_match:
        print(f"No front matter found in {jekyll_file_path}")
        return
    front_matter_text = front_matter_match.group(1)
    body_text = content[front_matter_match.end():]

    try:
        front_matter = yaml.safe_load(front_matter_text)
    except yaml.YAMLError as e:
        print(f"Error parsing front matter in {jekyll_file_path}: {e}")
        return
    # Blank or scalar front matter parses to None/str/etc.; the field
    # lookups below need a mapping.
    if not isinstance(front_matter, dict):
        print(f"Front matter in {jekyll_file_path} is not a mapping; skipping")
        return

    # --- Front matter ---
    prose_front_matter = _build_prose_front_matter(front_matter)
    new_front_matter = _dump_front_matter(prose_front_matter)

    # --- Content ---
    # Leading blank lines are dropped so exactly one blank line separates
    # the front matter from the body in the final output.
    body = body_text.lstrip('\n')
    # HTML comments are hidden behind markers while dashes are converted,
    # then restored untouched.
    body, html_comment_markers = mark_html_comments(body)
    body = _convert_typography(body)
    body = restore_html_comments(body, html_comment_markers)

    # Add hero image at the top of the content if available
    if prose_front_matter.get('image'):
        image_filename = prose_front_matter['image']
        hero_image_markdown = f"![{prose_front_matter.get('title', 'Hero image')}]({image_filename})\n\n"
        body = hero_image_markdown + body

    # --- Create final content ---
    new_content = f'---\n{new_front_matter}---\n\n{body}'

    # Strip date prefix from filename (YYYY-MM-DD-name.md -> name.md)
    original_filename = os.path.basename(jekyll_file_path)
    new_filename = re.sub(r'^\d{4}-\d{2}-\d{2}-', '', original_filename)

    # Write the converted file
    output_path = os.path.join(output_dir, new_filename)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(new_content)
    print(f"Converted {jekyll_file_path} -> {output_path}")
def main():
    """
    Convert every eligible markdown file in the current working directory.

    Ensures a 'prose.sh' output directory exists, converts each top-level
    *.md file except README.md and files whose names start with '_', and
    prints how many files were handed to the converter.
    """
    working_dir = os.getcwd()

    # Create output directory
    output_dir = os.path.join(working_dir, 'prose.sh')
    os.makedirs(output_dir, exist_ok=True)

    # Process all markdown files in current directory
    count = 0
    for md_file in Path(working_dir).glob('*.md'):
        if md_file.name == 'README.md' or md_file.name.startswith('_'):
            continue
        convert_jekyll_to_prose(md_file, output_dir)
        count += 1

    print(f"Conversion complete. Processed {count} files.")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment