shawnyeager · May 4, 2025 02:20
diff --git a/convert_jekyll_to_prose.py b/convert_jekyll_to_prose.py
 #!/usr/bin/env python3
 """
 Convert Jekyll-formatted markdown files to prose.sh format.

 This script transforms Jekyll blog posts to prose.sh format with the following enhancements:
 1. Processes all .md files in the current directory
 2. Creates a 'prose.sh' subdirectory for output files
 3. Transforms front matter:
   - Removes Jekyll-specific fields like layout
   - Preserves title, date, description (with markdown removed)
   - Flattens image paths to just filenames
   - Combines categories and tags into a single tags array
   - Converts redirect_from to aliases
 4. Content transformations:
   - Embeds hero images at the top of posts when available
   - Converts typography: -- to en-dashes, --- to em-dashes
   - Preserves HTML comments while converting surrounding content
 5. Strips date prefix from filenames (YYYY-MM-DD-name.md -> name.md)

 Usage:
  python convert_jekyll_to_prose.py

 Author: Shawn Yeager
 """

 import os
 import re
 import yaml
 import sys
 from pathlib import Path


 def strip_markdown(text):
     """
     Strip markdown formatting from text while preserving content.

     Args:
         text (str): The markdown-formatted text

     Returns:
         str: Plain text with markdown formatting removed. Falsy values and
         non-string values are returned unchanged.
     """
     if not text or not isinstance(text, str):
         return text

     # Remove inline code backticks
     text = re.sub(r'`([^`]+)`', r'\1', text)

     # Remove bold/italic markers
     text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
     text = re.sub(r'\*([^*]+)\*', r'\1', text)
     # Underscores only mark emphasis at word boundaries; leaving intraword
     # underscores alone keeps identifiers such as snake_case_name intact.
     text = re.sub(r'(?<!\w)__([^_]+)__(?!\w)', r'\1', text)
     text = re.sub(r'(?<!\w)_([^_]+)_(?!\w)', r'\1', text)

     # Remove images. This must run before the link rule: an image is a link
     # preceded by "!", so the link rule would otherwise leave "!alt" behind.
     text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', '', text)

     # Remove links but keep the text
     text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

     # Remove header markers; anchored to the line start so that a "#" in the
     # middle of a sentence (e.g. "C# is great") is not treated as a header.
     text = re.sub(r'^#{1,6}[ \t]+(.+)', r'\1', text, flags=re.MULTILINE)

     # Remove HTML tags
     text = re.sub(r'<[^>]+>', '', text)

     return text.strip()


 def mark_html_comments(text):
     """
     Find HTML comments and replace them with unique markers to protect them from conversion.

     Every comment occurrence gets its own marker, numbered in order of
     appearance, and the substitution is done in a single pass over the text.

     Args:
         text (str): Text that may contain HTML comments

     Returns:
         tuple: (text with comments replaced by markers, dictionary of markers to original comments)
     """
     markers = {}

     def stash_comment(match):
         # Number markers by how many comments have been stored so far.
         marker = f"__HTML_COMMENT_{len(markers)}__"
         markers[marker] = match.group(0)
         return marker

     # DOTALL lets a comment span several lines.
     marked_text = re.sub(r'<!--.*?-->', stash_comment, text, flags=re.DOTALL)

     return marked_text, markers


 def restore_html_comments(text, markers):
     """
     Put the original HTML comments back in place of their markers.

     Args:
         text (str): Text in which comments were replaced by markers
         markers (dict): Maps each marker string to the comment it stands for

     Returns:
         str: The text with every marker swapped for its original comment
     """
     restored = text
     # Substitute markers one at a time, in the dictionary's own order.
     for placeholder in markers:
         restored = restored.replace(placeholder, markers[placeholder])
     return restored


 def format_tags_flow_style(dumper, data):
     """
     Represent a list of tags as a flow-style YAML sequence: [tag1, tag2, tag3].

     Args:
         dumper: YAML dumper instance that builds the node
         data: The list of tags

     Returns:
         The sequence node produced by ``dumper.represent_sequence`` with
         flow style enabled
     """
     sequence_tag = 'tag:yaml.org,2002:seq'
     node = dumper.represent_sequence(sequence_tag, data, flow_style=True)
     return node


 def _merge_tags(front_matter):
     """Return the post's tags followed by its categories, without duplicates."""
     merged = []
     for key in ('tags', 'categories'):
         value = front_matter.get(key)
         if not value:
             continue
         # A scalar value (e.g. ``categories: essays``) counts as a single tag.
         for item in (value if isinstance(value, list) else [value]):
             if item not in merged:
                 merged.append(item)
     return merged


 def _convert_dashes(text):
     """Turn ``---``/``--`` into em/en dashes, leaving code and rule lines alone."""
     converted = []
     fence = None  # Opening marker of the fenced code block we are inside, if any
     for line in text.split('\n'):
         stripped = line.lstrip()
         if fence is not None:
             # Inside a fenced code block: copy verbatim until the closing fence.
             if stripped.startswith(fence):
                 fence = None
             converted.append(line)
             continue
         if stripped.startswith(('```', '~~~')):
             fence = stripped[:3]
             converted.append(line)
             continue
         # Lines built only from hyphens, pipes, colons and whitespace are
         # horizontal rules, setext underlines or table separator rows.
         if re.fullmatch(r'[\s|:-]*', line):
             converted.append(line)
             continue
         # Splitting on a capturing group puts inline code spans at odd indexes,
         # so only the even-indexed (non-code) pieces are converted.
         pieces = re.split(r'(`[^`]*`)', line)
         for index in range(0, len(pieces), 2):
             pieces[index] = pieces[index].replace('---', '—').replace('--', '–')
         converted.append(''.join(pieces))
     return '\n'.join(converted)


 def _dump_front_matter(prose_front_matter):
     """Serialize the front matter mapping to YAML, writing tags in flow style."""
     class CustomDumper(yaml.SafeDumper):
         pass

     tags = prose_front_matter.get('tags')

     def represent_list(dumper, data):
         # Only the tags list is written as [a, b, c]; other lists (such as
         # aliases) keep the dumper's regular representation.
         if tags is not None and data is tags:
             return format_tags_flow_style(dumper, data)
         return yaml.representer.SafeRepresenter.represent_list(dumper, data)

     CustomDumper.add_representer(list, represent_list)

     # allow_unicode keeps non-ASCII characters readable instead of escaping them.
     dumped = yaml.dump(prose_front_matter, Dumper=CustomDumper,
                        default_flow_style=False, allow_unicode=True)

     # The date is stored as a string, which the dumper single-quotes; strip the
     # quotes from the top-level "date:" line only.
     return re.sub(r"^(date: )'(.*)'$", r"\1\2", dumped, flags=re.MULTILINE)


 def convert_jekyll_to_prose(jekyll_file_path, output_dir):
     """
     Convert a Jekyll markdown file to prose.sh format.

     The file is skipped (with a message printed) when it has no front matter,
     when the front matter is not valid YAML, or when it is not a mapping.
     Otherwise the converted post is written to ``output_dir`` under the
     original filename with any leading ``YYYY-MM-DD-`` prefix removed.

     Args:
         jekyll_file_path (str): Path to the Jekyll markdown file
         output_dir (str): Directory to write the converted file
     """
     with open(jekyll_file_path, 'r', encoding='utf-8') as file:
         content = file.read()

     # Extract front matter; the closing delimiter may also end the file.
     front_matter_match = re.match(r'^---\n(.*?)\n---(?:\n|$)', content, re.DOTALL)
     if not front_matter_match:
         print(f"No front matter found in {jekyll_file_path}")
         return

     body_text = content[front_matter_match.end():]

     try:
         front_matter = yaml.safe_load(front_matter_match.group(1))
     except yaml.YAMLError as e:
         print(f"Error parsing front matter in {jekyll_file_path}: {e}")
         return

     # An empty front matter block loads as None; treat it as an empty mapping.
     if front_matter is None:
         front_matter = {}
     if not isinstance(front_matter, dict):
         print(f"Front matter in {jekyll_file_path} is not a mapping; skipping")
         return

     # --- Process frontmatter fields ---
     prose_front_matter = {}

     # Copy title
     if 'title' in front_matter:
         prose_front_matter['title'] = front_matter['title']

     # Convert date format: keep only the YYYY-MM-DD part
     if 'date' in front_matter:
         date_value = front_matter['date']
         if isinstance(date_value, str):
             date_match = re.match(r"'?(\d{4}-\d{2}-\d{2})", date_value)
             prose_front_matter['date'] = date_match.group(1) if date_match else date_value
         elif hasattr(date_value, 'strftime'):
             # The YAML loader turns unquoted dates into date/datetime objects.
             prose_front_matter['date'] = date_value.strftime("%Y-%m-%d")
         else:
             prose_front_matter['date'] = date_value

     # Copy description and strip any markdown formatting
     if 'description' in front_matter:
         prose_front_matter['description'] = strip_markdown(front_matter['description'])

     # Flatten image path to just the filename
     image_path = front_matter.get('image')
     if isinstance(image_path, dict):
         # NOTE(review): assumes a mapping-style image keeps its location under
         # "path" (as some Jekyll plugins do) -- confirm against the posts.
         image_path = image_path.get('path')
     if isinstance(image_path, str) and image_path:
         prose_front_matter['image'] = os.path.basename(image_path)

     # Combine categories and tags into a single tags list
     tags = _merge_tags(front_matter)
     if tags:
         prose_front_matter['tags'] = tags

     # Convert redirects to aliases
     if front_matter.get('redirect_from'):
         prose_front_matter['aliases'] = front_matter['redirect_from']

     # --- Format and generate frontmatter ---
     new_front_matter = _dump_front_matter(prose_front_matter)

     # --- Process content ---

     # Drop leading blank lines so exactly one blank line follows the front matter
     body_text_trimmed = body_text.lstrip('\n')

     # Protect HTML comments, convert dashes in prose, then restore the comments
     body_text_trimmed, html_comment_markers = mark_html_comments(body_text_trimmed)
     body_text_trimmed = _convert_dashes(body_text_trimmed)
     body_text_trimmed = restore_html_comments(body_text_trimmed, html_comment_markers)

     # Add the image as a hero image at the top of the content
     if prose_front_matter.get('image'):
         image_filename = prose_front_matter['image']
         hero_image_markdown = f"![{prose_front_matter.get('title', 'Hero image')}]({image_filename})\n\n"
         body_text_trimmed = hero_image_markdown + body_text_trimmed

     # --- Create final content ---
     new_content = f'---\n{new_front_matter}---\n\n{body_text_trimmed}'

     # Strip date prefix from filename (YYYY-MM-DD-name.md -> name.md)
     original_filename = os.path.basename(jekyll_file_path)
     new_filename = re.sub(r'^\d{4}-\d{2}-\d{2}-', '', original_filename)

     # Write the converted file
     output_path = os.path.join(output_dir, new_filename)
     with open(output_path, 'w', encoding='utf-8') as file:
         file.write(new_content)

     print(f"Converted {jekyll_file_path} -> {output_path}")


 def main():
     """
     Convert every eligible markdown file in the current working directory.

     Ensures a 'prose.sh' subdirectory exists, runs the converter on each
     '*.md' file except 'README.md' and files whose name starts with '_',
     and prints how many files were handed to the converter.
     """
     working_dir = os.getcwd()

     # Converted posts go into ./prose.sh; it is created on first use
     output_dir = os.path.join(working_dir, 'prose.sh')
     os.makedirs(output_dir, exist_ok=True)

     count = 0
     for md_file in Path(working_dir).glob('*.md'):
         filename = md_file.name
         # Skip the README and underscore-prefixed files
         if filename == 'README.md' or filename.startswith('_'):
             continue
         convert_jekyll_to_prose(md_file, output_dir)
         count += 1

     print(f"Conversion complete. Processed {count} files.")


 # Run the conversion only when this file is executed as a script, not when
 # it is imported as a module.
 if __name__ == "__main__":
     main()
	#!/usr/bin/env python3
	"""
	Convert Jekyll-formatted markdown files to prose.sh format.

	This script transforms Jekyll blog posts to prose.sh format with the following enhancements:
	1. Processes all .md files in the current directory
	2. Creates a 'prose.sh' subdirectory for output files
	3. Transforms front matter:
	- Removes Jekyll-specific fields like layout
	- Preserves title, date, description (with markdown removed)
	- Flattens image paths to just filenames
	- Combines categories and tags into a single tags array
	- Converts redirect_from to aliases
	4. Content transformations:
	- Embeds hero images at the top of posts when available
	- Converts typography: -- to en-dashes, --- to em-dashes
	- Preserves HTML comments while converting surrounding content
	5. Strips date prefix from filenames (YYYY-MM-DD-name.md -> name.md)

	Usage:
	python convert_jekyll_to_prose.py

	Author: Shawn Yeager
	"""

	import os
	import re
	import yaml
	import sys
	from pathlib import Path


	def strip_markdown(text):
	    """
	    Strip markdown formatting from text while preserving content.

	    Args:
	        text (str): The markdown-formatted text

	    Returns:
	        str: Plain text with markdown formatting removed. Falsy values and
	        non-string values are returned unchanged.
	    """
	    if not text or not isinstance(text, str):
	        return text

	    # Remove inline code backticks
	    text = re.sub(r'`([^`]+)`', r'\1', text)

	    # Remove bold/italic markers
	    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
	    text = re.sub(r'\*([^*]+)\*', r'\1', text)
	    # Underscores only mark emphasis at word boundaries; leaving intraword
	    # underscores alone keeps identifiers such as snake_case_name intact.
	    text = re.sub(r'(?<!\w)__([^_]+)__(?!\w)', r'\1', text)
	    text = re.sub(r'(?<!\w)_([^_]+)_(?!\w)', r'\1', text)

	    # Remove images. This must run before the link rule: an image is a link
	    # preceded by "!", so the link rule would otherwise leave "!alt" behind.
	    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', '', text)

	    # Remove links but keep the text
	    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

	    # Remove header markers; anchored to the line start so that a "#" in the
	    # middle of a sentence (e.g. "C# is great") is not treated as a header.
	    text = re.sub(r'^#{1,6}[ \t]+(.+)', r'\1', text, flags=re.MULTILINE)

	    # Remove HTML tags
	    text = re.sub(r'<[^>]+>', '', text)

	    return text.strip()


	def mark_html_comments(text):
	    """
	    Find HTML comments and replace them with unique markers to protect them from conversion.

	    Every comment occurrence gets its own marker, numbered in order of
	    appearance, and the substitution is done in a single pass over the text.

	    Args:
	        text (str): Text that may contain HTML comments

	    Returns:
	        tuple: (text with comments replaced by markers, dictionary of markers to original comments)
	    """
	    markers = {}

	    def stash_comment(match):
	        # Number markers by how many comments have been stored so far.
	        marker = f"__HTML_COMMENT_{len(markers)}__"
	        markers[marker] = match.group(0)
	        return marker

	    # DOTALL lets a comment span several lines.
	    marked_text = re.sub(r'<!--.*?-->', stash_comment, text, flags=re.DOTALL)

	    return marked_text, markers


	def restore_html_comments(text, markers):
	    """
	    Restore HTML comments from their markers.

	    Args:
	        text (str): Text with comment markers
	        markers (dict): Dictionary mapping markers to original comments

	    Returns:
	        str: Text with original comments restored
	    """
	    # Substitute markers one at a time, in the dictionary's own order.
	    for marker, comment in markers.items():
	        text = text.replace(marker, comment)
	    return text


	def format_tags_flow_style(dumper, data):
	    """
	    Format tags as a flow-style YAML sequence [tag1, tag2, tag3].

	    Args:
	        dumper: YAML dumper instance
	        data: The list of tags

	    Returns:
	        The sequence node produced by ``dumper.represent_sequence`` with
	        flow style enabled
	    """
	    return dumper.represent_sequence('tag:yaml.org,2002:seq', data, flow_style=True)


	def _merge_tags(front_matter):
	    """Return the post's tags followed by its categories, without duplicates."""
	    merged = []
	    for key in ('tags', 'categories'):
	        value = front_matter.get(key)
	        if not value:
	            continue
	        # A scalar value (e.g. ``categories: essays``) counts as a single tag.
	        for item in (value if isinstance(value, list) else [value]):
	            if item not in merged:
	                merged.append(item)
	    return merged


	def _convert_dashes(text):
	    """Turn ``---``/``--`` into em/en dashes, leaving code and rule lines alone."""
	    converted = []
	    fence = None  # Opening marker of the fenced code block we are inside, if any
	    for line in text.split('\n'):
	        stripped = line.lstrip()
	        if fence is not None:
	            # Inside a fenced code block: copy verbatim until the closing fence.
	            if stripped.startswith(fence):
	                fence = None
	            converted.append(line)
	            continue
	        if stripped.startswith(('```', '~~~')):
	            fence = stripped[:3]
	            converted.append(line)
	            continue
	        # Lines built only from hyphens, pipes, colons and whitespace are
	        # horizontal rules, setext underlines or table separator rows.
	        if re.fullmatch(r'[\s|:-]*', line):
	            converted.append(line)
	            continue
	        # Splitting on a capturing group puts inline code spans at odd indexes,
	        # so only the even-indexed (non-code) pieces are converted.
	        pieces = re.split(r'(`[^`]*`)', line)
	        for index in range(0, len(pieces), 2):
	            pieces[index] = pieces[index].replace('---', '—').replace('--', '–')
	        converted.append(''.join(pieces))
	    return '\n'.join(converted)


	def _dump_front_matter(prose_front_matter):
	    """Serialize the front matter mapping to YAML, writing tags in flow style."""
	    class CustomDumper(yaml.SafeDumper):
	        pass

	    tags = prose_front_matter.get('tags')

	    def represent_list(dumper, data):
	        # Only the tags list is written as [a, b, c]; other lists (such as
	        # aliases) keep the dumper's regular representation.
	        if tags is not None and data is tags:
	            return format_tags_flow_style(dumper, data)
	        return yaml.representer.SafeRepresenter.represent_list(dumper, data)

	    CustomDumper.add_representer(list, represent_list)

	    # allow_unicode keeps non-ASCII characters readable instead of escaping them.
	    dumped = yaml.dump(prose_front_matter, Dumper=CustomDumper,
	                       default_flow_style=False, allow_unicode=True)

	    # The date is stored as a string, which the dumper single-quotes; strip the
	    # quotes from the top-level "date:" line only.
	    return re.sub(r"^(date: )'(.*)'$", r"\1\2", dumped, flags=re.MULTILINE)


	def convert_jekyll_to_prose(jekyll_file_path, output_dir):
	    """
	    Convert a Jekyll markdown file to prose.sh format.

	    The file is skipped (with a message printed) when it has no front matter,
	    when the front matter is not valid YAML, or when it is not a mapping.
	    Otherwise the converted post is written to ``output_dir`` under the
	    original filename with any leading ``YYYY-MM-DD-`` prefix removed.

	    Args:
	        jekyll_file_path (str): Path to the Jekyll markdown file
	        output_dir (str): Directory to write the converted file
	    """
	    with open(jekyll_file_path, 'r', encoding='utf-8') as file:
	        content = file.read()

	    # Extract front matter; the closing delimiter may also end the file.
	    front_matter_match = re.match(r'^---\n(.*?)\n---(?:\n|$)', content, re.DOTALL)
	    if not front_matter_match:
	        print(f"No front matter found in {jekyll_file_path}")
	        return

	    body_text = content[front_matter_match.end():]

	    try:
	        front_matter = yaml.safe_load(front_matter_match.group(1))
	    except yaml.YAMLError as e:
	        print(f"Error parsing front matter in {jekyll_file_path}: {e}")
	        return

	    # An empty front matter block loads as None; treat it as an empty mapping.
	    if front_matter is None:
	        front_matter = {}
	    if not isinstance(front_matter, dict):
	        print(f"Front matter in {jekyll_file_path} is not a mapping; skipping")
	        return

	    # --- Process frontmatter fields ---
	    prose_front_matter = {}

	    # Copy title
	    if 'title' in front_matter:
	        prose_front_matter['title'] = front_matter['title']

	    # Convert date format: keep only the YYYY-MM-DD part
	    if 'date' in front_matter:
	        date_value = front_matter['date']
	        if isinstance(date_value, str):
	            date_match = re.match(r"'?(\d{4}-\d{2}-\d{2})", date_value)
	            prose_front_matter['date'] = date_match.group(1) if date_match else date_value
	        elif hasattr(date_value, 'strftime'):
	            # The YAML loader turns unquoted dates into date/datetime objects.
	            prose_front_matter['date'] = date_value.strftime("%Y-%m-%d")
	        else:
	            prose_front_matter['date'] = date_value

	    # Copy description and strip any markdown formatting
	    if 'description' in front_matter:
	        prose_front_matter['description'] = strip_markdown(front_matter['description'])

	    # Flatten image path to just the filename
	    image_path = front_matter.get('image')
	    if isinstance(image_path, dict):
	        # NOTE(review): assumes a mapping-style image keeps its location under
	        # "path" (as some Jekyll plugins do) -- confirm against the posts.
	        image_path = image_path.get('path')
	    if isinstance(image_path, str) and image_path:
	        prose_front_matter['image'] = os.path.basename(image_path)

	    # Combine categories and tags into a single tags list
	    tags = _merge_tags(front_matter)
	    if tags:
	        prose_front_matter['tags'] = tags

	    # Convert redirects to aliases
	    if front_matter.get('redirect_from'):
	        prose_front_matter['aliases'] = front_matter['redirect_from']

	    # --- Format and generate frontmatter ---
	    new_front_matter = _dump_front_matter(prose_front_matter)

	    # --- Process content ---

	    # Drop leading blank lines so exactly one blank line follows the front matter
	    body_text_trimmed = body_text.lstrip('\n')

	    # Protect HTML comments, convert dashes in prose, then restore the comments
	    body_text_trimmed, html_comment_markers = mark_html_comments(body_text_trimmed)
	    body_text_trimmed = _convert_dashes(body_text_trimmed)
	    body_text_trimmed = restore_html_comments(body_text_trimmed, html_comment_markers)

	    # Add the image as a hero image at the top of the content
	    if prose_front_matter.get('image'):
	        image_filename = prose_front_matter['image']
	        hero_image_markdown = f"![{prose_front_matter.get('title', 'Hero image')}]({image_filename})\n\n"
	        body_text_trimmed = hero_image_markdown + body_text_trimmed

	    # --- Create final content ---
	    new_content = f'---\n{new_front_matter}---\n\n{body_text_trimmed}'

	    # Strip date prefix from filename (YYYY-MM-DD-name.md -> name.md)
	    original_filename = os.path.basename(jekyll_file_path)
	    new_filename = re.sub(r'^\d{4}-\d{2}-\d{2}-', '', original_filename)

	    # Write the converted file
	    output_path = os.path.join(output_dir, new_filename)
	    with open(output_path, 'w', encoding='utf-8') as file:
	        file.write(new_content)

	    print(f"Converted {jekyll_file_path} -> {output_path}")


	def main():
	    """
	    Main function that processes all markdown files in the current directory.
	    Creates the output directory and handles the conversion process.

	    'README.md' and files whose name starts with '_' are skipped.
	    """
	    # Create output directory
	    output_dir = os.path.join(os.getcwd(), 'prose.sh')
	    os.makedirs(output_dir, exist_ok=True)

	    # Process all markdown files in current directory
	    count = 0
	    for md_file in Path(os.getcwd()).glob('*.md'):
	        if md_file.name != 'README.md' and not md_file.name.startswith('_'):
	            convert_jekyll_to_prose(md_file, output_dir)
	            count += 1

	    print(f"Conversion complete. Processed {count} files.")


	# Run the conversion only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    main()