Skip to content

Instantly share code, notes, and snippets.

@ehzawad
Created January 31, 2025 15:03
Show Gist options
  • Save ehzawad/da57ff92612211a5609642bc93d5939b to your computer and use it in GitHub Desktop.
Save ehzawad/da57ff92612211a5609642bc93d5939b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import re
import os
import sys
from typing import Tuple, Dict, Any
def extract_title_from_front_matter(content: str) -> str:
    """
    Extract the value of the 'title: ...' line from leading front matter.

    Front matter is only recognised when it opens the document: an optional
    BOM and blank lines, a line consisting of ``---``, the metadata lines,
    and a closing line consisting of ``---``. A pair of ``---`` horizontal
    rules further down the document is not treated as front matter.

    Args:
        content: Full text of a Markdown document.

    Returns:
        The title text with surrounding whitespace stripped, or an empty
        string when there is no front matter or no 'title:' line inside it.
    """
    # \A pins the opening delimiter to the start of the text (re.MULTILINE
    # would otherwise let '^' match at any line start). The closing '---'
    # must occupy a line of its own so dashes inside a value are ignored.
    fm_regex = re.compile(
        r'\A\ufeff?\s*---[ \t]*\r?\n(.*?)^---[ \t]*\r?$',
        flags=re.DOTALL | re.MULTILINE,
    )
    front_matter_match = fm_regex.match(content)
    if not front_matter_match:
        return ""

    # Find a line inside the front matter that begins with 'title:'.
    title_match = re.search(
        r'^title:\s*(.+)$',
        front_matter_match.group(1),
        flags=re.MULTILINE,
    )
    return title_match.group(1).strip() if title_match else ""
def clean_markdown_keep_code_and_title(content: str) -> Tuple[str, int]:
    """
    Reduce a Markdown document to its front-matter title plus raw code.

    Keeps only:
    1) The title line from front matter (followed by a blank line).
    2) The raw contents of fenced code blocks (without the backtick lines),
       separated from each other by a blank line.
    Everything else is dropped.

    Args:
        content: Full text of a Markdown document.

    Returns:
        (cleaned_content, chars_removed), where chars_removed is
        len(content) minus len(cleaned_content).
    """
    original_length = len(content)

    # 1. Read the title before the front matter is stripped.
    title = extract_title_from_front_matter(content)

    # 2. Strip only a front matter block that opens the document. Anchoring
    #    with \A and count=1 keeps '---' horizontal rules in the body from
    #    being treated as delimiters, which would delete the text (and any
    #    code blocks) between them.
    content = re.sub(
        r'\A\ufeff?\s*---[ \t]*\r?\n.*?^---[ \t]*\r?$\s*',
        '',
        content,
        count=1,
        flags=re.DOTALL | re.MULTILINE,
    )

    # 3. Extract code block contents. The closing fence must directly follow
    #    a newline; the lookbehind lets that newline be the one ending the
    #    opening fence line, so an empty block closes correctly instead of
    #    running on to the next fence in the document.
    code_block_regex = re.compile(
        r'```[^\n]*\n(.*?)(?<=\n)```',
        flags=re.DOTALL,
    )
    code_blocks = [
        # Drop the newline that sits in front of the closing fence.
        block[:-1] if block.endswith("\n") else block
        for block in code_block_regex.findall(content)
    ]

    # 4. Title first (if any), then the code blocks joined by blank lines.
    header = title.strip() + "\n\n" if title else ""
    cleaned_content = header + "\n\n".join(code_blocks)

    chars_removed = original_length - len(cleaned_content)
    return cleaned_content, chars_removed
def process_file(file_path: str) -> Tuple[str, Dict[str, Any]]:
    """
    Load one Markdown file and strip it down to its title and code blocks.

    Returns a pair (cleaned_content, stats). ``stats`` holds the keys
    'original_size', 'cleaned_size' and 'chars_removed'. If anything raises
    while reading or cleaning, the error is printed and ("", {}) is
    returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        cleaned_text, removed = clean_markdown_keep_code_and_title(raw_text)
        stats = dict(
            original_size=len(raw_text),
            cleaned_size=len(cleaned_text),
            chars_removed=removed,
        )
    except Exception as e:
        # Best-effort: report the failure and hand back empty results so the
        # caller can keep going with the remaining files.
        print(f"Error processing file {file_path}: {str(e)}")
        return "", {}

    return cleaned_text, stats
def main():
    """
    Combine the cleaned contents of all Markdown files into one output file.

    Walks the current directory recursively, runs process_file() on every
    file whose name ends in '.md' (case-insensitive) in sorted path order,
    writes each cleaned result followed by a blank line to the output file,
    and prints combined size statistics.

    The output file name is taken from the first command-line argument and
    defaults to 'react.txt'. The file is overwritten, not appended to.
    """
    output_file = sys.argv[1] if len(sys.argv) > 1 else "react.txt"

    # The output file is truncated as soon as it is opened below, so it must
    # never be picked up as an input (possible when its name ends in '.md'
    # and it already exists under the current directory).
    output_key = os.path.normcase(os.path.abspath(output_file))

    # Collect all markdown files (recursively).
    md_files = []
    for root, _dirs, files in os.walk("."):
        for filename in files:
            if not filename.lower().endswith(".md"):
                continue
            path = os.path.join(root, filename)
            if os.path.normcase(os.path.abspath(path)) == output_key:
                continue
            md_files.append(path)

    # Sort for a consistent order across runs.
    md_files.sort()

    # Running totals across all files.
    total_original_size = 0
    total_cleaned_size = 0
    total_chars_removed = 0
    file_count = 0

    # Overwrite (not append) the output file.
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for md_file in md_files:
            cleaned_content, stats = process_file(md_file)

            # stats is empty when process_file failed, hence the defaults.
            total_original_size += stats.get('original_size', 0)
            total_cleaned_size += stats.get('cleaned_size', 0)
            total_chars_removed += stats.get('chars_removed', 0)
            file_count += 1

            out_f.write(cleaned_content)
            out_f.write("\n\n")  # blank line after each file's content

    # Compute the overall reduction percentage (guarding against no input).
    reduction_percentage = 0.0
    if total_original_size > 0:
        reduction_percentage = (total_chars_removed / total_original_size) * 100

    # Print summary statistics for *all* files combined.
    print("===== Combined Statistics =====")
    print(f"Files processed: {file_count}")
    print(f"Total original size: {total_original_size} characters")
    print(f"Total cleaned size: {total_cleaned_size} characters")
    print(f"Total characters removed: {total_chars_removed}")
    print(f"Overall size reduction: {reduction_percentage:.2f}%")
    print(f"Cleaned output written to: {output_file}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment