Created
January 31, 2025 15:03
-
-
Save ehzawad/da57ff92612211a5609642bc93d5939b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import os
import re
import sys
from typing import Tuple, Dict, Any
def extract_title_from_front_matter(content: str) -> str:
    """
    Extract the 'title: ...' value from the front matter, if present.

    Front matter is only recognised when it is the very first thing in
    *content* (an optional UTF-8 BOM is tolerated): an opening line holding
    just ``---``, followed later by a closing line holding just ``---``.
    A ``---`` horizontal rule further down the document is NOT front matter.

    Args:
        content: Full text of a Markdown document.

    Returns:
        The text after ``title:`` on its line, stripped of surrounding
        whitespace, or an empty string when there is no front matter or no
        title line (or the title line is empty).
    """
    # \A pins the block to the start of the text; the closing delimiter must
    # sit on its own line so a '---' inside a value cannot end the block early.
    fm_regex = re.compile(
        r'\A\ufeff?---[ \t]*\r?\n(.*?)^---[ \t]*\r?$',
        flags=re.DOTALL | re.MULTILINE,
    )
    front_matter_match = fm_regex.match(content)
    if not front_matter_match:
        return ""

    front_matter = front_matter_match.group(1)

    # Only spaces/tabs may separate 'title:' from its value. Using \s here
    # would cross the newline and return the *next* line for an empty title.
    title_match = re.search(
        r'^title:[ \t]*(.*)$', front_matter, flags=re.MULTILINE
    )
    if title_match:
        return title_match.group(1).strip()
    return ""
def clean_markdown_keep_code_and_title(content: str) -> Tuple[str, int]:
    """
    Reduce a Markdown document to its front-matter title and code blocks.

    Keeps only:
      1) The title value from the front matter (if any), followed by a
         blank line.
      2) The raw contents of fenced code blocks (without the fence lines),
         separated from each other by a blank line.
    Everything else is discarded.

    Args:
        content: Full text of a Markdown document.

    Returns:
        A tuple ``(cleaned_content, chars_removed)`` where ``chars_removed``
        is ``len(content) - len(cleaned_content)``.
    """
    original_length = len(content)

    # 1. Extract the title before the front matter is stripped.
    title = extract_title_from_front_matter(content)

    # 2. Remove the leading front matter block only. The pattern is anchored
    #    to the start of the text and limited to one substitution so that
    #    content lying between two '---' horizontal rules later in the
    #    document (which may contain code blocks) is not thrown away.
    content = re.sub(
        r'\A\ufeff?---[ \t]*\r?\n.*?^---[ \t]*\r?$\n?',
        '',
        content,
        count=1,
        flags=re.DOTALL | re.MULTILINE,
    )

    # 3. Extract code blocks. Both fences must start a line (indentation is
    #    allowed). The closing fence is matched with '^' rather than a
    #    mandatory preceding newline so an empty block (two consecutive
    #    fence lines) closes correctly instead of mis-pairing later fences.
    code_block_regex = re.compile(
        r'^[ \t]*```[^\n]*\n(.*?)^[ \t]*```',  # group(1) captures the code content
        flags=re.DOTALL | re.MULTILINE,
    )
    # The capture includes the newline that precedes the closing fence;
    # drop exactly that one so the block text carries no trailing newline.
    code_blocks = [
        block[:-1] if block.endswith("\n") else block
        for block in code_block_regex.findall(content)
    ]

    # 4. Assemble the output: optional title, then the code blocks separated
    #    by blank lines. Nothing else from the document survives.
    header = title.strip() + "\n\n" if title else ""
    cleaned_content = header + "\n\n".join(code_blocks)

    chars_removed = original_length - len(cleaned_content)
    return cleaned_content, chars_removed
def process_file(file_path: str) -> Tuple[str, Dict[str, Any]]:
    """
    Read a Markdown file and reduce it to its front-matter title and code blocks.

    Args:
        file_path: Path of the Markdown file to read (decoded as UTF-8).

    Returns:
        A tuple ``(cleaned_content, stats)``. On success ``stats`` holds the
        keys ``'original_size'``, ``'cleaned_size'`` and ``'chars_removed'``
        (all character counts). If reading or cleaning raises, the error is
        reported on stderr and ``("", {})`` is returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        cleaned_content, chars_removed = clean_markdown_keep_code_and_title(content)
    except Exception as e:
        # Best-effort: one unreadable file must not abort a batch run.
        # The diagnostic goes to stderr so it does not get mixed into the
        # statistics report that the script prints on stdout.
        print(f"Error processing file {file_path}: {e}", file=sys.stderr)
        return "", {}

    stats = {
        'original_size': len(content),
        'cleaned_size': len(cleaned_content),
        'chars_removed': chars_removed,
    }
    return cleaned_content, stats
def main():
    """
    Collect every ``.md`` file under the current directory, reduce each one to
    its title and code blocks, and write the results into a single output file.

    The output file name is taken from the first command-line argument and
    defaults to ``react.txt``. The file is overwritten, not appended to.
    Combined size statistics are printed to stdout when done.
    """
    # Determine the output file name. Defaults to 'react.txt' if not provided
    output_file = sys.argv[1] if len(sys.argv) > 1 else "react.txt"
    output_real = os.path.realpath(output_file)

    # Collect all markdown files (recursively). The output file itself is
    # excluded: if it ends in '.md' and lives in the tree, it would be
    # truncated by the open() below and then read back as an input.
    md_files = []
    for root, _dirs, files in os.walk("."):
        for filename in files:
            if not filename.lower().endswith(".md"):
                continue
            path = os.path.join(root, filename)
            if os.path.realpath(path) == output_real:
                continue
            md_files.append(path)

    # Sort for a consistent, reproducible order in the output
    md_files.sort()

    # Running totals across all successfully processed files
    total_original_size = 0
    total_cleaned_size = 0
    total_chars_removed = 0
    file_count = 0
    failed_count = 0

    # Overwrite (not append) the output file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for md_file in md_files:
            cleaned_content, stats = process_file(md_file)

            # process_file returns empty stats only when it failed (and it has
            # already reported the error); such a file is neither counted as
            # processed nor given blank padding in the output.
            if not stats:
                failed_count += 1
                continue

            total_original_size += stats.get('original_size', 0)
            total_cleaned_size += stats.get('cleaned_size', 0)
            total_chars_removed += stats.get('chars_removed', 0)
            file_count += 1

            # Write the cleaned content, followed by a blank line separator
            out_f.write(cleaned_content)
            out_f.write("\n\n")

    # Compute the overall reduction percentage (guard against no input)
    reduction_percentage = 0.0
    if total_original_size > 0:
        reduction_percentage = (total_chars_removed / total_original_size) * 100

    # Print summary statistics for *all* files combined
    print("===== Combined Statistics =====")
    print(f"Files processed: {file_count}")
    if failed_count:
        print(f"Files failed: {failed_count}")
    print(f"Total original size: {total_original_size} characters")
    print(f"Total cleaned size: {total_cleaned_size} characters")
    print(f"Total characters removed: {total_chars_removed}")
    print(f"Overall size reduction: {reduction_percentage:.2f}%")
    print(f"Cleaned output written to: {output_file}")
# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment