ehzawad · January 31, 2025 15:03
diff --git a/cleanup_md.py b/cleanup_md.py
 #!/usr/bin/env python

 import re
 import os
 import sys
 from typing import Tuple, Dict, Any

 # Front matter only counts at the very top of the document (optionally after
 # blank lines): an opening '---' line, the body, and a closing '---' line.
 # Anchoring with \A keeps ordinary '---' horizontal rules further down from
 # being mistaken for front matter. Trailing whitespace after the closing
 # fence is included so that substitution removes it as well.
 _FRONT_MATTER_RE = re.compile(
    r'\A\s*---[ \t]*\r?\n(.*?)^---[ \t]*\r?$\s*',
    flags=re.DOTALL | re.MULTILINE,
 )

 # 'title: ...' on a single line. [ \t]* (not \s*) so an empty title does not
 # swallow the newline and capture the following line instead.
 _TITLE_LINE_RE = re.compile(r'^title:[ \t]*(.+)$', flags=re.MULTILINE)

 # A fenced code block: opening fence line (with optional info string), the
 # content, and a bare closing fence on its own line. Both fences may be
 # indented. group(1) is the content including its final newline.
 _CODE_FENCE_RE = re.compile(
    r'^[ \t]*```[^\n]*\n(.*?)^[ \t]*```[ \t]*$',
    flags=re.DOTALL | re.MULTILINE,
 )


 def extract_title_from_front_matter(content: str) -> str:
    """
    Extract the 'title: ...' value from the leading front matter block.

    Args:
        content: Full Markdown text.

    Returns:
        The stripped title string, or an empty string when the document has
        no leading front matter block or the block has no non-empty title.
    """
    front_matter_match = _FRONT_MATTER_RE.search(content)
    if front_matter_match is None:
        return ""

    title_match = _TITLE_LINE_RE.search(front_matter_match.group(1))
    if title_match is None:
        return ""
    return title_match.group(1).strip()


 def clean_markdown_keep_code_and_title(content: str) -> Tuple[str, int]:
    """
    Reduce a Markdown document to its front-matter title and code blocks.

    Keeps only:
      1) The title from the leading front matter (followed by a blank line).
      2) The raw contents of fenced code blocks (without the fence lines),
         separated from each other by a blank line.
    Everything else is dropped.

    Args:
        content: Full Markdown text.

    Returns:
        (cleaned_content, chars_removed), where chars_removed is the length
        of the input minus the length of cleaned_content.
    """
    original_length = len(content)

    title = extract_title_from_front_matter(content)

    # Remove only the leading front matter block; count=1 together with the
    # \A anchor guarantees nothing further down the document is deleted.
    body = _FRONT_MATTER_RE.sub('', content, count=1)

    # The pattern captures the newline that precedes the closing fence;
    # drop exactly that one so the block content is returned as written.
    code_blocks = [
        raw[:-1] if raw.endswith("\n") else raw
        for raw in _CODE_FENCE_RE.findall(body)
    ]

    pieces = []
    if title:
        pieces.append(title + "\n\n")
    pieces.append("\n\n".join(code_blocks))
    cleaned_content = "".join(pieces)

    chars_removed = original_length - len(cleaned_content)
    return cleaned_content, chars_removed

 def process_file(file_path: str) -> Tuple[str, Dict[str, Any]]:
    """
    Read a Markdown file and reduce it to its front-matter title and code blocks.

    Args:
        file_path: Path of the Markdown file to read (decoded as UTF-8).

    Returns:
        (cleaned_content, stats) where stats has the keys 'original_size',
        'cleaned_size' and 'chars_removed' (all character counts).
        If the file cannot be read or decoded, an error is reported on
        stderr and ("", {}) is returned so callers can skip the file.
    """
    # Only the read can fail for reasons outside our control, so only the
    # read is guarded; errors go to stderr to keep stdout for the summary.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error processing file {file_path}: {e}", file=sys.stderr)
        return "", {}

    cleaned_content, chars_removed = clean_markdown_keep_code_and_title(content)

    stats = {
        'original_size': len(content),
        'cleaned_size': len(cleaned_content),
        'chars_removed': chars_removed,
    }
    return cleaned_content, stats

 def main():
    """
    Concatenate the cleaned contents of every Markdown file under the current
    directory into one output file and print combined statistics.

    The output path is taken from the first command-line argument and
    defaults to 'react.txt'. The output file is overwritten, not appended to.
    """
    output_file = sys.argv[1] if len(sys.argv) > 1 else "react.txt"
    # Resolved once so the output file itself can be excluded from the inputs
    # (matters when the chosen name ends in '.md' and an earlier run left it
    # inside the tree being walked).
    output_path = os.path.abspath(output_file)

    # Collect all markdown files (recursively), in a deterministic order
    md_files = []
    for root, _dirs, files in os.walk("."):
        for filename in files:
            if not filename.lower().endswith(".md"):
                continue
            candidate = os.path.join(root, filename)
            if os.path.abspath(candidate) != output_path:
                md_files.append(candidate)
    md_files.sort()

    # Running totals across all successfully processed files
    total_original_size = 0
    total_cleaned_size = 0
    total_chars_removed = 0
    file_count = 0

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for md_file in md_files:
            cleaned_content, stats = process_file(md_file)
            if not stats:
                # process_file signals a failed read with empty stats; such a
                # file is neither counted nor given a separator in the output.
                continue

            total_original_size += stats.get('original_size', 0)
            total_cleaned_size += stats.get('cleaned_size', 0)
            total_chars_removed += stats.get('chars_removed', 0)
            file_count += 1

            out_f.write(cleaned_content)
            out_f.write("\n\n")  # blank line after each file's content

    # Guard against division by zero when nothing was read
    reduction_percentage = 0.0
    if total_original_size > 0:
        reduction_percentage = (total_chars_removed / total_original_size) * 100

    # Print summary statistics for *all* files combined
    print("===== Combined Statistics =====")
    print(f"Files processed:        {file_count}")
    print(f"Total original size:    {total_original_size} characters")
    print(f"Total cleaned size:     {total_cleaned_size} characters")
    print(f"Total characters removed: {total_chars_removed}")
    print(f"Overall size reduction: {reduction_percentage:.2f}%")
    print(f"Cleaned output written to: {output_file}")

 # Run the conversion only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python

	import re
	import os
	import sys
	from typing import Tuple, Dict, Any

	# Front matter only counts at the very top of the document (optionally after
	# blank lines): an opening '---' line, the body, and a closing '---' line.
	# Anchoring with \A keeps ordinary '---' horizontal rules further down from
	# being mistaken for front matter. Trailing whitespace after the closing
	# fence is included so that substitution removes it as well.
	_FRONT_MATTER_RE = re.compile(
	    r'\A\s*---[ \t]*\r?\n(.*?)^---[ \t]*\r?$\s*',
	    flags=re.DOTALL | re.MULTILINE,
	)

	# 'title: ...' on a single line. [ \t]* (not \s*) so an empty title does not
	# swallow the newline and capture the following line instead.
	_TITLE_LINE_RE = re.compile(r'^title:[ \t]*(.+)$', flags=re.MULTILINE)

	# A fenced code block: opening fence line (with optional info string), the
	# content, and a bare closing fence on its own line. Both fences may be
	# indented. group(1) is the content including its final newline.
	_CODE_FENCE_RE = re.compile(
	    r'^[ \t]*```[^\n]*\n(.*?)^[ \t]*```[ \t]*$',
	    flags=re.DOTALL | re.MULTILINE,
	)


	def extract_title_from_front_matter(content: str) -> str:
	    """
	    Extract the 'title: ...' value from the leading front matter block.

	    Args:
	        content: Full Markdown text.

	    Returns:
	        The stripped title string, or an empty string when the document has
	        no leading front matter block or the block has no non-empty title.
	    """
	    front_matter_match = _FRONT_MATTER_RE.search(content)
	    if front_matter_match is None:
	        return ""

	    title_match = _TITLE_LINE_RE.search(front_matter_match.group(1))
	    if title_match is None:
	        return ""
	    return title_match.group(1).strip()


	def clean_markdown_keep_code_and_title(content: str) -> Tuple[str, int]:
	    """
	    Reduce a Markdown document to its front-matter title and code blocks.

	    Keeps only:
	      1) The title from the leading front matter (followed by a blank line).
	      2) The raw contents of fenced code blocks (without the fence lines),
	         separated from each other by a blank line.
	    Everything else is dropped.

	    Args:
	        content: Full Markdown text.

	    Returns:
	        (cleaned_content, chars_removed), where chars_removed is the length
	        of the input minus the length of cleaned_content.
	    """
	    original_length = len(content)

	    title = extract_title_from_front_matter(content)

	    # Remove only the leading front matter block; count=1 together with the
	    # \A anchor guarantees nothing further down the document is deleted.
	    body = _FRONT_MATTER_RE.sub('', content, count=1)

	    # The pattern captures the newline that precedes the closing fence;
	    # drop exactly that one so the block content is returned as written.
	    code_blocks = [
	        raw[:-1] if raw.endswith("\n") else raw
	        for raw in _CODE_FENCE_RE.findall(body)
	    ]

	    pieces = []
	    if title:
	        pieces.append(title + "\n\n")
	    pieces.append("\n\n".join(code_blocks))
	    cleaned_content = "".join(pieces)

	    chars_removed = original_length - len(cleaned_content)
	    return cleaned_content, chars_removed

	def process_file(file_path: str) -> Tuple[str, Dict[str, Any]]:
	    """
	    Read a Markdown file and reduce it to its front-matter title and code blocks.

	    Args:
	        file_path: Path of the Markdown file to read (decoded as UTF-8).

	    Returns:
	        (cleaned_content, stats) where stats has the keys 'original_size',
	        'cleaned_size' and 'chars_removed' (all character counts).
	        If the file cannot be read or decoded, an error is reported on
	        stderr and ("", {}) is returned so callers can skip the file.
	    """
	    # Only the read can fail for reasons outside our control, so only the
	    # read is guarded; errors go to stderr to keep stdout for the summary.
	    try:
	        with open(file_path, 'r', encoding='utf-8') as f:
	            content = f.read()
	    except (OSError, UnicodeError) as e:
	        print(f"Error processing file {file_path}: {e}", file=sys.stderr)
	        return "", {}

	    cleaned_content, chars_removed = clean_markdown_keep_code_and_title(content)

	    stats = {
	        'original_size': len(content),
	        'cleaned_size': len(cleaned_content),
	        'chars_removed': chars_removed,
	    }
	    return cleaned_content, stats

	def main():
	    """
	    Concatenate the cleaned contents of every Markdown file under the current
	    directory into one output file and print combined statistics.

	    The output path is taken from the first command-line argument and
	    defaults to 'react.txt'. The output file is overwritten, not appended to.
	    """
	    output_file = sys.argv[1] if len(sys.argv) > 1 else "react.txt"
	    # Resolved once so the output file itself can be excluded from the inputs
	    # (matters when the chosen name ends in '.md' and an earlier run left it
	    # inside the tree being walked).
	    output_path = os.path.abspath(output_file)

	    # Collect all markdown files (recursively), in a deterministic order
	    md_files = []
	    for root, _dirs, files in os.walk("."):
	        for filename in files:
	            if not filename.lower().endswith(".md"):
	                continue
	            candidate = os.path.join(root, filename)
	            if os.path.abspath(candidate) != output_path:
	                md_files.append(candidate)
	    md_files.sort()

	    # Running totals across all successfully processed files
	    total_original_size = 0
	    total_cleaned_size = 0
	    total_chars_removed = 0
	    file_count = 0

	    with open(output_file, 'w', encoding='utf-8') as out_f:
	        for md_file in md_files:
	            cleaned_content, stats = process_file(md_file)
	            if not stats:
	                # process_file signals a failed read with empty stats; such a
	                # file is neither counted nor given a separator in the output.
	                continue

	            total_original_size += stats.get('original_size', 0)
	            total_cleaned_size += stats.get('cleaned_size', 0)
	            total_chars_removed += stats.get('chars_removed', 0)
	            file_count += 1

	            out_f.write(cleaned_content)
	            out_f.write("\n\n")  # blank line after each file's content

	    # Guard against division by zero when nothing was read
	    reduction_percentage = 0.0
	    if total_original_size > 0:
	        reduction_percentage = (total_chars_removed / total_original_size) * 100

	    # Print summary statistics for all files combined
	    print("===== Combined Statistics =====")
	    print(f"Files processed: {file_count}")
	    print(f"Total original size: {total_original_size} characters")
	    print(f"Total cleaned size: {total_cleaned_size} characters")
	    print(f"Total characters removed: {total_chars_removed}")
	    print(f"Overall size reduction: {reduction_percentage:.2f}%")
	    print(f"Cleaned output written to: {output_file}")

	# Run the conversion only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()