Created
May 4, 2025 04:46
-
-
Save sunderee/4ac2167ec8740dbcee28afee120e6f48 to your computer and use it in GitHub Desktop.
Markdown file word counter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Markdown Word Counter | |
--------------------- | |
A simple script to count words in a Markdown file, ignoring Markdown syntax. | |
""" | |
import re | |
import sys | |
import argparse | |
from pathlib import Path | |
def clean_markdown(text):
    """
    Remove Markdown syntax elements that shouldn't be counted as words.

    The substitutions are order-sensitive:

    * Images and inline links are resolved *before* bare URLs are stripped.
      The bare-URL pattern is greedy up to the next whitespace, so running it
      first would swallow the closing ``)`` of ``[text](https://...)`` and
      leave the link/image syntax dangling.
    * Horizontal rules are removed *before* emphasis markers; otherwise a
      rule such as ``***`` or ``___`` is partially consumed as empty emphasis
      and a stray ``*`` / ``_`` survives as a "word".
    * Blockquote markers are stripped before headers so that ``> # Title``
      is still recognised as a header line.

    Args:
        text: Raw Markdown source.

    Returns:
        The text with code, HTML tags, URLs, images, structural markers and
        emphasis markers removed. Link text and emphasised text are kept.
        Whitespace is not normalised.
    """
    # Remove fenced code blocks
    text = re.sub(r'```[\s\S]*?```', '', text)
    # Remove inline code
    text = re.sub(r'`[^`]*`', '', text)
    # Remove HTML tags (this also drops <https://...> autolinks)
    text = re.sub(r'<[^>]*>', '', text)
    # Remove image references (must precede link handling: same syntax plus '!')
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    # Remove link targets but keep the link text
    text = re.sub(r'\[([^\]]*)\]\(.*?\)', r'\1', text)
    # Remove remaining bare URLs
    text = re.sub(r'https?://\S+', '', text)
    # Remove blockquote markers, including nested ones such as "> > text"
    text = re.sub(r'^[ \t]*(?:>[ \t]*)+', '', text, flags=re.MULTILINE)
    # Remove headers (# symbols)
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
    # Remove horizontal rules, with or without spaces between the markers
    text = re.sub(
        r'^[ \t]*[-*_](?:[ \t]*[-*_]){2,}[ \t]*$', '', text, flags=re.MULTILINE
    )
    # Remove unordered list bullets so "-", "*" and "+" are not counted as words
    text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)
    # Remove emphasis markers (* and _) but keep the text; strong first so
    # that "**bold**" is not half-consumed by the single-marker pattern
    text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text)
    text = re.sub(r'(\*|_)(.*?)\1', r'\2', text)
    # Remove footnote references
    text = re.sub(r'\[\^[^\]]*\]', '', text)
    return text
def count_words(text):
    """
    Return how many words remain once Markdown syntax has been stripped.

    A word is any maximal run of non-whitespace characters in the output of
    ``clean_markdown``.
    """
    stripped = clean_markdown(text)
    # Each run of non-whitespace characters is exactly one word
    return len(re.findall(r'\S+', stripped))
def main():
    """
    Main function to handle command line arguments and process the file.

    Reads the Markdown file named on the command line as UTF-8 and prints its
    word count. With ``--details`` it also prints the line count, character
    count and an estimated reading time at 250 words per minute.

    Error messages are written to stderr and the process exits with status 1
    when the file is missing, unreadable, or not valid UTF-8.
    """
    parser = argparse.ArgumentParser(description='Count words in a Markdown file.')
    parser.add_argument('file', help='Path to the Markdown file')
    parser.add_argument('--details', action='store_true', help='Show detailed statistics')
    args = parser.parse_args()
    file_path = Path(args.file)

    # Only the read can fail for reasons outside our control, so keep the
    # try body limited to it and attempt the open directly instead of
    # checking for existence first (which is race-prone).
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except (FileNotFoundError, NotADirectoryError):
        print(f"Error: File '{file_path}' not found.", file=sys.stderr)
        sys.exit(1)
    except (OSError, UnicodeError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    word_count = count_words(content)
    print(f"\nFile: {file_path}")
    print(f"Word count: {word_count}")

    if args.details:
        line_count = len(content.splitlines())
        char_count = len(content)
        # Estimate reading time (average reading speed: 250 words per minute).
        # Integer arithmetic avoids float truncation errors: with floats,
        # 1075 words came out as "4 min 17 sec" instead of "4 min 18 sec".
        total_seconds = word_count * 60 // 250
        reading_minutes, reading_seconds = divmod(total_seconds, 60)
        print(f"Line count: {line_count}")
        print(f"Character count: {char_count}")
        print(f"Estimated reading time: {reading_minutes} min {reading_seconds} sec")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment