Last active
January 5, 2018 19:51
-
-
Save Kautenja/99644f9a7534d5235cc8a7899a1fe2cb to your computer and use it in GitHub Desktop.
A Python3 script to count the words in a Markdown document with inline LaTeX.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""This Python script counts words in a Markdown / LaTeX document(s). | |
Usage: | |
python3 count_words.py <filename or directory> | |
It will ignore ATX headers, LaTeX & Markdown comments, and LaTeX markup tags. | |
TODO: inline HTML, Markdown images & tables | |
""" | |
def get_filename() -> str: | |
""" | |
Get the filename from the command line. | |
Returns: the first positional argument (the filename) | |
""" | |
from sys import argv | |
try: | |
# try to get the filename | |
filename = argv[1] | |
except IndexError: | |
# pass the exception along to the next level | |
raise ValueError('no filename position argument!') | |
return filename | |
try: | |
# get the filename from the command line | |
filename = get_filename() | |
except ValueError: | |
# print the usage information and exit | |
print(__doc__) | |
from sys import exit | |
exit(1) | |
# specific Markdown files to ignore like READMEs and build artifacts | |
IGNORED_MD_FILENAMES = ['README.md', 'build.md'] | |
# specific extensions to recognized as Markdown | |
MARKDOWN_EXTENSIONS = ['.md', 'markdown', '.tex'] | |
def is_md(filename: str) -> bool: | |
""" | |
Return a boolean determining if the filename is a markdown file. | |
Args: | |
filename: the filename to validate | |
Returns: true if the filename is a markdown file, false otherwise | |
""" | |
# iterate over the ignored filenames to ensure the file is valid | |
for ignored_file_name in IGNORED_MD_FILENAMES: | |
if filename == ignored_file_name: | |
return False | |
# iterate over the extensions in the accepted extensions | |
for markdown_extension in MARKDOWN_EXTENSIONS: | |
if markdown_extension == filename[-len(markdown_extension):]: | |
return True | |
# the filename doesn't have a valid extension, return False | |
return False | |
def markdown_filenames(directory) -> list: | |
""" | |
Return a list of the filenames in the input directory. | |
Args: | |
directory: the input directory | |
Returns: a list of the markdown files in the input directory. | |
""" | |
from os import listdir | |
try: | |
# return a sorted list of the files in the given directory if they | |
# are legal markdown files | |
return sorted([file for file in listdir(directory) if is_md(file)]) | |
except FileNotFoundError: | |
# catch a file not found error if the directory doesn't exist | |
print('{} does not exist!'.format(directory)) | |
exit(1) | |
# the sentinel value for LaTeX comment lines | |
LATEX_COMMENT = '%' | |
# the sentinel value for Markdown header lines | |
MARKDOWN_HEADER = '#' | |
def clean_line(line: str) -> str: | |
""" | |
Clean a single line and return it. | |
Args: | |
line: the line of Markdown / LaTeX to clean | |
Returns: a cleaned line of text | |
""" | |
if LATEX_COMMENT in line[:1] or MARKDOWN_HEADER in line[:1]: | |
# ignore LaTeX comment lines and Markdown header lines | |
return '' | |
# strip the line of all whitespace and new lines and append a single space | |
return line.rstrip() + ' ' | |
# A regular expression for removing Markdown comments | |
MARKDOWN_COMMENT_REGEX = r'<!--.+?-->' | |
# a regular expression for removing LaTeX markup the group in () is the | |
# optional [] parameters following a markup call. | |
LATEX_MARKUP_REGEX = r'\\.+?{.+?}(\[.+?\])?' | |
def clean_contents(contents: str) -> str: | |
""" | |
Clean and return the contents of a LaTeX / Markdown file. | |
Args: | |
contents: the contents of the file to clean | |
Returns: a file with all markup nonsense removed (just words) | |
""" | |
clean_contents = contents | |
from re import sub | |
# remove the markdown comments from the text | |
clean_contents = sub(MARKDOWN_COMMENT_REGEX, '', clean_contents) | |
# remove the LaTeX markup | |
clean_contents = sub(LATEX_MARKUP_REGEX, '', clean_contents) | |
# return the clean text | |
return clean_contents | |
def read_file(filename: str) -> str: | |
""" | |
Read the contents of a single file. | |
Args: | |
filename: the name of the file to read | |
Returns: the string contents of the file | |
""" | |
if not is_md(filename): | |
raise ValueError('filename must have a valid markdown extension') | |
# initialize the contents to store from the file | |
contents = '' | |
# open the file into the contents one line at a time | |
with open(filename) as md_file: | |
# iterate over each line in the file and write it to the output | |
for line in md_file: | |
contents += clean_line(line) | |
return clean_contents(contents) | |
def read_dir(directory: str) -> str: | |
""" | |
Read the contents of every Markdown / LaTeX file in a directory. | |
Args: | |
directory: the name of the directory to read files from | |
Returns: the concatenated contents of the files in the directory | |
""" | |
# initialize the contents to store from the files | |
contents = '' | |
# iterate over the files and collect their contents | |
for filename in markdown_filenames(directory): | |
contents += read_file(f'{directory}/{filename}') | |
return contents | |
def read_contents(filename: str) -> str: | |
""" | |
Read the contents of a file or directory. | |
Args: | |
filename: the filename or directory to read | |
Returns: the concatenated text from the file(s) | |
""" | |
from os.path import isdir | |
if isdir(filename): | |
return read_dir(filename) | |
else: | |
return read_file(filename) | |
def words(contents: str) -> int: | |
""" | |
Return the number of words in a file's contents. | |
Args: | |
contents: the text to count the words in | |
Returns: the number of words in the contents | |
""" | |
from re import findall | |
return findall(r'\w+', contents) | |
# read the contents of the file | |
contents = read_contents(filename) | |
# split the contents into words | |
words = words(contents) | |
# count the words in the list | |
word_count = len(words) | |
# print the word count to the console | |
print(f'{word_count} words in {filename}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment