Created
July 7, 2022 13:04
-
-
Save joswr1ght/d195184f115dc9e454e80f19e8dabb6c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# MLA Style: Capitalization
#
# ref: https://libguides.pvcc.edu/citationstyles/mla9-capitalization
#
# In a title or a subtitle, capitalize the first word, the last word, and all
# principal words, including those that follow hyphens in compound terms.
# Therefore, capitalize the following parts of speech:
#
# Nouns (e.g., flowers, as in The Flowers of Europe)
# Pronouns (e.g., our, as in Save Our Children; that, as in The Mouse That Roared)
# Verbs (e.g., watches, as in America Watches Television; is, as in What Is Literature?)
# Adjectives (e.g., ugly, as in The Ugly Duckling; that, as in Who Said That Phrase?)
# Adverbs (e.g., slightly, as in Only Slightly Corrupt; down, as in Go Down, Moses)
# Subordinating conjunctions (e.g., after, although, as if, as soon as,
# because, before, if, that, unless, until, when, where, while, as in One If by
# Land and Anywhere That Chance Leads)
#
# Do not capitalize the following parts of speech when they fall in the middle of a title:
#
# Articles (a, an, the, as in Under the Bamboo Tree)
# Prepositions (e.g., against, as, between, in, of, to, as in The Merchant of
# Venice and “A Dialogue between the Soul and Body”)
# Coordinating conjunctions (and, but, for, nor, or, so, yet, as in Romeo and Juliet)
# The to in infinitives (as in How to Play Chess)
import sys | |
import os | |
import re | |
import fileinput | |
def title(s):
    """Return *s* with its first character uppercased.

    Only the first character is touched; the rest of the string keeps its
    original casing (unlike ``str.title()`` or ``str.capitalize()``).
    An empty string is returned unchanged.
    """
    # Slicing instead of s[0] avoids an IndexError on an empty string.
    return s[:1].upper() + s[1:]
def lower(s):
    """Return *s* with its first character lowercased.

    Only the first character is touched; the rest of the string keeps its
    original casing. An empty string is returned unchanged.
    """
    # Slicing instead of s[0] avoids an IndexError on an empty string.
    return s[:1].lower() + s[1:]
# Words that stay lowercase when they appear in the middle of a title.
_MLA_ARTICLES = frozenset({"a", "an", "the"})
_MLA_PREPOSITIONS = frozenset({
    "abroad", "about", "above", "across", "after",
    "against", "ago", "along", "amidst", "among", "amongst", "apart",
    "around", "as", "aside", "at", "away", "barring", "before",
    "behind", "below", "beneath", "beside", "besides", "between",
    "beyond", "but", "by", "circa", "concerning", "despite", "down",
    "during", "in", "inside", "instead", "into", "except",
    "excluding", "for", "following", "from", "hence", "like",
    "minus", "near", "next", "past", "per", "round", "of", "off", "on",
    "onto", "opposite", "out", "outside", "over", "than", "through",
    "throughout", "till", "times", "to", "toward", "towards",
    "under", "underneath", "unlike", "until", "unto", "up", "upon",
    "via", "with", "within", "without", "worth",
})
_MLA_COORDINATING_CONJUNCTIONS = frozenset(
    {"and", "but", "for", "nor", "or", "so", "yet"}
)
_MLA_LOWERCASE_WORDS = (
    _MLA_ARTICLES | _MLA_PREPOSITIONS | _MLA_COORDINATING_CONJUNCTIONS
)

# Product names whose lowercase initial letter must be preserved.
_MLA_UNTOUCHED_WORDS = frozenset({"iPhone", "iPad"})

# A "word" made only of '#' or only of '=' (heading syntax in markup languages).
_MLA_MARKUP_RE = re.compile(r"^#+$|^=+$")

# regex adapted from Django URL validator,
# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
_MLA_URL_RE = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

# Lowercase hex string (digits, a-f, ':' and '-'). At least one digit is
# required so that ordinary words spelled only with the letters a-f
# ("a", "be", "add", "dead", ...) are still treated as words. The trade-off is
# that digit-free hex such as "deadbeef" is capitalized like any other word.
_MLA_HEX_RE = re.compile(r'^(?=[a-f:-]*[0-9])[0-9a-f:-]+$')


def _mla_capitalize(word):
    """Uppercase the first letter of *word* and of every hyphen-separated part."""
    # A leading hyphen marks something like a command-line flag, not a
    # compound term, so leave it exactly as written.
    if word.startswith("-"):
        return word
    return "-".join(part[:1].upper() + part[1:] for part in word.split("-"))


def mlatitle(words):
    """Convert a line of text to MLA-style title capitalization.

    The input is split on whitespace and each token is handled as follows:

    * Tokens made only of ``#`` or only of ``=`` (markup heading syntax) are
      copied through and are not counted as words.
    * A leading ``.`` on the first word (Asciidoc caption) is preserved.
    * URLs, "iPhone"/"iPad", and lowercase hex strings containing at least
      one digit are copied through unchanged.
    * The first and last words are always capitalized.
    * Articles, prepositions, and coordinating conjunctions in the middle of
      the title get a lowercase initial letter.
    * Every other word is capitalized, including each part of a hyphenated
      compound.

    Args:
        words: The title text (a single line; surrounding whitespace and a
            trailing newline are ignored).

    Returns:
        The converted title with tokens separated by single spaces and no
        trailing whitespace. An empty or all-whitespace input yields "".
    """
    tokens = words.split()
    # Only real words count toward "first" and "last"; markup tokens such as
    # "##" or "==" must not shift the position of the last word.
    total_words = sum(1 for token in tokens if not _MLA_MARKUP_RE.match(token))

    converted = []
    position = 0
    for token in tokens:
        if _MLA_MARKUP_RE.match(token):
            converted.append(token)
            continue
        position += 1

        # Strip an Asciidoc caption dot from the first word; it is restored
        # in front of whatever the word becomes.
        prefix = ""
        word = token
        if position == 1 and word.startswith("."):
            prefix = "."
            word = word[1:]

        if (_MLA_URL_RE.match(word) or word in _MLA_UNTOUCHED_WORDS
                or _MLA_HEX_RE.match(word)):
            # Never re-case URLs, protected product names, or hex strings.
            converted.append(prefix + word)
        elif position == 1 or position == total_words:
            # First and last words are capitalized regardless of part of speech.
            converted.append(prefix + _mla_capitalize(word))
        elif word.lower() in _MLA_LOWERCASE_WORDS:
            # Lowercase only the initial letter; the rest is left as written.
            converted.append(word[:1].lower() + word[1:])
        else:
            converted.append(_mla_capitalize(word))

    return " ".join(converted)
if __name__ == "__main__":
    # Script entry point. The program takes no command-line arguments: if any
    # are supplied it prints a usage message and exits with status -1.
    # Otherwise each input line is converted and printed on its own line.
    if len(sys.argv) != 1:
        progname = os.path.basename(sys.argv[0])
        print(
            f"{progname}: Convert string to title case using MLA rules.",
            f"Usage: {progname} <words>",
            sep="\n",
        )
        raise SystemExit(-1)

    # With no arguments present, fileinput.input() reads from standard input.
    for line in fileinput.input():
        print(mlatitle(line))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment