Last active
July 22, 2019 19:03
-
-
Save ernstki/79faf30fe8a1600cc05f388c13b4b67d to your computer and use it in GitHub Desktop.
titlecase - a Python AP-style title case capitalizer (works great with doclip.sh!)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# vim: fileencoding=utf-8 | |
""" | |
Tests for titlecase.py | |
Invocation: python -m unittest test_titlecase.py | |
""" | |
import unittest | |
from titlecase import titlecase | |
class TestTitleCase(unittest.TestCase):
    """Check titlecase() against AP and Chicago capitalization rules."""

    def test_ap_style(self):
        """
        AP stylebook rules, as summarized at
        https://writers.stackexchange.com/a/4622:

          - principal words are capitalized, and so are prepositions and
            conjunctions that are four letters or longer
          - articles (the, a, an) and other words shorter than four letters
            are capitalized only when they open or close the title
        """
        cases = (
            # British post-punk band
            ('the the', 'The The'),
            # simple book title
            ('the cat in the hat', 'The Cat in the Hat'),
            # preposition of >= 4 characters
            ('over the hills and through the woods',
             'Over the Hills and Through the Woods'),
            # preposition at end
            ('you and the horse you rode in on',
             'You and the Horse You Rode in On'),
            # punctuation at end
            ('you and the horse you rode in on!',
             'You and the Horse You Rode in On!'),
            # preposition with comma
            ('to boldly go, but, then again...',
             'To Boldly Go, but, Then Again...'),
        )
        for given, wanted in cases:
            self.assertEqual(titlecase(given), wanted)

    def test_chicago_style(self):
        """
        Chicago Manual of Style rules:

        Articles (a, an, the), coordinating conjunctions (and, but, or, for,
        nor) and prepositions of any length stay lowercase, except when they
        are the first or last word of the title.
        """
        cases = (
            ('over the hills and through the woods',
             'Over the Hills and through the Woods'),
        )
        for given, wanted in cases:
            self.assertEqual(titlecase(given, style='chicago'), wanted)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# vim: fileencoding=utf-8
""" | |
Title-case the input on stdin according to AP or Chicago styleguide rules | |
Author: Kevin Ernst <ernstki -at- mail.uc.edu> | |
Source: https://gist.github.com/ernstki/79faf30fe8a1600cc05f388c13b4b67d | |
""" | |
from __future__ import print_function | |
import re | |
# Articles: lowercase in both styles unless first or last in the title.
ARTS = ['a', 'an', 'the']

# source: https://en.wikibooks.org/wiki/English_in_Use/Prepositions,_Conjunctions,_and_Interjections
# NOTE: 'except' and 'for' appear in both PREPS and CONJS; that is harmless
# because the lists are merged into a set before being consulted.
PREPS = ['about', 'above', 'across', 'after', 'against', 'along', 'amid',
         'amidst', 'among', 'around', 'at', 'before', 'behind', 'below',
         'beneath', 'beside', 'besides', 'between', 'beyond', 'during',
         'except', 'for', 'from', 'in', 'into', 'of', 'off', 'on', 'outside',
         'over', 'past', 'through', 'throughout', 'to', 'toward', 'towards',
         'under', 'underneath', 'until', 'with', 'within', 'without']

CONJS = ['although', 'and', 'as', 'because', 'both', 'but', 'either', 'even',
         'except', 'for', 'however', 'if', 'lest', 'neither', 'nevertheless',
         'nor', 'notwithstanding', 'or', 'provided', 'save', 'seeing', 'since',
         'so', 'than', 'that', 'then', 'though', 'unless', 'whereas',
         'whether', 'yet']

# see https://writers.stackexchange.com/a/4622
AP_CAP_IF_THIS_LONG = 4  # AP stylebook says capitalize if >=4 characters


def _capitalize(word):
    """
    Upper-case the first letter or digit of word and lower-case the rest

    Unlike str.capitalize(), leading punctuation (an opening quote or
    parenthesis, say) is skipped, so '"the' becomes '"The'. A word with no
    alphanumeric character is returned unchanged.
    """
    for pos, char in enumerate(word):
        if char.isalnum():
            return word[:pos] + char.upper() + word[pos + 1:].lower()
    return word


def titlecase(string, style=None):
    """
    Return properly title-cased version of input string

    If style is 'chicago', use Chicago Manual of Style rules (articles,
    prepositions and conjunctions of any length stay lowercase); any other
    value, including None, selects AP rules (only articles, prepositions and
    conjunctions shorter than AP_CAP_IF_THIS_LONG characters stay lowercase).

    The first and last words are always capitalized. Minor words are matched
    case-insensitively and with punctuation ignored, so 'THE' and 'but,' are
    both recognized. The input is split on any whitespace and re-joined with
    single spaces; empty or whitespace-only input yields ''.
    """
    style = style if style else 'ap'

    # A set gives O(1) membership tests and absorbs words listed twice.
    if style == 'chicago':
        excludes = set(ARTS + PREPS + CONJS)
    else:
        short_words = [w for w in PREPS + CONJS
                       if len(w) < AP_CAP_IF_THIS_LONG]
        excludes = set(ARTS + short_words)

    words = []
    for word in string.split():
        # strip punctuation and fold case so 'The,' still matches 'the'
        bareword = re.sub(r'\W', '', word).lower()
        if bareword in excludes:
            words.append(word.lower())
        else:
            words.append(_capitalize(word))

    # nothing to index into below when the input had no words at all
    if not words:
        return ''

    # always capitalize the first and last words, regardless:
    words[-1] = _capitalize(words[-1])
    words[0] = _capitalize(words[0])

    return ' '.join(words)
if __name__ == '__main__':
    # Script entry point: read the whole of stdin, title-case it, and print
    # the result. AP style is used unless -c / -chicago / --chicago is given.
    import argparse
    import sys

    cli = argparse.ArgumentParser(
        description=__doc__,
        epilog='https://gist.github.com/ernstki/79faf30fe8a1600cc05f388c13b4b67d',
    )
    cli.add_argument(
        '-c', '-chicago', '--chicago',
        action='store_const',
        const='chicago',
        default='ap',
        help='use Chicago Manual of Style (default: AP)',
    )
    args = cli.parse_args()

    # args.chicago holds the style name itself: 'chicago' or the 'ap' default
    raw_text = sys.stdin.read()
    print(titlecase(raw_text, args.chicago))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment