Last active
August 27, 2020 20:31
-
-
Save drscotthawley/57cb61337798359ce0197371b279c721 to your computer and use it in GitHub Desktop.
Shorten text by applying various shortening rules
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Replaces lengthy words/phrases with shorter variants
# Author: Scott Hawley
import pandas as pd | |
import re | |
import os | |
def parse_garbl(df=None):
    """Scrape the word-shortening lists by Gary B. Larson.

    Reads the first HTML table of each garbl.info style-manual page, keeps
    its first two columns and simplifies every cell so that only the first
    of the possible re-mapping options remains.

    Args:
        df: Optional existing DataFrame. When given, the scraped rows are
            concatenated after it; otherwise a new table is started.

    Returns:
        A DataFrame holding ``df`` (if any) followed by the cleaned rows of
        all three pages. Original row indexes are kept as-is.
    """
    print("Scraping word-shortening lists by Gary B. Larson")
    urls = [r'https://garbl.info/stylemanual/words.htm',
            r'https://garbl.info/stylemanual/phrases.htm',
            r'https://garbl.info/stylemanual/redundant.htm']
    frames = [] if df is None else [df]
    for url in urls:
        tables = pd.read_html(url)
        # 1st two columns; skip first couple rows and last row.
        # .copy() so the cleanup below edits our own frame, not a view of
        # the parsed table (avoids chained-assignment problems).
        df2 = tables[0].iloc[2:-1, 0:2].copy()
        for i in (0, 1):
            df2.iloc[:, i] = _keep_first_option(df2.iloc[:, i])
        # Drop the "Back to top" navigation rows. na=True makes rows whose
        # first cell is missing count as a match, so they are dropped too,
        # exactly as the former `== False` comparison did.
        is_noise = df2.iloc[:, 0].str.contains("Back to top", regex=False,
                                               na=True)
        frames.append(df2[~is_noise])
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat(frames)


def _keep_first_option(cells):
    """Reduce each string cell of a Series to its first suggested option."""
    # regex=True is explicit because pandas >= 2.0 defaults to literal
    # matching, which would turn every pattern below into a no-op.
    # strip parenthetical anything
    cells = cells.str.replace(r"\s*\(.*\)\s*", "", regex=True)
    # strip anything after a comma (use only the first option, for now)
    cells = cells.str.replace(r",.*", "", regex=True)
    # grab x from 'either x or y'
    return cells.str.replace(r"either (.*) or .*", r"\1", regex=True)
def parse_brockway():
    """Scrape Laura Hale Brockway's phrase list from PR Daily.

    Takes the first two columns of the second HTML table on the article
    page and keeps only the text before the first comma in each cell
    (i.e. only the first option, for now).

    Returns:
        A two-column DataFrame of (long phrase, short replacement).
    """
    print("Scraping Laura Hale Brockway's list from PR Daily")
    url = 'http://m.prdaily.com/Main/Articles/20_phrases_you_can_replace_with_one_word__11285.aspx'
    tables = pd.read_html(url)
    # .copy() so the cleanup below edits our own frame rather than a view
    # of the parsed table.
    df = tables[1].iloc[:, 0:2].copy()
    for i in (0, 1):
        # strip anything after a comma (use only the first option, for now)
        # regex=True is explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would leave the cells untouched.
        df.iloc[:, i] = df.iloc[:, i].str.replace(r",.*", "", regex=True)
    return df
def apply_dict(text, df):
    """Shorten ``text`` with the phrase table ``df``; this does the mapping.

    Rows of ``df`` are visited in order. Whenever a row's 'from' phrase is
    strictly longer than its 'to' phrase and occurs in the current text,
    every occurrence is substituted. Each substitution is reported on
    stdout; the user is not asked for consent.

    Returns the text after all substitutions.
    """
    shortened = text
    for _, entry in df.iterrows():
        long_form = entry['from']
        short_form = entry['to']
        # Only substitutions that actually save characters are applied.
        if len(long_form) <= len(short_form):
            continue
        if long_form not in shortened:
            continue
        print("Replacing", long_form, '-->', short_form)
        shortened = shortened.replace(long_form, short_form)
    return shortened
# Build the 'dictionary' of translations (as a Pandas Dataframe)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df = pd.concat([parse_garbl(), parse_brockway()], ignore_index=True)
# Force every cell to a string, just in case. DataFrame.applymap is
# deprecated in favour of DataFrame.map, so use map whenever it exists.
to_str = df.map if hasattr(df, "map") else df.applymap
df = to_str(str)
df.columns = ["from", "to"]
print(df)

# specify some input text, either as a file or grab text online
filename = 'my_essay.txt'
if os.path.isfile(filename):
    print("Reading from", filename)
    with open(filename, 'r', encoding='utf-8') as infile:
        old_text = infile.read()
else:
    # Grab some text from online, e.g. "Collected works of William Hazlitt"
    import requests
    url = 'https://www.gutenberg.org/files/55932/55932-0.txt'
    print("Grabbing some text from", url)
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, timeout=30)
    # Fail loudly instead of "shortening" an HTTP error page.
    r.raise_for_status()
    old_text = r.text

# Now apply the shortening
new_text = apply_dict(old_text, df)
print("\n")
print("Before processing, text length =", len(old_text), "characters")
print("After processing, text length =", len(new_text), "characters")

# Save to new text file
# Explicit UTF-8: the text may contain characters the platform's default
# codec cannot encode.
with open("my_essay_out.txt", "w", encoding='utf-8') as outfile:
    outfile.write(new_text)
# EOF
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment