Skip to content

Instantly share code, notes, and snippets.

@drscotthawley
Last active August 27, 2020 20:31
Show Gist options
  • Save drscotthawley/57cb61337798359ce0197371b279c721 to your computer and use it in GitHub Desktop.
Save drscotthawley/57cb61337798359ce0197371b279c721 to your computer and use it in GitHub Desktop.
Shorten text by applying various shortening rules
#!/usr/bin/env python
# Replaces lengthy words/phrases with shorter variants
# Author: Scott Hawley
import pandas as pd
import re
import os
def parse_garbl(df=None, urls=None):
    """Scrape word-shortening lists by Gary B. Larson into a two-column table.

    For each page, the first HTML table is read and its first two columns
    are kept. Every cell is then reduced to a single option: parenthetical
    remarks are removed, only the text before the first comma is kept, and
    "either x or y" becomes "x". Rows whose first column contains
    "Back to top" (or is missing) are dropped.

    Currently only the first of every possible re-mapping option is selected.

    Args:
        df: Optional existing DataFrame; scraped rows are concatenated
            after it.
        urls: Optional iterable of page URLs to scrape. Defaults to the
            three garbl.info style-manual pages.

    Returns:
        The accumulated DataFrame. If ``urls`` is empty, ``df`` is returned
        unchanged (which may be ``None``).
    """
    print("Scraping word-shortening lists by Gary B. Larson")
    if urls is None:
        urls = [r'https://garbl.info/stylemanual/words.htm',
                r'https://garbl.info/stylemanual/phrases.htm',
                r'https://garbl.info/stylemanual/redundant.htm']

    # (pattern, replacement) pairs, applied in this order to both columns.
    cleanups = [
        (r"\s*\(.*\)\s*", ""),           # strip parenthetical anything
        (r",.*", ""),                    # keep only the first option
        (r"either (.*) or .*", "\\1"),   # grab x from 'either x or y'
    ]

    for url in urls:
        tables = pd.read_html(url)
        # 1st two columns; skip first couple rows and the last row.
        # .copy() so the clean-up below never writes into a view of tables[0].
        df2 = tables[0].iloc[2:-1, 0:2].copy()
        for i in [0, 1]:
            column = df2.iloc[:, i]
            for pattern, replacement in cleanups:
                # regex=True is required: since pandas 2.0 the default is a
                # literal (non-regex) replacement.
                column = column.str.replace(pattern, replacement, regex=True)
            df2.iloc[:, i] = column
        # na=True makes rows with a missing first cell count as navigation
        # rows, so they are dropped just like the "Back to top" links.
        is_navigation = df2.iloc[:, 0].str.contains(
            "Back to top", regex=False, na=True)
        df2 = df2[~is_navigation]
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        df = df2 if df is None else pd.concat([df, df2])
    return df
def parse_brockway():
    """Scrape Laura Hale Brockway's phrase-replacement list from PR Daily.

    Reads the second HTML table on the article page, keeps its first two
    columns, and in each cell keeps only the text before the first comma
    (i.e. only the first option, for now).

    Returns:
        A two-column DataFrame of the cleaned table cells.
    """
    print("Scraping Laura Hale Brockway's list from PR Daily")
    url = 'http://m.prdaily.com/Main/Articles/20_phrases_you_can_replace_with_one_word__11285.aspx'
    tables = pd.read_html(url)
    # .copy() so the clean-up below never writes into a view of tables[1].
    df = tables[1].iloc[:, 0:2].copy()
    for i in [0, 1]:
        # strip anything after a comma (use only the first option, for now).
        # regex=True is required: since pandas 2.0 the default is a literal
        # (non-regex) replacement, which would leave the cells untouched.
        df.iloc[:, i] = df.iloc[:, i].str.replace(r",.*", "", regex=True)
    return df
def apply_dict(text, df, whole_words=True):
    """Apply every shortening rule in ``df`` to ``text``.

    This is where the mapping happens. Replacements are made without asking
    for the user's consent. A rule is applied only when its 'from' entry is
    strictly longer than its 'to' entry, so the text can never grow.

    Args:
        text: The string to shorten.
        df: DataFrame with string columns 'from' (long form) and 'to'
            (short form). Rows are applied in order.
        whole_words: When True (default), a 'from' entry is replaced only
            where it is not embedded in a longer word, so "assist" -> "help"
            leaves "assistant" alone. Pass False for plain substring
            replacement.

    Returns:
        The shortened text.
    """
    for long_form, short_form in zip(df['from'], df['to']):
        # Missing cells (NaN) are not strings and cannot be matched; skip them.
        if not (isinstance(long_form, str) and isinstance(short_form, str)):
            continue
        # Cheap substring pre-check avoids building a regex for most rules.
        if len(long_form) <= len(short_form) or long_form not in text:
            continue

        if whole_words:
            # Lookarounds instead of \b so entries that start or end with
            # punctuation still match next to whitespace.
            pattern = re.compile(
                r"(?<!\w)" + re.escape(long_form) + r"(?!\w)")
            # A callable replacement keeps backslashes in short_form literal.
            text, count = pattern.subn(lambda _match: short_form, text)
            if count == 0:
                continue
        else:
            text = text.replace(long_form, short_form)
        print("Replacing", long_form, '-->', short_form)
    return text
# Build the 'dictionary' of translations (as a Pandas DataFrame).
# Both sources are labelled ["from", "to"] before being combined, so their
# columns line up by position even if the scraped tables carry different headers.
df = parse_garbl()
df.columns = ["from", "to"]
brockway_df = parse_brockway()
brockway_df.columns = ["from", "to"]
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df = pd.concat([df, brockway_df], ignore_index=True)
# Convert every cell to str, just in case (missing cells become 'nan').
# Series.map is used because DataFrame.applymap is deprecated.
df = df.apply(lambda column: column.map(str))
print(df)
# Specify some input text: a local file if it exists, otherwise text grabbed online.
filename = 'my_essay.txt'
if os.path.isfile(filename):
    print("Reading from", filename)
    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, 'r', encoding='utf-8') as infile:
        old_text = infile.read()
else:
    # Grab some text from online, e.g. "Collected works of William Hazlitt".
    # requests is imported here because only this fallback path needs it.
    import requests
    url = 'https://www.gutenberg.org/files/55932/55932-0.txt'
    print("Grabbing some text from", url)
    # The timeout stops the script hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error rather than "shortening" an error page.
    r.raise_for_status()
    old_text = r.text
# Now apply the shortening
new_text = apply_dict(old_text, df)
print("\n")
print("Before processing, text length =", len(old_text), "characters")
print("After processing, text length =", len(new_text), "characters")
# Save to new text file. UTF-8 is given explicitly because the platform's
# default encoding may be unable to represent every character in the text.
with open("my_essay_out.txt", "w", encoding="utf-8") as outfile:
    outfile.write(new_text)
# EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment