Skip to content

Instantly share code, notes, and snippets.

@david-wm-sanders
Created February 18, 2018 22:25
Show Gist options
  • Save david-wm-sanders/0a4f001062e02a0675a179350cb75334 to your computer and use it in GitHub Desktop.
Save david-wm-sanders/0a4f001062e02a0675a179350cb75334 to your computer and use it in GitHub Desktop.
import csv
import string
import sys
from collections import Counter
from pathlib import Path
from bs4 import BeautifulSoup
def extract_paragraphs(html, style):
    """Collect the text of every <p> that has a <span> with the given style.

    Args:
        html: Markup of the document to scan, as a string.
        style: Value of the ``style`` attribute a <span> must carry to be
            included.

    Returns:
        A list of ``(begin, end, text)`` tuples, one per matching <p>.
        ``begin``/``end`` are the paragraph's attributes of the same name
        (``None`` when absent); ``text`` is the space-joined text of the
        matching spans only.
    """
    soup = BeautifulSoup(html, "html.parser")
    # html.parser does not invent a <body> for documents that lack one, so
    # fall back to searching the whole tree instead of crashing on None.
    root = soup.body if soup.body is not None else soup

    paragraphs = []
    for p in root.find_all("p"):
        matching = [span.get_text() for span in p.find_all("span")
                    if span.get("style") == style]
        if matching:
            paragraphs.append((p.get("begin"), p.get("end"),
                               " ".join(matching)))
    return paragraphs


def write_csv(csv_path, paragraphs):
    """Write ``(begin, end, text)`` rows to *csv_path* under a header row.

    Args:
        csv_path: ``pathlib.Path`` of the file to create or overwrite.
        paragraphs: Iterable of ``(begin, end, text)`` tuples.
    """
    # newline="" lets the csv module control line endings itself.
    with csv_path.open("w", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file, quoting=csv.QUOTE_NONNUMERIC)
        writer.writerow(["begin", "end", "text"])
        writer.writerows(paragraphs)


def count_words(paragraphs):
    """Count case-insensitive word frequencies over the paragraph texts.

    Words are whitespace-separated tokens with leading/trailing ASCII
    punctuation removed and then lower-cased. Tokens that consist only of
    punctuation (e.g. a lone "-") are skipped rather than counted as "".

    Args:
        paragraphs: Iterable of ``(begin, end, text)`` tuples.

    Returns:
        A ``collections.Counter`` mapping each word to its occurrences.
    """
    counts = Counter()
    for _begin, _end, text in paragraphs:
        stripped = (w.strip(string.punctuation).lower() for w in text.split())
        counts.update(w for w in stripped if w)
    return counts


def main(argv=None):
    """Extract styled paragraphs to CSV and print their word frequencies.

    Args:
        argv: Optional argument list ``[file, style, ...]``; defaults to
            ``sys.argv[1:]``. ``file`` is resolved relative to this script's
            directory and the CSV is written next to it as
            ``<file>_<style>.csv``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit(f"usage: {Path(sys.argv[0]).name} FILE STYLE")
    file_a, style_a = args[0], args[1]

    base_dir = Path(__file__).parent
    html = (base_dir / file_a).read_text(encoding="utf-8")
    paragraphs = extract_paragraphs(html, style_a)

    write_csv(base_dir / f"{file_a}_{style_a}.csv", paragraphs)
    print(count_words(paragraphs).most_common())


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment