david-wm-sanders · February 18, 2018 22:25
diff --git a/ttml_sub_extract.py b/ttml_sub_extract.py
 import csv
 import string
 import sys
 from collections import Counter
 from pathlib import Path

 from bs4 import BeautifulSoup


 # Extract the subtitle lines of one style from a TTML file, write them to a
 # CSV next to this script and print a word-frequency list.
 #
 # Command line: <ttml_file> <style>. Both are required; any further
 # arguments are ignored.
 if len(sys.argv) < 3:
     sys.exit(f"usage: {Path(__file__).name} <ttml_file> <style>")
 file_a, style_a = sys.argv[1], sys.argv[2]

 # The input path is resolved relative to the directory holding this script.
 sub_p = Path(__file__).parent / file_a
 with sub_p.open(mode="r", encoding="utf-8") as f:
     html = f.read()

 soup = BeautifulSoup(html, "html.parser")
 body = soup.body
 if body is None:
     # Without this check the next line fails with an opaque AttributeError.
     sys.exit(f"{sub_p}: no <body> element found")

 # One (begin, end, text) row per <p> that contains at least one <span> whose
 # "style" attribute equals the requested style. The text of the matching
 # spans is joined with single spaces; spans of other styles are dropped.
 paragraphs = []
 for p in body.find_all("p"):
     matching = [s.get_text() for s in p.find_all("span")
                 if s.get("style") == style_a]
     if matching:
         paragraphs.append((p.get("begin"), p.get("end"), " ".join(matching)))

 # The CSV is written beside this script and named after both arguments.
 csv_path = Path(__file__).parent / Path(f"{file_a}_{style_a}.csv")
 with csv_path.open("w", encoding="utf-8", newline="") as csv_file:
     writer = csv.writer(csv_file, quoting=csv.QUOTE_NONNUMERIC)
     writer.writerow(["begin", "end", "text"])
     writer.writerows(paragraphs)

 # Word frequencies over the extracted text: split on whitespace, strip
 # surrounding ASCII punctuation, lower-case.
 c = Counter()
 for _, _, text in paragraphs:
     for raw in text.split():
         w = raw.strip(string.punctuation).lower()
         # A token made only of punctuation (e.g. "-" or "...") strips down
         # to "", which is not a word; keep it out of the tally.
         if w:
             c[w] += 1

 print(c.most_common())
	import csv
	import string
	import sys
	from collections import Counter
	from pathlib import Path

	from bs4 import BeautifulSoup


	# Extract the subtitle lines of one style from a TTML file, write them to a
	# CSV next to this script and print a word-frequency list.
	#
	# Command line: <ttml_file> <style>. Both are required; any further
	# arguments are ignored.
	if len(sys.argv) < 3:
	    sys.exit(f"usage: {Path(__file__).name} <ttml_file> <style>")
	file_a, style_a = sys.argv[1], sys.argv[2]

	# The input path is resolved relative to the directory holding this script.
	sub_p = Path(__file__).parent / file_a
	with sub_p.open(mode="r", encoding="utf-8") as f:
	    html = f.read()

	soup = BeautifulSoup(html, "html.parser")
	body = soup.body
	if body is None:
	    # Without this check the next line fails with an opaque AttributeError.
	    sys.exit(f"{sub_p}: no <body> element found")

	# One (begin, end, text) row per <p> that contains at least one <span> whose
	# "style" attribute equals the requested style. The text of the matching
	# spans is joined with single spaces; spans of other styles are dropped.
	paragraphs = []
	for p in body.find_all("p"):
	    matching = [s.get_text() for s in p.find_all("span")
	                if s.get("style") == style_a]
	    if matching:
	        paragraphs.append((p.get("begin"), p.get("end"), " ".join(matching)))

	# The CSV is written beside this script and named after both arguments.
	csv_path = Path(__file__).parent / Path(f"{file_a}_{style_a}.csv")
	with csv_path.open("w", encoding="utf-8", newline="") as csv_file:
	    writer = csv.writer(csv_file, quoting=csv.QUOTE_NONNUMERIC)
	    writer.writerow(["begin", "end", "text"])
	    writer.writerows(paragraphs)

	# Word frequencies over the extracted text: split on whitespace, strip
	# surrounding ASCII punctuation, lower-case.
	c = Counter()
	for _, _, text in paragraphs:
	    for raw in text.split():
	        w = raw.strip(string.punctuation).lower()
	        # A token made only of punctuation (e.g. "-" or "...") strips down
	        # to "", which is not a word; keep it out of the tally.
	        if w:
	            c[w] += 1

	print(c.most_common())