Calculate longest subsequences from Unsong's chapter initials to get a hint about what, if anything, it's a notarikon for
#!/usr/bin/env python3
"""Parses the text of Unsong, and the list of chapter initials, and finds
longest common subsequences. The idea here is that if the notarikon formed
by the first letter of each chapter conceals a secret message, then perhaps
parts of that message have already been revealed in the book... so if
"NIEAC" were somewhere in the list of chapter initials, then that probably
stands for "Nothing Is Ever A Coincidence", as per the many times that phrase
is mentioned. Idea from 75thTrombone at
https://www.reddit.com/r/unsong/comments/69k1im/preemptive_final_exam_suggestion/
This script calculates those substrings. The answers are not all that promising.
It requires a file, Unsong.html, which is the text of the book with fairly plain
formatting. This file is created by my epub builder for Unsong, which lives at
https://github.com/stuartlangridge/unsong-book-fetcher.
"""
from bs4 import BeautifulSoup
import re
import difflib
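
# third-party dependencies: beautifulsoup4 and lxml
# (pip install beautifulsoup4 lxml)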
def load_soup():
    with open("Unsong.html") as fp:
        data = fp.read()
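    # "lxml" is the fast third-party parser; BeautifulSoup's built-in
    # "html.parser" would also work here, just more slowly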
    soup = BeautifulSoup(data, "lxml")
    return soup
def get_initials_of_chapters():
    ci = []
    soup = load_soup()
    chapter_headings = soup.find_all("h2")
    for ch in chapter_headings:
        if "Chapter" not in ch.text: continue
        content = ch.parent.find_all("div", "pjgm-postcontent")
        if not content:
            continue
        # remove all blockquotes, which begin chapters
        for bq in content[0].find_all("blockquote"):
            bq.extract()
        # remove all font tags (this is a bit of a bodge, but it
        # removes the dates at the start)
        for fnt in content[0].find_all("font"):
            fnt.extract()
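        # the chapter's initial is the first alphabetic character of its
        # remaining opening text; a 100-character prefix is taken on the
        # assumption that the first letter falls within it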
        txt = content[0].text[0:100].strip().upper()
        txt = re.sub(r"[^A-Z]", "", txt)
        ci.append(txt[0])
    return ci
def get_initials_of_book():
    bi = []
    soup = load_soup()
    chapter_headings = soup.find_all("h2")
    for ch in chapter_headings:
        if "Chapter" not in ch.text: continue
        content = ch.parent.find_all("div", "pjgm-postcontent")
        if not content:
            continue
        # remove all blockquotes, which begin chapters
        for bq in content[0].find_all("blockquote"):
            bq.extract()
        # remove all font tags (this is a bit of a bodge, but it removes the dates at the start)
        for fnt in content[0].find_all("font"):
            fnt.extract()
        # now, get all text
        txt = " ".join([c.text.strip() for c in content])
        txt = txt.replace("\n", " ").replace("…", " ")
        txt = re.sub(r"[^A-Za-z'’ ]", "", txt)
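        # keep (initial, word, chapter title) triples, so a match can later
        # be reported along with the words it stands for and where they occur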
        initials = [(x[0].upper(), x, ch.text) for x in txt.split()]
        bi += initials
    return bi
def find_lcs(ci, bi):
    cistr = "".join(ci)
    bistr = "".join([x[0] for x in bi])
    # we have to put the longer one first
    s = difflib.SequenceMatcher(None, bistr, cistr)
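    # get_matching_blocks() yields (a, b, size) triples meaning
    # bistr[a:a+size] == cistr[b:b+size]; they are sorted here so the
    # longest common runs are printed first, and the zero-length sentinel
    # block at the end is discarded by the length check below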
    for m in sorted(s.get_matching_blocks(), key=lambda a: a[2], reverse=True):
        bistart, cistart, length = m
        if length < 3: continue
        cisubstr = cistr[cistart:cistart+length]
        cispaces = " " * cistart
        bisubstr = []
        for i in range(bistart, bistart+length):
            bisubstr.append(bi[i][1])
        bisubstr = " ".join(bisubstr)
        bichapter = bi[bistart][2]
        print('%s\n%s%s\n"%s" (%s)' % (
            cistr, cispaces, cisubstr, bisubstr, bichapter
        ))
        print()
if __name__ == "__main__":
    chapter_initials = get_initials_of_chapters()
    book_initials = get_initials_of_book()
    find_lcs(chapter_initials, book_initials)
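
As a quick sanity check of the matching step, here is a minimal, self-contained sketch of what difflib.SequenceMatcher reports; the initial strings are made up for illustration, not taken from the book:

#!/usr/bin/env python3
"""Toy demo of the get_matching_blocks() logic used in find_lcs above."""
import difflib

ci = "QNIEACZ"     # hypothetical chapter-initial string
bi = "XXNIEACYY"   # hypothetical book-word-initial string

s = difflib.SequenceMatcher(None, bi, ci)
for a, b, size in s.get_matching_blocks():
    if size == 0:
        continue  # skip the zero-length sentinel block at the end
    # bi[a:a+size] == ci[b:b+size] is guaranteed for every block
    print("match %r at bi[%d], ci[%d]" % (bi[a:a+size], a, b))
# prints: match 'NIEAC' at bi[2], ci[1]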