Skip to content

Instantly share code, notes, and snippets.

@singerng
Created November 1, 2022 18:32
Show Gist options
  • Save singerng/986b01c9a9dece7934d81ba97b515f65 to your computer and use it in GitHub Desktop.
Save singerng/986b01c9a9dece7934d81ba97b515f65 to your computer and use it in GitHub Desktop.
Python script for parsing and ranking unknown words, using BeautifulSoup and NLTK
from bs4 import BeautifulSoup
from nltk.stem import SnowballStemmer
from nltk.corpus import cess_esp as corpus
from nltk import FreqDist
import csv
# Collect the highlighted words from the Kindle notebook export "kindle.html".
# Every <div class="noteText"> element is treated as one highlighted word.

# Translation table that deletes the punctuation characters that may have
# been captured together with a highlight.
_PUNCTUATION = str.maketrans("", "", ",.;")

words = []
# Decode explicitly instead of relying on the platform default encoding,
# which may not handle the accented characters of Spanish text.
# NOTE(review): assumes the export is UTF-8 encoded - confirm.
with open("kindle.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

for hit in soup.find_all('div', {'class': 'noteText'}):
    if not hit.contents:
        # An empty highlight has no first child to read.
        continue
    first = hit.contents[0]
    # The first child is normally a text node (a str subclass); if it is a
    # nested tag instead, use the text inside that tag.
    text = first if isinstance(first, str) else first.get_text()
    # Surrounding whitespace would otherwise become part of the word that is
    # later handed to the stemmer.
    word = text.translate(_PUNCTUATION).strip()
    if word:
        words.append(word)

print("Total words: {}".format(len(words)))
# Reduce every highlighted word to its Spanish stem.
stemmer = SnowballStemmer('spanish')

# Map each stem to one example word. When several words share a stem, the
# word that appears last in `words` is the one that is kept.
stems_to_words = {stemmer.stem(entry): entry for entry in words}

# The distinct stems are exactly the keys of the mapping.
stems = set(stems_to_words)

print("Unique stems: {}".format(len(stems)))
# Stem every token of the reference corpus so that its frequencies can be
# looked up by the same stems computed for the highlighted words.
corpus_stems = [stemmer.stem(token) for token in corpus.words()]

unique_reference_stems = set(corpus_stems)
print("Unique stems (reference): {}".format(len(unique_reference_stems)))

# Number of occurrences of each stem in the reference corpus.
fd = FreqDist(corpus_stems)
# Write one CSV row per unique stem: an example word, the stem itself, and the
# stem's count in the reference corpus (left blank when the corpus does not
# contain the stem).
# The encoding is explicit so that accented words are written the same way on
# every platform instead of depending on the locale default.
with open("kindle_words.csv", 'w', newline='', encoding="utf-8") as outfile:
    outwriter = csv.writer(outfile, delimiter=',')
    # Iterate in sorted order: `stems` is a set of strings, whose iteration
    # order can differ from run to run, which would shuffle the rows.
    outwriter.writerows(
        [stems_to_words[stem], stem, fd.get(stem, "")]
        for stem in sorted(stems)
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment