Created November 1, 2022 18:32
-
-
Save singerng/986b01c9a9dece7934d81ba97b515f65 to your computer and use it in GitHub Desktop.
Python script for parsing and ranking unknown words, using BeautifulSoup and NLTK
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv

from bs4 import BeautifulSoup
from nltk import FreqDist
from nltk.corpus import cess_esp as corpus
from nltk.stem import SnowballStemmer
# Characters removed from every highlight before it is stemmed.
PUNCTUATION_TO_STRIP = str.maketrans("", "", ",.;")


def clean_word(text):
    """Return *text* without commas, periods, semicolons or outer whitespace.

    Surrounding whitespace is dropped so that a highlight captured with a
    trailing space or newline stems the same way as the bare word.
    """
    return text.translate(PUNCTUATION_TO_STRIP).strip()


def extract_words(html):
    """Return the cleaned highlight text of every ``noteText`` div in *html*.

    Only the first child of each div is used, as in the original script.
    NOTE(review): presumably later children are not part of the highlight;
    verify against a real Kindle export.

    Divs that are empty, that start with a nested tag instead of text, or
    whose text is empty after cleaning are skipped rather than crashing or
    producing an empty word.
    """
    soup = BeautifulSoup(html, 'html.parser')
    words = []
    for hit in soup.find_all('div', {'class': 'noteText'}):
        first = hit.contents[0] if hit.contents else None
        # bs4 text nodes are str subclasses; a tag (or nothing) is not.
        if not isinstance(first, str):
            continue
        word = clean_word(first)
        if word:
            words.append(word)
    return words


def index_by_stem(words, stem):
    """Map each stem to a word that produced it.

    *stem* is a callable applied once per word. When several words share a
    stem the last one wins; keys keep the order in which stems first appear.
    """
    return {stem(word): word for word in words}


def main():
    """Rank the highlighted words of ``kindle.html`` against the corpus.

    Reads ``kindle.html`` from the working directory, prints three summary
    counts, and writes ``kindle_words.csv`` with one row per unique stem:
    the word, its stem, and the stem's frequency in the reference corpus
    (empty when the corpus does not contain the stem).
    """
    # Explicit encoding: the platform default may not be able to decode
    # accented characters. Assumes the export is UTF-8 -- TODO confirm.
    with open("kindle.html", encoding="utf-8") as f:
        words = extract_words(f.read())
    print("Total words: {}".format(len(words)))

    stemmer = SnowballStemmer('spanish')
    stems_to_words = index_by_stem(words, stemmer.stem)
    print("Unique stems: {}".format(len(stems_to_words)))

    # Count stems straight from the corpus; len(fd) is the number of
    # distinct stems, so no intermediate list or set is needed.
    fd = FreqDist(stemmer.stem(word) for word in corpus.words())
    print("Unique stems (reference): {}".format(len(fd)))

    with open("kindle_words.csv", 'w', newline='', encoding="utf-8") as outfile:
        outwriter = csv.writer(outfile, delimiter=',')
        # Dict order (first appearance of each stem) keeps the output
        # reproducible between runs, unlike iterating a set of strings.
        for stem, word in stems_to_words.items():
            outwriter.writerow([word, stem, fd[stem] if stem in fd else ""])


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.