singerng · November 1, 2022 18:32
diff --git a/kindle.py b/kindle.py
 from bs4 import BeautifulSoup
 from nltk.stem import SnowballStemmer
 from nltk.corpus import cess_esp as corpus
 from nltk import FreqDist
 import csv

 # Collect the text of every highlighted note in the exported notebook page,
 # with commas, periods and semicolons removed.
 words = []
 punctuation = str.maketrans("", "", ",.;")

 # Explicit encoding so accented characters are decoded the same way on every
 # platform instead of following the locale default.
 # NOTE(review): assumes the export is UTF-8 -- confirm against a real file.
 with open("kindle.html", encoding="utf-8") as f:
 	soup = BeautifulSoup(f.read(), 'html.parser')

 hits = soup.find_all('div', {'class': 'noteText'})

 for hit in hits:
 	# An empty noteText div has no first child to read; skip it rather
 	# than abort the whole run with an IndexError.
 	if not hit.contents:
 		continue
 	words.append(hit.contents[0].translate(punctuation))

 print("Total words: {}".format(len(words)))

 stemmer = SnowballStemmer('spanish')

 # Map each stem to a word that produced it. When several words share a stem
 # the last one seen is kept. Each word is stemmed exactly once.
 stems_to_words = {}

 for word in words:
 	stems_to_words[stemmer.stem(word)] = word

 stems = set(stems_to_words)

 print("Unique stems: {}".format(len(stems)))

 # Stem the reference corpus so its frequencies are keyed the same way as the
 # highlighted words.
 corpus_stems = [stemmer.stem(word) for word in corpus.words()]

 print("Unique stems (reference): {}".format(len(set(corpus_stems))))

 fd = FreqDist(corpus_stems)

 # One CSV row per stem: a source word, the stem, and the stem's count in the
 # reference corpus.
 with open("kindle_words.csv", 'w', newline='', encoding="utf-8") as outfile:
 	outwriter = csv.writer(outfile, delimiter=',')

 	# Sorted so the row order is reproducible; set iteration order is not.
 	for stem in sorted(stems):
 		# Leave the count blank for stems the corpus does not contain.
 		outwriter.writerow([stems_to_words[stem], stem, fd[stem] if stem in fd else ""])
	from bs4 import BeautifulSoup
	from nltk.stem import SnowballStemmer
	from nltk.corpus import cess_esp as corpus
	from nltk import FreqDist
	import csv

	# Collect the text of every highlighted note in the exported notebook page,
	# with commas, periods and semicolons removed.
	words = []
	punctuation = str.maketrans("", "", ",.;")

	# Explicit encoding so accented characters are decoded the same way on every
	# platform instead of following the locale default.
	# NOTE(review): assumes the export is UTF-8 -- confirm against a real file.
	with open("kindle.html", encoding="utf-8") as f:
		soup = BeautifulSoup(f.read(), 'html.parser')

	hits = soup.find_all('div', {'class': 'noteText'})

	for hit in hits:
		# An empty noteText div has no first child to read; skip it rather
		# than abort the whole run with an IndexError.
		if not hit.contents:
			continue
		words.append(hit.contents[0].translate(punctuation))

	print("Total words: {}".format(len(words)))

	stemmer = SnowballStemmer('spanish')

	# Map each stem to a word that produced it. When several words share a stem
	# the last one seen is kept. Each word is stemmed exactly once.
	stems_to_words = {}

	for word in words:
		stems_to_words[stemmer.stem(word)] = word

	stems = set(stems_to_words)

	print("Unique stems: {}".format(len(stems)))

	# Stem the reference corpus so its frequencies are keyed the same way as the
	# highlighted words.
	corpus_stems = [stemmer.stem(word) for word in corpus.words()]

	print("Unique stems (reference): {}".format(len(set(corpus_stems))))

	fd = FreqDist(corpus_stems)

	# One CSV row per stem: a source word, the stem, and the stem's count in the
	# reference corpus.
	with open("kindle_words.csv", 'w', newline='', encoding="utf-8") as outfile:
		outwriter = csv.writer(outfile, delimiter=',')

		# Sorted so the row order is reproducible; set iteration order is not.
		for stem in sorted(stems):
			# Leave the count blank for stems the corpus does not contain.
			outwriter.writerow([stems_to_words[stem], stem, fd[stem] if stem in fd else ""])
No results found