negedng · May 13, 2021 20:20
diff --git a/subtitles2vocab.py b/subtitles2vocab.py
 # Parameters

 # Language code handed to stanza; tested with 'en' and 'es'.
 language = 'es'
 # Text file whose vocabulary is counted.
 text_file_path = 'movie_subtitle.xml'
 # Words to leave out of the result, one word per line.
 known_words_path = 'known_word_list.txt'
 # Universal part-of-speech tags that are never counted; see
 # https://universaldependencies.org/u/pos/
 skip_upos = [
     'PUNCT',
     'PRON',
     'DET',
     'ADP',
     'SYM',
     'X',
 ]
 # Number of entries requested from the frequency ranking.
 most_common = 30

 # Loading dependencies
 import re
 import stanza
 from collections import Counter

 # Fetch the stanza models for the configured language, then build the
 # pipeline object that is later called on the preprocessed text.
 stanza.download(language)
 nlp = stanza.Pipeline(language)

 # Helper functions
 def cleanhtml(raw_html):
     """Return *raw_html* with every ``<...>`` tag removed.

     The match is non-greedy, so each tag is removed on its own. ``.`` does
     not match a newline here, which means a tag broken across two lines is
     left in the text.
     """
     return re.compile('<.*?>').sub('', raw_html)


 def preprocess(data):
     """Prepare raw subtitle text for the NLP pipeline.

     Tags are stripped with ``cleanhtml``; afterwards every ``[`` becomes a
     newline and every ``]`` becomes a colon.
     NOTE(review): presumably this turns bracketed cues such as "[MUSIC]"
     into their own "MUSIC:" line -- confirm against real subtitle input.
     """
     without_tags = cleanhtml(data)
     bracket_map = str.maketrans({'[': '\n', ']': ':'})
     return without_tags.translate(bracket_map)


 def count_words(nlp_processed, skip_upos, skip_words=()):
     """Count how often each lemma occurs in a processed document.

     Args:
         nlp_processed: Object exposing ``iter_tokens()``. Every token must
             provide ``ner`` and ``words``; every word must provide ``upos``
             and ``lemma``. (Presumably a stanza ``Document`` -- confirm
             against the caller.)
         skip_upos: Container of part-of-speech tags; words carrying one of
             them are not counted.
         skip_words: Iterable of lemmas to leave out of the count. Defaults
             to an empty tuple instead of a shared mutable list.

     Returns:
         A ``Counter`` mapping each counted lemma to its frequency.
     """
     # Materialise once: membership tests become constant-time, and a
     # one-shot iterable is no longer used up by the first lookup.
     excluded = frozenset(skip_words)
     occurs = Counter()

     for token in nlp_processed.iter_tokens():
         # Only tokens whose NER tag is 'O' are counted (presumably the
         # tokens outside any named entity -- confirm with the stanza docs).
         if token.ner != 'O':
             continue
         for word in token.words:
             if word.upos not in skip_upos and word.lemma not in excluded:
                 occurs[word.lemma] += 1
     return occurs


 def main():
     """Print the most frequent lemmas of the text that are not already known.

     Reads the known-word list from ``known_words_path`` (one word per line)
     and the text from ``text_file_path``, runs the text through
     ``preprocess`` and the ``nlp`` pipeline, counts the lemmas with
     ``count_words`` and prints the top ``most_common`` entries.
     """
     # Explicit encoding: the platform default would garble accented
     # characters on systems whose locale is not UTF-8.
     # NOTE(review): assumes both input files are UTF-8 -- confirm.
     with open(known_words_path, 'r', encoding='utf-8') as f:
         known_words = [line.strip() for line in f]

     with open(text_file_path, 'r', encoding='utf-8') as f:
         text = f.read()

     doc = nlp(preprocess(text))
     occurs = count_words(doc, skip_upos, known_words)

     print(occurs.most_common(most_common))


 if __name__ == '__main__':
     main()
	# Parameters

	# Language code handed to stanza; tested with 'en' and 'es'.
	language = 'es'
	# Text file whose vocabulary is counted.
	text_file_path = 'movie_subtitle.xml'
	# Words to leave out of the result, one word per line.
	known_words_path = 'known_word_list.txt'
	# Universal part-of-speech tags that are never counted; see
	# https://universaldependencies.org/u/pos/
	skip_upos = [
	    'PUNCT',
	    'PRON',
	    'DET',
	    'ADP',
	    'SYM',
	    'X',
	]
	# Number of entries requested from the frequency ranking.
	most_common = 30

	# Loading dependencies
	import re
	import stanza
	from collections import Counter

	# Fetch the stanza models for the configured language, then build the
	# pipeline object that is later called on the preprocessed text.
	stanza.download(language)
	nlp = stanza.Pipeline(language)

	# Helper functions
	def cleanhtml(raw_html):
	    """Return *raw_html* with every ``<...>`` tag removed.

	    The match is non-greedy, so each tag is removed on its own. ``.``
	    does not match a newline here, which means a tag broken across two
	    lines is left in the text.
	    """
	    return re.compile('<.*?>').sub('', raw_html)


	def preprocess(data):
	    """Prepare raw subtitle text for the NLP pipeline.

	    Tags are stripped with ``cleanhtml``; afterwards every ``[`` becomes
	    a newline and every ``]`` becomes a colon.
	    NOTE(review): presumably this turns bracketed cues such as "[MUSIC]"
	    into their own "MUSIC:" line -- confirm against real subtitle input.
	    """
	    data = cleanhtml(data)
	    data = data.replace('[', '\n').replace(']', ':')
	    return data


	def count_words(nlp_processed, skip_upos, skip_words=()):
	    """Count how often each lemma occurs in a processed document.

	    Args:
	        nlp_processed: Object exposing ``iter_tokens()``. Every token
	            must provide ``ner`` and ``words``; every word must provide
	            ``upos`` and ``lemma``. (Presumably a stanza ``Document`` --
	            confirm against the caller.)
	        skip_upos: Container of part-of-speech tags; words carrying one
	            of them are not counted.
	        skip_words: Iterable of lemmas to leave out of the count.
	            Defaults to an empty tuple instead of a shared mutable list.

	    Returns:
	        A ``Counter`` mapping each counted lemma to its frequency.
	    """
	    # Materialise once: membership tests become constant-time, and a
	    # one-shot iterable is no longer used up by the first lookup.
	    excluded = frozenset(skip_words)
	    occurs = Counter()

	    for token in nlp_processed.iter_tokens():
	        # Only tokens whose NER tag is 'O' are counted (presumably the
	        # tokens outside any named entity -- confirm with the stanza docs).
	        if token.ner != 'O':
	            continue
	        for word in token.words:
	            if word.upos not in skip_upos and word.lemma not in excluded:
	                occurs[word.lemma] += 1
	    return occurs


	def main():
	    """Print the most frequent lemmas of the text that are not already known.

	    Reads the known-word list from ``known_words_path`` (one word per
	    line) and the text from ``text_file_path``, runs the text through
	    ``preprocess`` and the ``nlp`` pipeline, counts the lemmas with
	    ``count_words`` and prints the top ``most_common`` entries.
	    """
	    # Explicit encoding: the platform default would garble accented
	    # characters on systems whose locale is not UTF-8.
	    # NOTE(review): assumes both input files are UTF-8 -- confirm.
	    with open(known_words_path, 'r', encoding='utf-8') as f:
	        known_words = [line.strip() for line in f]

	    with open(text_file_path, 'r', encoding='utf-8') as f:
	        text = f.read()

	    doc = nlp(preprocess(text))
	    occurs = count_words(doc, skip_upos, known_words)

	    print(occurs.most_common(most_common))


	if __name__ == '__main__':
	    main()