Skip to content

Instantly share code, notes, and snippets.

@negedng
Last active May 13, 2021 20:20
Show Gist options
  • Save negedng/b2a1336a2316fcbd314f1c6c4af01bf9 to your computer and use it in GitHub Desktop.
Get vocab for a movie
# Parameters
language = 'es'  # Tested: 'en' or 'es'
text_file_path = 'movie_subtitle.xml'
known_words_path = 'known_word_list.txt'  # one word per line
# More about universal part-of-speech: https://universaldependencies.org/u/pos/
skip_upos = ['PUNCT', 'PRON', 'DET', 'ADP', 'SYM', 'X']
most_common = 30  # how many top unknown lemmas to print

# Loading dependencies
import re
import stanza
from collections import Counter

# Downloads the Stanza model for `language` on first run (network access).
stanza.download(language)
# Fix: original read `nlp = nlp = stanza.Pipeline(language)` — a duplicated
# assignment; a single binding is intended.
nlp = stanza.Pipeline(language)
# Helper functions
def cleanhtml(raw_html):
    """Return *raw_html* with every HTML/XML tag (``<...>``) removed."""
    # Non-greedy match so adjacent tags are stripped individually.
    return re.sub('<.*?>', '', raw_html)
def preprocess(data):
    """Strip markup from subtitle text and reshape bracket annotations.

    Tags are removed via cleanhtml; '[' becomes a newline and ']' a colon,
    turning subtitle annotations like "[SPEAKER] line" into "SPEAKER: line".
    """
    stripped = cleanhtml(data)
    stripped = stripped.replace('[', '\n')
    return stripped.replace(']', ':')
def count_words(nlp_processed, skip_upos, skip_words=None):
    """Count lemma occurrences in a Stanza-processed document.

    Parameters
    ----------
    nlp_processed : object
        A Stanza Document (anything exposing ``iter_tokens()`` whose tokens
        carry ``ner`` and ``words`` with ``upos``/``lemma`` attributes).
    skip_upos : iterable of str
        Universal POS tags whose words are excluded from the count.
    skip_words : iterable of str, optional
        Lemmas to ignore (e.g. already-known words). Defaults to none.

    Returns
    -------
    collections.Counter
        Mapping of lemma -> number of occurrences.
    """
    # Fix: the original used a mutable default argument (skip_words=[]).
    # Sets give O(1) membership tests instead of O(n) list scans.
    upos_to_skip = set(skip_upos)
    words_to_skip = set(skip_words) if skip_words is not None else set()
    occurs = Counter()
    for token in nlp_processed.iter_tokens():
        # 'O' means the token is outside any named entity; named entities
        # (people, places, ...) are not vocabulary to learn.
        if token.ner == 'O':
            for word in token.words:
                if word.upos not in upos_to_skip and word.lemma not in words_to_skip:
                    occurs[word.lemma] += 1
    return occurs
def main():
    """Print the `most_common` most frequent unknown lemmas of the movie.

    Reads the known-word list and the subtitle file from the module-level
    paths, runs the Stanza pipeline, and prints (lemma, count) pairs.
    """
    with open(known_words_path, 'r') as handle:
        known_words = [line.strip() for line in handle]
    with open(text_file_path, 'r') as handle:
        raw_text = handle.read()
    doc = nlp(preprocess(raw_text))
    occurs = count_words(doc, skip_upos, known_words)
    print(occurs.most_common(most_common))

if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment