Skip to content

Instantly share code, notes, and snippets.

@sevperez
Created August 24, 2020 21:41
Show Gist options
  • Save sevperez/f5031e6b0b3fa37473a05113d9e2328b to your computer and use it in GitHub Desktop.
Save sevperez/f5031e6b0b3fa37473a05113d9e2328b to your computer and use it in GitHub Desktop.
def print_word_info(word):
    """
    Print the basic annotations of a single word, one per line.

    - Parameters: word (any object exposing text, lemma, upos and xpos
      attributes -- presumably a Stanza Word object; verify against caller)
    - Returns: None. Four tab-separated "label / value" lines are written
      to standard output.
    """
    print(f"Text:\t{word.text}")
    print(f"Lemma: \t{word.lemma}")
    print(f"UPOS: \t{word.upos}")
    print(f"XPOS: \t{word.xpos}")
# Example: print the annotations of the word at index 4 of the sentence at
# index 3 in moby_p1. NOTE(review): moby_p1 is defined outside this snippet --
# presumably an already-processed Stanza Document; confirm.
# Expected output:
print_word_info(moby_p1.sentences[3].words[4])
# Text: growing
# Lemma: grow
# UPOS: VERB
# XPOS: VBG
def word_info_df(doc):
    """
    Collect the per-word annotations of a document into a table.

    - Parameters: doc (a Stanza Document object)
    - Returns: A Pandas DataFrame object with one row for each word in
      doc (sentence by sentence, in order), and columns for text, lemma,
      upos, and xpos. A document without any words yields an empty
      DataFrame that still has these four columns.
    """
    columns = ["text", "lemma", "upos", "xpos"]
    rows = []
    for sentence in doc.sentences:
        for word in sentence.words:
            rows.append(
                {
                    "text": word.text,
                    "lemma": word.lemma,
                    "upos": word.upos,
                    "xpos": word.xpos,
                }
            )
    # Passing the columns explicitly keeps the schema stable even when
    # `rows` is empty (otherwise the result would have no columns at all).
    return pd.DataFrame(rows, columns=columns)
# Example: build the word table for moby_p1. NOTE(review): moby_p1 is defined
# outside this snippet -- presumably an already-processed Stanza Document;
# confirm. The result is not assigned here, so it is only visible where bare
# expressions are echoed (e.g. a notebook or REPL). First rows of the result:
word_info_df(moby_p1)
# text lemma upos xpos
# 0 Call call VERB VB
# 1 me I PRON PRP
# 2 Ishmael Ishmael PROPN NNP
# 3 . . PUNCT .
# 4 Some some DET DT
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment