Created
August 24, 2020 21:41
-
-
Save sevperez/f5031e6b0b3fa37473a05113d9e2328b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def print_word_info(word):
    """
    Print the text, lemma, UPOS tag, and XPOS tag of a single word.

    - Parameters: word (an object exposing `text`, `lemma`, `upos`, and
      `xpos` attributes, e.g. a Stanza Word object)
    - Returns: None. Four lines are written to standard output, one per
      attribute, each as a label followed by a tab and the value.
    """
    print(f"Text:\t{word.text}")
    print(f"Lemma: \t{word.lemma}")
    print(f"UPOS: \t{word.upos}")
    print(f"XPOS: \t{word.xpos}")
print_word_info(moby_p1.sentences[3].words[4])
# Text: growing
# Lemma: grow
# UPOS: VERB
# XPOS: VBG
def word_info_df(doc):
    """
    Collect per-word annotations from a document into a table.

    - Parameters: doc (a Stanza Document object; only `doc.sentences` and
      each sentence's `words` are read)
    - Returns: A Pandas DataFrame object with one row for each word in
      doc (in sentence order, then word order), and columns for text,
      lemma, upos, and xpos. A doc without any words yields an empty
      DataFrame that still has those four columns.
    """
    columns = ("text", "lemma", "upos", "xpos")
    rows = []
    for sentence in doc.sentences:
        rows.extend(
            {name: getattr(word, name) for name in columns}
            for word in sentence.words
        )
    # Passing the columns explicitly keeps the schema stable even when
    # `rows` is empty; otherwise pandas would return a column-less frame.
    return pd.DataFrame(rows, columns=list(columns))
word_info_df(moby_p1)
#       text    lemma   upos xpos
# 0     Call     call   VERB   VB
# 1       me        I   PRON  PRP
# 2  Ishmael  Ishmael  PROPN  NNP
# 3        .        .  PUNCT    .
# 4     Some     some    DET   DT
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment