Created
September 23, 2017 07:39
-
-
Save jss367/dac7dc15737e5afe566e333eeaf214c4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Let's make a single function to determine the parts of speech | |
import re | |
import nltk | |
import os | |
#from collections import Counter # Is this used? | |
# First we break the text into tokens | |
def tokinze_text(raw_text):
    '''Split raw text into word tokens.

    raw_text is handed unchanged to nltk.word_tokenize, and whatever that
    call produces is returned as-is.
    '''
    return nltk.word_tokenize(raw_text)
# Tokenize the input text once at module level.
# NOTE(review): `text` is not defined in the visible part of this file --
# presumably the raw input string is assigned before this point; confirm.
tokens = tokinze_text(raw_text=text)
def mytagger(tokens):
    '''Tag each token with its part of speech.

    tokens is passed straight to nltk.pos_tag; the tagged result is returned
    unchanged. The rest of this script reads the tag from index 1 of every
    returned item.
    '''
    return nltk.pos_tag(tokens)
tagged = mytagger(tokens)

# Each `*_pos` list names the tags that belong to one part of speech, and the
# list declared right after it collects the tagged tokens that carry one of
# those tags. The tag names look like the Penn Treebank set that nltk.pos_tag
# emits by default.
# Note that IN can be either a preposition or a conjunction, for now we're going to list it with the prepositions
common_noun_pos = ['NN', 'NNS']
common_nouns = []
verb_pos = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
verbs = []
adjective_pos = ['JJ', 'JJR', 'JJS']
adjectives = []
pronoun_pos = ['PRP', 'PRP$', 'WP', 'WP$']
pronouns = []
adverb_pos = ['RB', 'RBR', 'RBS', 'WRB']
adverbs = []
proper_noun_pos = ['NNP', 'NNPS']
proper_nouns = []
conjunction_pos = ['CC']
conjunctions = []
preposition_pos = ['IN', 'TO']
prepositions = []
interjection_pos = ['UH']
interjections = []
modal_pos = ['MD']  # But these are also verbs, so let's make sure they show up as such
modals = []  # collected separately here; nothing below merges them into `verbs` yet
tagged_other_pos = ['CD', 'DT', 'EX', 'FW', 'LS', 'PDT', 'POS', 'RP', 'SYM', 'WDT']
tagged_others = []
other = []  # anything whose tag is in none of the lists above

# Build a tag -> bucket lookup once so every token is classified with a
# single dictionary access instead of scanning each tag list in turn.
# setdefault keeps the first bucket registered for a tag, which matches the
# first-match-wins order of an if/elif chain over the same lists.
_bucket_for_tag = {}
for _pos_tags, _bucket in (
    (common_noun_pos, common_nouns),
    (verb_pos, verbs),
    (adjective_pos, adjectives),
    (pronoun_pos, pronouns),
    (adverb_pos, adverbs),
    (proper_noun_pos, proper_nouns),
    (conjunction_pos, conjunctions),
    (preposition_pos, prepositions),
    (interjection_pos, interjections),
    (modal_pos, modals),
    (tagged_other_pos, tagged_others),
):
    for _tag in _pos_tags:
        _bucket_for_tag.setdefault(_tag, _bucket)

# token[1] is the tag; the whole tagged item is stored in the matching bucket,
# and unrecognised tags fall through to `other`.
for token in tagged:
    _bucket_for_tag.get(token[1], other).append(token)

# `tagged_others` and `other` are intentionally left out of this summary list.
parts_of_speech = [
    common_nouns,
    verbs,
    adjectives,
    pronouns,
    adverbs,
    proper_nouns,
    conjunctions,
    prepositions,
    interjections,
    modals,
]
# TODO: Append modals to verbs
# TODO: Create a nouns list that holds both proper nouns and common nouns
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment