Last active
October 12, 2015 05:47
-
-
Save language-engineering/3979698 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sussex_nltk.corpus_readers import TwitterCorpusReader
from sussex_nltk.parse import dep_parse_sentences_arceager
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Example script: sample raw tweet sentences, tokenise and PoS tag them,
# dependency parse the tagged sentences, then print each parse.

tcr = TwitterCorpusReader()

# Get some (here 30) un-tokenised sentences from tweets.
sents = tcr.sample_raw_sents(30)

# Tokenise and PoS tag the sentences.
# Notice the round brackets instead of square brackets: this is a generator
# expression. It can be iterated over like a list, but instead of computing
# every element up front and holding them all in memory, it produces one
# element at a time as the consumer asks for it.
# Therefore "tagged_sents" is a generator, not a list, and can only be
# iterated over once.
tagged_sents = (pos_tag(word_tokenize(sentence)) for sentence in sents)

# Dependency parse the sentences.
parsed_sents = dep_parse_sentences_arceager(tagged_sents)

# Inspect the results by printing each parsed sentence, preceded by a
# separator line. Single-argument print(...) calls produce the same output
# under Python 2 and also run under Python 3.
for sentence in parsed_sents:
    print("-----")
    print(sentence)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment