Skip to content

Instantly share code, notes, and snippets.

@language-engineering
Last active October 12, 2015 02:37
Show Gist options
  • Save language-engineering/3958445 to your computer and use it in GitHub Desktop.
Save language-engineering/3958445 to your computer and use it in GitHub Desktop.
from sussex_nltk.corpus_readers import ReutersCorpusReader
from sussex_nltk.tag import twitter_tag_batch
from nltk import pos_tag
from nltk.tokenize import word_tokenize
# Compare two part-of-speech taggers side by side on a small sample of
# Reuters sentences: the Twitter-specific tagger from sussex_nltk and NLTK's
# own tagger. For each sampled sentence the raw text is printed, followed by
# the token/tag pairs produced by each tagger.

number_of_sentences = 10  # Number of sentences to sample and display

rcr = ReutersCorpusReader()  # Create a corpus reader
sentences = rcr.sample_raw_sents(number_of_sentences)  # Sample some sentences

# Tag with the Twitter-specific tagger
#  - it also tokenises for you in a Twitter-specific way, so the raw
#    sentences are passed in untouched.
twitter_tagged = twitter_tag_batch(sentences)

# Tag with NLTK's maximum entropy tagger
#  - pos_tag is given tokens here, so each sentence is tokenised explicitly
#    first.
nltk_tagged = [pos_tag(word_tokenize(sentence)) for sentence in sentences]

# Print each sentence. zip() walks the raw sentences and both tagged versions
# in lockstep, so each iteration sees the same sentence in all three forms.
# Single-argument print(...) is used so the output is identical under the
# Python 2 print statement and the Python 3 print function.
for raw, twitter_sentence, nltk_sentence in zip(sentences, twitter_tagged, nltk_tagged):
    print("-----------------Sentence----------------")
    print("Raw:\n %s " % raw)

    # One "token<TAB>tag" row per token, as produced by the Twitter tagger.
    print("Twitter tagged:")
    for token, tag in twitter_sentence:
        print(" %s\t%s" % (token, tag))

    # The same sentence as tokenised and tagged by NLTK, for comparison.
    print("NLTK tagged:")
    for token, tag in nltk_sentence:
        print(" %s\t%s" % (token, tag))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment