# File name: NLTK_presentation_code.py
# Purpose: Intro to text analysis in Python, NLTK
# Data Used: NLTK data and book examples, own Manifesto data
# Data Output: -
"""
Examples based on
"Natural Language Processing with Python"
Bird, Klein and Loper, 2010
"""
#1. Getting started
#Import data for examples
from __future__ import division # make / true division, as the book assumes
import nltk
nltk.download()
########################
# Review basic Python: #
########################
#Strings
monty = "Monty Python's "\
        "Flying Circus."
monty*2 + " plus just last word: " + monty[-7:]
monty.find('Python') #finds the position of a substring within the string
monty.upper() + ' and ' + monty.lower()
monty.replace('y', 'x')
# Lists
# As opposed to strings, lists are flexible about the elements they contain.
sent1 = ['Monty', 'Python']
sent2 = ['and', 'the', 'Holy', 'Grail']
len(sent2)
sent1[1]
sent2.append("1975")
sent1 + sent2
sorted(sent1 + sent2)
' '.join(['Monty', 'Python'])
'Monty Python'.split()
# Import text4 of the book examples, the Inaugural Addresses
from nltk.book import text4
# Operating on every element. List comprehension.
len(set([word.lower() for word in text4 if len(word) > 5]))
[w.upper() for w in text4[0:5]]
for word in text4[0:5]:
    if len(word) < 5 and word.endswith('e'):
        print word, 'is short and ends with e'
    elif word.istitle():
        print word, 'is a titlecase word'
    else:
        print word, 'is just another word'
#Searching text
text4.concordance("vote")
#What other words appear in a similar range of contexts?
text4.similar("vote")
# examine just the contexts that are shared by two or more words
text4.common_contexts(["war", "freedom"])
# Counting vocabulary: the length of a text from start to finish,
# in terms of the words and punctuation symbols that appear. All tokens.
len(text4)
# How many distinct words does the text contain?
# The vocabulary of a text is just the set of tokens that it uses.
len(set(text4)) #types
len(text4) / len(set(text4)) # Each word is used on average x times. Richness of the text.
#Location of a word in the text: how many spaces from the beginning does it appear?
#This positional information can be displayed using a dispersion plot.
#You need NumPy and Matplotlib.
text4.dispersion_plot(["citizens", "democracy", "freedom", "war", "America", "vote"])
# Generating some random text in the style of the text we have just seen.
text4.generate()
# count how often a word occurs in a text
text4.count("democracy")
# compute what percentage of the text is taken up by a specific word
100 * text4.count('democracy') / len(text4)
# Define functions:
def lexical_diversity(text):
    return len(text) / len(set(text))
def percentage(count, total):
    return 100 * count / total
lexical_diversity(text4)
percentage(text4.count('a'), len(text4))
# Simple statistics
from nltk import FreqDist
# Counting words appearing in a text (a frequency distribution)
fdist1 = FreqDist(text4)
fdist1
vocabulary1 = fdist1.keys() # list of all the distinct types in the text
vocabulary1[:3] # look at the first 3
#words that occur only once, called hapaxes
fdist1.hapaxes()[:20]
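# A quick visual check on the same distribution: FreqDist.plot with
# cumulative=True (a standard NLTK call) draws the 50 most frequent
# tokens together with their cumulative counts.
fdist1.plot(50, cumulative=True)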
# Words that meet a condition, e.g. long words
V = set(text4)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)
#finding words that characterize a text: relatively long, and occurring frequently
fdist = FreqDist(text4)
sorted([w for w in set(text4) if len(w) > 7 and fdist[w] > 7])
# Collocations and bigrams.
# A collocation is a sequence of words that occur together unusually often.
# Built-in collocations function
text4.collocations()
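# Collocations are drawn from frequent bigrams; nltk.bigrams() can also be
# used directly (list() is only needed on newer NLTK versions, where it
# returns a generator). The word list here is just an illustration:
list(nltk.bigrams(['more', 'is', 'said', 'than', 'done']))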
#############
#Corpus data#
#############
# Inaugural Address Corpus
from nltk.corpus import inaugural
inaugural.fileids()[:2]
[fileid[:4] for fileid in inaugural.fileids()]
#How the words "america" and "war" are used over time.
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'war']
    if w.lower().startswith(target))
cfd.plot()
#cfd.tabulate()
from nltk.corpus import brown
news_words = brown.words(categories="news")
print(news_words)
freq = nltk.FreqDist(news_words)
freq.plot(30)
from nltk import FreqDist
verbs = ["should", "may", "can"]
genres = ["news", "government", "romance"]
for g in genres:
    words = brown.words(categories=g)
    freq = FreqDist([w.lower() for w in words if w.lower() in verbs])
    print g, freq
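# The same genre/verb comparison reads more easily as a table. A short
# sketch in the style of the NLTK book, reusing the genres and verbs
# defined above (this particular tabulation is ours):
cfd = nltk.ConditionalFreqDist(
    (genre, word.lower())
    for genre in genres
    for word in brown.words(categories=genre))
cfd.tabulate(conditions=genres, samples=verbs)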
# Import your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = "C:/Data/Files/"
wordlists = PlaintextCorpusReader(corpus_root, r'.*\.txt')
wordlists.fileids()[:3]
wordlists.words('UK_natl_2010_en_Lab.txt')
# Stopwords
from nltk.corpus import stopwords
stopwords.words('english')
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)
content_fraction(nltk.corpus.reuters.words())
content_fraction(nltk.corpus.inaugural.words())
# Translator
from nltk.corpus import swadesh
languages = ['en', 'ro', 'es', 'fr', 'pt', 'la']
for i in [100, 141, 143]:
    print swadesh.entries(languages)[i]
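# The Swadesh entries also make a simple bilingual dictionary (an example
# from the NLTK book; 'chien' is just an illustrative lookup):
fr2en = swadesh.entries(['fr', 'en'])
translate = dict(fr2en)
translate['chien']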
# WordNet
#dictionary of English
from nltk.corpus import wordnet as wn
wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names
wn.synset('car.n.01').definition
for synset in wn.synsets('car')[1:3]:
    print synset.lemma_names
# Depth of a synset
wn.synset('whale.n.02').min_depth()
wn.synset('vertebrate.n.01').min_depth()
wn.synset('walk.v.01').entailments() #Walking involves stepping
# Hyponyms and hypernyms
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
types_of_motorcar[0:2]
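# Hypernyms navigate the hierarchy the other way, and the hypernym tree also
# supports a simple similarity score (both standard WordNet operations):
motorcar.hypernyms()
right = wn.synset('right_whale.n.01')
minke = wn.synset('minke_whale.n.01')
right.path_similarity(minke) # in (0, 1]; higher means closer in the hierarchy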
################################
# Importing and accessing text #
################################
import nltk, re, pprint
# Online books
from urllib import urlopen
url = "http://www.gutenberg.org/files/61/61.txt"
raw = urlopen(url).read()
type(raw)
len(raw)
raw[:75]
tokens = nltk.word_tokenize(raw)
type(tokens)
len(tokens)
tokens[80:110]
text = nltk.Text(tokens)
text.collocations()
# Online articles
url = "http://www.bbc.co.uk/news/science-environment-21471908"
# Getting text out of HTML is a sufficiently common task that NLTK provides
# a helper function, nltk.clean_html(), which takes an HTML string and
# returns raw text.
html = urlopen(url).read()
html[:60]
raw = nltk.clean_html(html)
tokens = nltk.word_tokenize(raw)
tokens[:15]
#Processing RSS feeds: import feedparser
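# A minimal RSS sketch with the third-party feedparser package (the feed URL
# below is the Language Log feed used in the NLTK book; any feed would do):
import feedparser
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']
len(llog.entries)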
# Reading local files
f = open('C:/Data/Files/UK_natl_2010_en_Lab.txt') # forward slashes avoid backslash-escape trouble on Windows
raw = f.read()
print raw[:100]
# User input
s = raw_input("Enter some text: ")
#Regular expressions applications. Find and count all vowels.
import re
word = 'supercalifragilisticexpialidocious'
len(re.findall(r'[aeiou]', word))
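# re.findall extracts any pattern, not just single characters; a small
# sketch of ours in the same spirit, pulling out runs of two or more vowels:
re.findall(r'[aeiou]{2,}', word)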
# Normalize - ignore upper case
# Tokenize - divide into tokens
f = open('C:/Data/Files/UK_natl_2010_en_Lab.txt')
raw = f.read()
tokens = nltk.word_tokenize(raw)
tokens[:10]
set(w.lower() for w in tokens)
# Stemming - strip off affixes
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens]
[lancaster.stem(t) for t in tokens]
# Lemmatizing - only strips affixes if the resulting word is in its dictionary
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens]
# Sentence segmentation
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = sent_tokenizer.tokenize(raw)
pprint.pprint(sents[182:185])
# Writing output to a file, one word type per line
output_file = open('C:/Data/Files/output.txt', 'w')
words = set(tokens)
for word in sorted(words):
    output_file.write(word + "\n")
output_file.close()
##############
#POS Tagging #
##############
nltk.corpus.brown.tagged_words()
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)
text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)
bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007])
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)
bigram_tagger.evaluate(test_sents)
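# A bigram tagger alone scores poorly on unseen data because most bigram
# contexts never occur in training. The standard remedy in the NLTK book
# is to chain taggers with backoff:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)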
##################
# Classification #
##################
#######################################
# Names-gender identification example #
#######################################
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Shrek')
from nltk.corpus import names
import random
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
featuresets = [(gender_features(n), g) for (n, g) in names]
from nltk.classify import apply_features # use apply_features when working with large corpora
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features('Neo'))
classifier.classify(gender_features('Trinity'))
print nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(5)
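# Accuracy depends heavily on the feature extractor. A variant of ours with
# one- and two-letter suffixes (the name gender_features2 is hypothetical)
# can be swapped in and re-evaluated the same way:
def gender_features2(word):
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}
train_set2 = apply_features(gender_features2, names[500:])
test_set2 = apply_features(gender_features2, names[:500])
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)
print nltk.classify.accuracy(classifier2, test_set2)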
#########################
# Movie reviews example #
#########################
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000] # the 2000 most frequent words overall
def document_features(document): # feature extractor: which frequent words does the document contain?
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
print document_features(movie_reviews.words('pos/cv957_8737.txt'))
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(5)