Created
December 23, 2013 20:57
-
-
Save rpietro/8104434 to your computer and use it in GitHub Desktop.
Source code for Chapter 1 of the NLTK Book (http://nltk.org/book).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# code from http://nltk.org/book
# NOTE: a __future__ import must precede every other import and statement
# in Python 2; the original placed it after the nltk imports, which is a
# SyntaxError. Moved to the top.
from __future__ import division

import nltk
# nltk.download()  # just go ahead and download everything if you have space
from nltk.book import *  # brings in text1..text8, sent1..sent7, FreqDist, bigrams used below
# --- Searching and counting text ---
# These bare expressions only display their value in an interactive session.
text1
text2
text1.concordance("monstrous")   # every occurrence of the word, with context
text1.similar("monstrous")       # words that appear in similar contexts
text2.similar("monstrous")
text2.common_contexts(["monstrous", "very"])
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])  # close the graphic once you're done
text3.generate()                 # generate random text based on text3
len(text3)                       # number of tokens
sorted(set(text3))               # the distinct tokens (vocabulary), sorted
len(set(text3))                  # vocabulary size
len(text3) / len(set(text3))     # average uses per distinct token (true division)
text3.count("smote")
100 * text4.count('a') / len(text4)  # 'a' as a percentage of all tokens
def lexical_diversity(text):
    """Return the average number of times each distinct token occurs.

    ``text`` is any sized iterable of hashable tokens (an NLTK Text, a
    list of words, even a string of characters). The result is
    ``len(text) / len(set(text))`` using true division, so a text of 6
    tokens drawn from 3 distinct words yields 2.0.

    Raises ZeroDivisionError for empty input.
    """
    return len(text) / len(set(text))
def percentage(count, total):
    """Return ``count`` expressed as a percentage of ``total``.

    E.g. ``percentage(4, 5)`` -> 80.0. Uses true division; raises
    ZeroDivisionError when ``total`` is 0.
    """
    return 100 * count / total
# --- Using the helper functions ---
lexical_diversity(text3)
lexical_diversity(text5)
percentage(4, 5)
percentage(text4.count('a'), len(text4))

# --- Texts as lists of token strings ---
sent1 = ['Call', 'me', 'Ishmael', '.']
sent1
len(sent1)
lexical_diversity(sent1)
sent2
sent3
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']  # list concatenation
sent4 + sent1
sent1.append("Some")  # in-place mutation: sent1 now ends with "Some"
sent1

# --- Indexing ---
text4[173]
text4.index('awaken')  # position of the first occurrence
text5[16715:16735]
text6[1600:1625]
# --- Slicing ---
sent = ['word1', 'word2', 'word3', 'word4', 'word5', 'word6', 'word7', 'word8', 'word9', 'word10']
sent[0]         # first element
sent[9]         # last element
sent[:3]        # first three elements
text2[141525:]  # everything from that offset through the end

# --- Distinct words via set(), then sorting ---
saying = ['After', 'all', 'is', 'said', 'and', 'done', 'more', 'is', 'said', 'than', 'done']
tokens = set(saying)      # duplicates collapse away
tokens
tokens = sorted(tokens)   # rebound to an alphabetically sorted list
tokens[-2:]               # last two entries of the sorted vocabulary
# --- Frequency distributions ---
fdist1 = FreqDist(text1)     # maps each token to its occurrence count
fdist1
vocabulary1 = fdist1.keys()
vocabulary1[:50]             # NOTE: slicing keys() only works in Python 2 / old NLTK,
                             # where keys() returned a list sorted by frequency
fdist1['whale']
fdist1.plot(50, cumulative=True)

# Words longer than 15 characters.
V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)

# Long-ish words that are also frequent in the chat corpus.
fdist5 = FreqDist(text5)
sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 7])

bigrams(['more', 'is', 'said', 'than', 'done'])  # adjacent word pairs
text4.collocations()
text8.collocations()

# Distribution of word LENGTHS rather than words.
[len(w) for w in text1]
fdist = FreqDist([len(w) for w in text1])
fdist
fdist.keys()
fdist.items()
fdist.max()    # the most common word length
fdist[3]       # how many 3-letter words
fdist.freq(3)  # the same, as a proportion of all tokens
# --- Selecting words by property with list comprehensions ---
sorted([w for w in set(text1) if w.endswith('ableness')])
sorted([term for term in set(text4) if 'gnt' in term])
sorted([item for item in set(text6) if item.istitle()])   # Titlecase words
sorted([item for item in set(sent7) if item.isdigit()])   # pure-digit tokens
sorted([w for w in set(text7) if '-' in w and 'index' in w])
sorted([wd for wd in set(text3) if wd.istitle() and len(wd) > 10])
sorted([w for w in set(sent7) if not w.islower()])
sorted([t for t in set(text2) if 'cie' in t or 'cei' in t])

# --- Operating on every element ---
[len(w) for w in text1]
[w.upper() for w in text1]
len(text1)
len(set(text1))
len(set([word.lower() for word in text1]))                       # case-folded vocabulary
len(set([word.lower() for word in text1 if word.isalpha()]))     # ...alphabetic tokens only
word = 'cat' | |
if len(word) < 5: | |
print 'word length is less than 5' | |
for word in ['Call', 'me', 'Ishmael', '.']: | |
print word | |
sent1 = ['Call', 'me', 'Ishmael', '.'] | |
for xyzzy in sent1: | |
if xyzzy.endswith('l'): | |
print xyzzy | |
for token in sent1: | |
if token.islower(): | |
print token, 'is a lowercase word' | |
elif token.istitle(): | |
print token, 'is a titlecase word' | |
else: | |
print token, 'is punctuation' | |
tricky = sorted([w for w in set(text2) if 'cie' in w or 'cei' in w]) | |
for word in tricky: | |
print word, | |
# below no longer working | |
babelize_shell() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment