# source code for chapter 1 of http://nltk.org/book
from __future__ import division  # must come before any other statement in Python 2
import nltk
# nltk.download()  # just go ahead and download everything if you have space
from nltk.book import *
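# explore the loaded texts: concordance() shows every occurrence of a word in
# context, and similar() lists other words that occur in similar contexts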
text1
text2
text1.concordance("monstrous")
text1.similar("monstrous")
text2.similar("monstrous")
text2.common_contexts(["monstrous", "very"])
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"]) # close the graphic once you're done
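# generate() produces random text in the style of the source text
# (available in NLTK 2.x; it was dropped in early NLTK 3 releases)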
text3.generate()
len(text3) # number of word tokens
sorted(set(text3))
len(set(text3))
len(text3) / len(set(text3))
text3.count("smote")
100 * text4.count('a') / len(text4)
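# wrap the two measures above in reusable functions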
def lexical_diversity(text):
    return len(text) / len(set(text))

def percentage(count, total):
    return 100 * count / total
lexical_diversity(text3)
lexical_diversity(text5)
percentage(4, 5)
percentage(text4.count('a'), len(text4))
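# a text is just a list of word strings, so the usual list operations apply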
sent1 = ['Call', 'me', 'Ishmael', '.']
sent1
len(sent1)
lexical_diversity(sent1)
sent2
sent3
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']
sent4 + sent1
sent1.append("Some")
sent1
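# indexing, index(), and slicing work on texts just as on any Python list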
text4[173]
text4.index('awaken')
text5[16715:16735]
text6[1600:1625]
sent = ['word1', 'word2', 'word3', 'word4', 'word5', 'word6', 'word7', 'word8', 'word9', 'word10']
sent[0]
sent[9]
sent[:3]
text2[141525:]
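# set() removes duplicate words; sorted() puts them in alphabetical order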
saying = ['After', 'all', 'is', 'said', 'and', 'done', 'more', 'is', 'said', 'than', 'done']
tokens = set(saying)
tokens
tokens = sorted(tokens)
tokens[-2:]
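# FreqDist counts how often each word occurs; in NLTK 2.x its keys() come back
# sorted by decreasing frequency, so a slice gives the most frequent words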
fdist1 = FreqDist(text1)
fdist1
vocabulary1 = fdist1.keys()
vocabulary1[:50]
fdist1['whale']
fdist1.plot(50, cumulative=True)
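# a related FreqDist method worth trying: hapaxes() returns the words
# that occur only once in the text
fdist1.hapaxes()[:20]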
V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)
fdist5 = FreqDist(text5)
sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 7])
bigrams(['more', 'is', 'said', 'than', 'done'])
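# collocations() prints pairs of words that co-occur unusually often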
text4.collocations()
text8.collocations()
[len(w) for w in text1]
fdist = FreqDist([len(w) for w in text1])
fdist
fdist.keys()
fdist.items()
fdist.max()
fdist[3]
fdist.freq(3)
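# sanity check (a sketch): freq(3) is the proportion fdist[3] / fdist.N(),
# where N() is the total number of samples counted
fdist.freq(3) == fdist[3] / fdist.N()
# selecting words that meet various conditions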
sorted([w for w in set(text1) if w.endswith('ableness')])
sorted([term for term in set(text4) if 'gnt' in term])
sorted([item for item in set(text6) if item.istitle()])
sorted([item for item in set(sent7) if item.isdigit()])
sorted([w for w in set(text7) if '-' in w and 'index' in w])
sorted([wd for wd in set(text3) if wd.istitle() and len(wd) > 10])
sorted([w for w in set(sent7) if not w.islower()])
sorted([t for t in set(text2) if 'cie' in t or 'cei' in t])
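# the same comprehension syntax maps an operation over every token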
[len(w) for w in text1]
[w.upper() for w in text1]
len(text1)
len(set(text1))
len(set([word.lower() for word in text1]))
len(set([word.lower() for word in text1 if word.isalpha()]))
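# control flow: conditionals and loops over words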
word = 'cat'
if len(word) < 5:
    print 'word length is less than 5'
for word in ['Call', 'me', 'Ishmael', '.']:
    print word
sent1 = ['Call', 'me', 'Ishmael', '.']
for xyzzy in sent1:
    if xyzzy.endswith('l'):
        print xyzzy
for token in sent1:
    if token.islower():
        print token, 'is a lowercase word'
    elif token.istitle():
        print token, 'is a titlecase word'
    else:
        print token, 'is punctuation'
tricky = sorted([w for w in set(text2) if 'cie' in w or 'cei' in w])
for word in tricky:
    print word,
# babelize_shell() no longer works: the Babelfish translation service it relied
# on was shut down, so the call is commented out below
# babelize_shell()