Created
December 23, 2013 20:57
-
-
Save rpietro/8104434 to your computer and use it in GitHub Desktop.
Source code for Chapter 1 of the NLTK Book (http://nltk.org/book).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# code from http://nltk.org/book
# NOTE: a __future__ import must precede every other import and statement
# in Python 2; the original placed it after the nltk imports, which is a
# SyntaxError. Moved to the top.
from __future__ import division

import nltk
# nltk.download()  # just go ahead and download everything if you have space
from nltk.book import *  # brings in text1..text8, sent1..sent7, FreqDist, bigrams used below
# --- Searching and counting text ---
# These bare expressions only display their value in an interactive session.
text1
text2
text1.concordance("monstrous")   # every occurrence of the word, with context
text1.similar("monstrous")       # words that appear in similar contexts
text2.similar("monstrous")
text2.common_contexts(["monstrous", "very"])
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])  # close the graphic once you're done
text3.generate()                 # generate random text based on text3
len(text3)                       # number of tokens
sorted(set(text3))               # the distinct tokens (vocabulary), sorted
len(set(text3))                  # vocabulary size
len(text3) / len(set(text3))     # average uses per distinct token (true division)
text3.count("smote")
100 * text4.count('a') / len(text4)  # 'a' as a percentage of all tokens
def lexical_diversity(text):
    """Return the average number of times each distinct token occurs.

    ``text`` is any sized iterable of hashable tokens (an NLTK Text, a
    list of words, even a string of characters). The result is
    ``len(text) / len(set(text))`` using true division, so a text of 6
    tokens drawn from 3 distinct words yields 2.0.

    Raises ZeroDivisionError for empty input.
    """
    return len(text) / len(set(text))
def percentage(count, total):
    """Return ``count`` expressed as a percentage of ``total``.

    E.g. ``percentage(4, 5)`` -> 80.0. Uses true division; raises
    ZeroDivisionError when ``total`` is 0.
    """
    return 100 * count / total
# --- Using the helper functions ---
lexical_diversity(text3)
lexical_diversity(text5)
percentage(4, 5)
percentage(text4.count('a'), len(text4))

# --- Texts as lists of token strings ---
sent1 = ['Call', 'me', 'Ishmael', '.']
sent1
len(sent1)
lexical_diversity(sent1)
sent2
sent3
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']  # list concatenation
sent4 + sent1
sent1.append("Some")  # in-place mutation: sent1 now ends with "Some"
sent1

# --- Indexing ---
text4[173]
text4.index('awaken')  # position of the first occurrence
text5[16715:16735]
text6[1600:1625]
# --- Slicing ---
sent = ['word1', 'word2', 'word3', 'word4', 'word5', 'word6', 'word7', 'word8', 'word9', 'word10']
sent[0]         # first element
sent[9]         # last element
sent[:3]        # first three elements
text2[141525:]  # everything from that offset through the end

# --- Distinct words via set(), then sorting ---
saying = ['After', 'all', 'is', 'said', 'and', 'done', 'more', 'is', 'said', 'than', 'done']
tokens = set(saying)      # duplicates collapse away
tokens
tokens = sorted(tokens)   # rebound to an alphabetically sorted list
tokens[-2:]               # last two entries of the sorted vocabulary
# --- Frequency distributions ---
fdist1 = FreqDist(text1)     # maps each token to its occurrence count
fdist1
vocabulary1 = fdist1.keys()
vocabulary1[:50]             # NOTE: slicing keys() only works in Python 2 / old NLTK,
                             # where keys() returned a list sorted by frequency
fdist1['whale']
fdist1.plot(50, cumulative=True)

# Words longer than 15 characters.
V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)

# Long-ish words that are also frequent in the chat corpus.
fdist5 = FreqDist(text5)
sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 7])

bigrams(['more', 'is', 'said', 'than', 'done'])  # adjacent word pairs
text4.collocations()
text8.collocations()

# Distribution of word LENGTHS rather than words.
[len(w) for w in text1]
fdist = FreqDist([len(w) for w in text1])
fdist
fdist.keys()
fdist.items()
fdist.max()    # the most common word length
fdist[3]       # how many 3-letter words
fdist.freq(3)  # the same, as a proportion of all tokens
# --- Selecting words by property with list comprehensions ---
sorted([w for w in set(text1) if w.endswith('ableness')])
sorted([term for term in set(text4) if 'gnt' in term])
sorted([item for item in set(text6) if item.istitle()])   # Titlecase words
sorted([item for item in set(sent7) if item.isdigit()])   # pure-digit tokens
sorted([w for w in set(text7) if '-' in w and 'index' in w])
sorted([wd for wd in set(text3) if wd.istitle() and len(wd) > 10])
sorted([w for w in set(sent7) if not w.islower()])
sorted([t for t in set(text2) if 'cie' in t or 'cei' in t])

# --- Operating on every element ---
[len(w) for w in text1]
[w.upper() for w in text1]
len(text1)
len(set(text1))
len(set([word.lower() for word in text1]))                       # case-folded vocabulary
len(set([word.lower() for word in text1 if word.isalpha()]))     # ...alphabetic tokens only
word = 'cat' | |
if len(word) < 5: | |
print 'word length is less than 5' | |
for word in ['Call', 'me', 'Ishmael', '.']: | |
print word | |
sent1 = ['Call', 'me', 'Ishmael', '.'] | |
for xyzzy in sent1: | |
if xyzzy.endswith('l'): | |
print xyzzy | |
for token in sent1: | |
if token.islower(): | |
print token, 'is a lowercase word' | |
elif token.istitle(): | |
print token, 'is a titlecase word' | |
else: | |
print token, 'is punctuation' | |
tricky = sorted([w for w in set(text2) if 'cie' in w or 'cei' in w]) | |
for word in tricky: | |
print word, | |
# below no longer working | |
babelize_shell() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment