jkff · December 22, 2017 06:43
diff --git a/nltk_preprocess.py b/nltk_preprocess.py
 import nltk                                                                                           
 import pickle                                                                                         
 import random                                                                                         
                                                                                                      
 # Preprocess movie lines: POS-tag every sentence, then drop proper nouns and
 # lowercase the rest. Both intermediate and final results are pickled.
 #
 # The input data was cleaned up manually in Vim beforehand.
 # Read raw bytes so the explicit UTF-8 decode below is valid on both
 # Python 2 and Python 3 (text-mode lines are already `str` on Python 3 and
 # have no `.decode`). `with` guarantees the handle is closed.
 with open('movie_lines.tsv', 'rb') as lines_file:
     lines = lines_file.readlines()
 random.shuffle(lines)

 # One entry per sentence; each entry is a list of (word, POS tag) pairs.
 tagged = [
     # Split lines into sentences; split sentences into words; tag words
     # with part of speech (POS).
     nltk.pos_tag(nltk.word_tokenize(sentence))
     for line in lines
     for sentence in nltk.sent_tokenize(line.decode('utf8'))]

 # nltk is pretty slow, good idea to save the result and maybe load it later
 # to play with it without redoing the POS tagging.
 # Closing the file via `with` ensures the pickle is fully flushed to disk.
 with open('movie_lines_tagged.p', 'wb') as tagged_file:
     pickle.dump(tagged, tagged_file)

 # One entry per sentence; each entry is a list of lowercased words.
 cleaned = [
     [word.lower()
      for word, pos in tagged_phrase
      # Exclude singular and plural proper nouns: they make up about 50% of
      # the unique words. We could do more cleaning, e.g. normalize word
      # forms. That depends on the learning goal and the language.
      if pos not in ('NNP', 'NNPS')]
     for tagged_phrase in tagged]

 with open('movie_lines_cleaned.p', 'wb') as cleaned_file:
     pickle.dump(cleaned, cleaned_file)
	import nltk
	import pickle
	import random

	# Preprocess movie lines: POS-tag every sentence, then drop proper nouns and
	# lowercase the rest. Both intermediate and final results are pickled.
	#
	# The input data was cleaned up manually in Vim beforehand.
	# Read raw bytes so the explicit UTF-8 decode below is valid on both
	# Python 2 and Python 3 (text-mode lines are already `str` on Python 3 and
	# have no `.decode`). `with` guarantees the handle is closed.
	with open('movie_lines.tsv', 'rb') as lines_file:
		lines = lines_file.readlines()
	random.shuffle(lines)

	# One entry per sentence; each entry is a list of (word, POS tag) pairs.
	tagged = [
		# Split lines into sentences; split sentences into words; tag words
		# with part of speech (POS).
		nltk.pos_tag(nltk.word_tokenize(sentence))
		for line in lines
		for sentence in nltk.sent_tokenize(line.decode('utf8'))]

	# nltk is pretty slow, good idea to save the result and maybe load it later
	# to play with it without redoing the POS tagging.
	# Closing the file via `with` ensures the pickle is fully flushed to disk.
	with open('movie_lines_tagged.p', 'wb') as tagged_file:
		pickle.dump(tagged, tagged_file)

	# One entry per sentence; each entry is a list of lowercased words.
	cleaned = [
		[word.lower()
		for word, pos in tagged_phrase
		# Exclude singular and plural proper nouns: they make up about 50% of
		# the unique words. We could do more cleaning, e.g. normalize word
		# forms. That depends on the learning goal and the language.
		if pos not in ('NNP', 'NNPS')]
		for tagged_phrase in tagged]

	with open('movie_lines_cleaned.p', 'wb') as cleaned_file:
		pickle.dump(cleaned, cleaned_file)