mallyvai · February 11, 2011 00:08
diff --git a/ians_csv_parser.py b/ians_csv_parser.py
 """
 Command-line script for the comma-separated conversation format you wanted:
 it prefixes each non-blank input line with that line's most frequent
 stemmed, non-stop-word token.
 --Vaibhav Mallya
 """

 import sys
 import nltk
 # Short module-level aliases for the NLTK helpers used by get_words().
 sent_tokenize = nltk.tokenize.sent_tokenize
 word_tokenize = nltk.tokenize.word_tokenize
 # PorterStemmer.stem_word is not available in newer NLTK releases; fall back
 # to the stemmer's stem() method there so the alias keeps working.
 _stemmer = nltk.stem.porter.PorterStemmer()
 stem_word = getattr(_stemmer, "stem_word", _stemmer.stem)
 # NLTK returns the stop-word corpus as a list; wrap it in a set so that the
 # per-token membership tests are constant time.
 # TODO: suggest upstream that NLTK return a set directly.
 stop_words = set(nltk.corpus.stopwords.words('english'))

 def get_words(content):
     """Return the stemmed, non-stop-word tokens of ``content`` in order.

     The text is split into sentences with ``sent_tokenize`` and each
     sentence into tokens with ``word_tokenize``.  A token is dropped when it
     is a stop word, when its stem is a stop word, or when its stem is a
     substring of ``".?/!"`` (sentence punctuation).  Duplicates are kept so
     that callers can count frequencies.

     content -- the text to tokenize, as a single string.
     Returns a list of stemmed tokens.
     """
     words = []
     for sentence in sent_tokenize(content):
         for word in word_tokenize(sentence):
             if word in stop_words:
                 continue
             # Stem once and reuse the result for both filters and the output.
             stem = stem_word(word)
             # NOTE: this is a substring test against ".?/!", not a
             # per-character set lookup.
             if stem in stop_words or stem in ".?/!":
                 continue
             words.append(stem)
     return words

 if __name__ == "__main__":
     # Usage: python ians_csv_parser.py INPUT_FILE
     #
     # For every non-blank line of INPUT_FILE, print the line prefixed with
     # its most frequent token (as produced by get_words) and a comma.
     with open(sys.argv[1]) as input_file:
         for line in input_file:
             # Drop the line terminator: print adds its own, and keeping both
             # would emit a blank line after every output row.
             line = line.rstrip("\r\n")
             if not line.strip():
                 continue
             # Commas only separate fields; turn them into spaces so the
             # tokenizer sees plain text.
             content = line.replace(",", " ")
             counter = {}
             for word in get_words(content):
                 counter[word] = counter.get(word, 0) + 1
             # A line made up only of stop words and punctuation yields no
             # tokens; emit an empty leading field instead of failing with an
             # IndexError.  On ties, max() keeps the first key in the dict's
             # iteration order, like the stable descending sort it replaces.
             top_token = max(counter, key=counter.get) if counter else ""
             # Parenthesized single-argument print behaves the same on
             # Python 2 and Python 3.
             print(",".join([top_token, line]))
	"""
	Command-line script for the comma-separated conversation format you wanted:
	it prefixes each non-blank input line with that line's most frequent
	stemmed, non-stop-word token.
	--Vaibhav Mallya
	"""

	import sys
	import nltk
	# Short module-level aliases for the NLTK helpers used by get_words().
	sent_tokenize = nltk.tokenize.sent_tokenize
	word_tokenize = nltk.tokenize.word_tokenize
	# PorterStemmer.stem_word is not available in newer NLTK releases; fall back
	# to the stemmer's stem() method there so the alias keeps working.
	_stemmer = nltk.stem.porter.PorterStemmer()
	stem_word = getattr(_stemmer, "stem_word", _stemmer.stem)
	# NLTK returns the stop-word corpus as a list; wrap it in a set so that the
	# per-token membership tests are constant time.
	# TODO: suggest upstream that NLTK return a set directly.
	stop_words = set(nltk.corpus.stopwords.words('english'))

	def get_words(content):
	    """Return the stemmed, non-stop-word tokens of ``content`` in order.

	    The text is split into sentences with ``sent_tokenize`` and each
	    sentence into tokens with ``word_tokenize``.  A token is dropped when it
	    is a stop word, when its stem is a stop word, or when its stem is a
	    substring of ``".?/!"`` (sentence punctuation).  Duplicates are kept so
	    that callers can count frequencies.

	    content -- the text to tokenize, as a single string.
	    Returns a list of stemmed tokens.
	    """
	    words = []
	    for sentence in sent_tokenize(content):
	        for word in word_tokenize(sentence):
	            if word in stop_words:
	                continue
	            # Stem once and reuse the result for both filters and the output.
	            stem = stem_word(word)
	            # NOTE: this is a substring test against ".?/!", not a
	            # per-character set lookup.
	            if stem in stop_words or stem in ".?/!":
	                continue
	            words.append(stem)
	    return words

	if __name__ == "__main__":
	    # Usage: python ians_csv_parser.py INPUT_FILE
	    #
	    # For every non-blank line of INPUT_FILE, print the line prefixed with
	    # its most frequent token (as produced by get_words) and a comma.
	    with open(sys.argv[1]) as input_file:
	        for line in input_file:
	            # Drop the line terminator: print adds its own, and keeping both
	            # would emit a blank line after every output row.
	            line = line.rstrip("\r\n")
	            if not line.strip():
	                continue
	            # Commas only separate fields; turn them into spaces so the
	            # tokenizer sees plain text.
	            content = line.replace(",", " ")
	            counter = {}
	            for word in get_words(content):
	                counter[word] = counter.get(word, 0) + 1
	            # A line made up only of stop words and punctuation yields no
	            # tokens; emit an empty leading field instead of failing with an
	            # IndexError.  On ties, max() keeps the first key in the dict's
	            # iteration order, like the stable descending sort it replaces.
	            top_token = max(counter, key=counter.get) if counter else ""
	            # Parenthesized single-argument print behaves the same on
	            # Python 2 and Python 3.
	            print(",".join([top_token, line]))
No results found