Created
December 27, 2011 21:22
-
-
Save rohitdholakia/1525201 to your computer and use it in GitHub Desktop.
A Python script that generates a word-frequency dictionary to be used with Naive Bayes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script reads all files in all directories of the folder taken from the
# openClassroom site and generates the dictionary (word -> number of
# occurrences), which is then stored in a file.
#
# Usage: python <this script> <dataset folder> <output file>
import os
import sys
from collections import Counter

# Sub-folders the dataset is expected to contain. Kept for reference only: the
# traversal below visits whatever sub-directories it actually finds.
folders = ["spam-train","spam-test","nonspam-train","nonspam-test"]


def iter_data_files(top):
    """Yield the path of every regular file inside a sub-directory of *top*.

    Sub-directories at any depth are visited; files lying directly in *top*
    itself are skipped. Names are visited in sorted order so repeated runs over
    the same tree produce the same sequence.
    """
    for root, dirnames, _ in os.walk(top):
        # Sorting in place also fixes the order in which os.walk descends.
        dirnames.sort()
        for dirname in dirnames:
            # Join with the directory currently being walked, not with the
            # working directory or the top folder, so nested folders resolve.
            subdir = os.path.join(root, dirname)
            for name in sorted(os.listdir(subdir)):
                path = os.path.join(subdir, name)
                if os.path.isfile(path):  # listdir also returns directories
                    yield path


def count_words(paths):
    """Return a Counter of whitespace-separated tokens found in *paths*.

    Each file is split with ``str.split()``, so runs of blanks, tabs and line
    endings all act as a single separator and never become part of a word.
    """
    counts = Counter()
    for path in paths:
        # surrogateescape lets bytes that are not valid UTF-8 pass through
        # instead of aborting the whole run with a UnicodeDecodeError.
        with open(path, "r", encoding="utf-8", errors="surrogateescape") as data:
            for line in data:
                counts.update(line.split())
    return counts


def write_dictionary(counts, stream):
    """Write one ``word count`` line per entry of *counts* to *stream*."""
    stream.writelines(
        "{} {}\n".format(word, count) for word, count in counts.items()
    )


def main(argv=None):
    """Build the dictionary for ``argv[1]`` and write it to ``argv[2]``.

    *argv* defaults to ``sys.argv``. Returns the process exit status: 0 on
    success, 1 when the dataset folder does not exist, 2 when arguments are
    missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.stderr.write("usage: python <script> <dataset-folder> <output-file>\n")
        return 2
    top, out_path = argv[1], argv[2]
    if not os.path.isdir(top):
        sys.stderr.write("error: not a directory: {}\n".format(top))
        return 1
    # The output file is opened before any input is read so that an unwritable
    # destination fails immediately rather than after the whole scan.
    with open(out_path, "w", encoding="utf-8", errors="surrogateescape") as fdict:
        write_dictionary(count_words(iter_data_files(top)), fdict)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment