Created
November 11, 2012 08:31
-
-
Save royguo/4054164 to your computer and use it in GitHub Desktop.
NaiveBayes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
author: [email protected] | |
""" | |
import os | |
import random | |
import re | |
class DataPrepare(object):
    """Process raw documents to prepare training data for a machine-learning model.

    Every file in ``input_dir`` whose name contains ``<digits><category>``
    (e.g. ``1234business``) is read as UTF-8, split on whitespace, and written
    as one line to either the training or the test output file:

        <category> <word_id> <word_id> ... #<file_name>

    Word ids are assigned in first-seen order, starting at 1.

    The instance owns the two output files. Call :meth:`close` (or use the
    instance as a context manager) when done; ``__del__`` is kept only as a
    last-resort fallback because finalizers are not guaranteed to run.
    """

    # Finds "<digits><category>" anywhere in a file name, e.g. "1234business"
    # -> ("1234", "business").
    _NAME_PATTERN = re.compile(r'(\d+)(\w+)')

    def __init__(self, input_dir, train_data_file, test_data_file, train_file_percentage):
        """Open both output files for writing.

        Args:
            input_dir: Directory containing the raw document files.
            train_data_file: Path of the training output file (truncated).
            test_data_file: Path of the test output file (truncated).
            train_file_percentage: Threshold compared against
                ``random.random()``; a document goes to the training file
                when the random draw is below it (e.g. 0.8).
        """
        self.input_dir = input_dir
        self.train_file_percentage = train_file_percentage
        self.unique_words = []
        # Each word is represented by a numeric id so lookups are fast.
        self.word_ids = {}
        # Pre-set so close()/__del__ are safe even if an open() below fails.
        self.train_data_file = None
        self.test_data_file = None
        self.train_data_file = open(train_data_file, 'w')
        try:
            self.test_data_file = open(test_data_file, 'w')
        except Exception:
            # Do not leak the first handle when the second cannot be opened.
            self.train_data_file.close()
            raise

    def close(self):
        """Close both output files. Safe to call more than once."""
        for handle in (getattr(self, 'train_data_file', None),
                       getattr(self, 'test_data_file', None)):
            if handle is not None:
                handle.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __del__(self):
        # Fallback only; prefer an explicit close() or a ``with`` block.
        self.close()

    def prepare(self):
        """Convert every matching file in ``input_dir`` into one output line.

        File names without a ``<digits><category>`` part are skipped instead
        of aborting the run. Progress is printed every 100 files, followed by
        a final summary of files loaded and unique words found.
        """
        file_num = 0
        for file_name in os.listdir(self.input_dir):
            # e.g. "1234business" -> groups ("1234", "business")
            match = self._NAME_PATTERN.search(file_name)
            if match is None:
                continue
            category = match.group(2)

            # Randomly route the document to the training or the test set
            # according to train_file_percentage.
            if random.random() < self.train_file_percentage:
                output_file = self.train_data_file
            else:
                output_file = self.test_data_file

            # Read raw bytes and decode explicitly so the same code works on
            # both Python 2 and Python 3.
            with open(os.path.join(self.input_dir, file_name), 'rb') as f:
                words = f.read().decode('utf-8').split()

            fields = [category]
            for word in words:
                if word not in self.word_ids:
                    self.unique_words.append(word)
                    # A hash would also work; for simplicity the current list
                    # length is used as the id (it is unique as well).
                    self.word_ids[word] = len(self.unique_words)
                fields.append(str(self.word_ids[word]))
            fields.append('#' + file_name)
            output_file.write(' '.join(fields) + '\n')

            # There are many raw files, so report progress as we go.
            file_num += 1
            if file_num % 100 == 0:
                # Two spaces reproduce the original comma-separated print output.
                print('%d  files processed' % file_num)
        print('%d  files loaded!' % file_num)
        print('%d  unique words found!' % len(self.unique_words))
if __name__ == '__main__':
    # Script entry point: read the documents under ./newsdata and route each
    # one to news.train when a random draw is below 0.8, otherwise to news.test.
    dp = DataPrepare(
        input_dir='newsdata',
        train_data_file='news.train',
        test_data_file='news.test',
        train_file_percentage=0.8,
    )
    dp.prepare()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment