Created
May 23, 2016 14:31
-
-
Save joest67/d4962b81176b8fc7269cf39de9552a64 to your computer and use it in GitHub Desktop.
We can make this file beautiful and searchable if this error is corrected: It looks like row 3 should actually have 8 columns, instead of 7 in line 2.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
430972,5037,6,52,2600.0,0,用,c | |
352986,4498,5,12,600.0,0,对,- | |
1399015,26982,569,12,600.0,0,- | |
678560,2954497,11,9,450.0,1,用,扫 | |
331,18779,2,9,450.0,0,用,地址,-,win | |
135200,3568182,2,6,300.0,1,端口,路由,转发 | |
992422,21469,36,6,300.0,0,脚本,存,window,调用 | |
142786,26725,3,6,300.0,0,网,地址,- | |
281,18243,2,5,250.0,0,屏蔽,端口,路由,路由器 | |
701894,22114,12,4,200.0,0,用 | |
175459,2953509,3,3,150.0,1,扫,端口,过 | |
353306,22192,5,2,100.0,0,对 | |
308691,3140498,4,2,100.0,1,地址,ok | |
867063,3573899,20,1,50.0,1,肉鸡 | |
238,17821,2,1,50.0,0,会 | |
41083,2943788,2,1,50.0,1,放 | |
431330,24388,6,1,50.0,0,用户 | |
429021,3578509,5,1,50.0,1,入侵 | |
1409102,3569938,619,1,50.0,1,防护 | |
215532,3185471,3,1,50.0,1,点 | |
761486,3152196,14,0,0.0,1 | |
1183309,9020,110,0,0.0,0 | |
562406,2964500,8,0,0.0,1 | |
1091680,14378,61,0,0.0,0 | |
841050,10781,19,0,0.0,0 | |
1167932,9006,99,0,0.0,0 | |
431300,23607,6,0,0.0,0 | |
967507,25445,32,0,0.0,0 | |
333992,3416846,4,0,0.0,1 | |
431306,23834,6,0,0.0,0 | |
1369255,26982,443,0,0.0,0 | |
91997,3183133,2,0,0.0,1 | |
841013,5099,19,0,0.0,0 | |
1052881,24048,49,0,0.0,0 | |
1396025,26982,555,0,0.0,0 | |
1003482,5099,38,0,0.0,0 | |
1060222,20082,51,0,0.0,0 | |
1298224,9006,254,0,0.0,0 | |
142589,23834,3,0,0.0,0 | |
596679,7937,9,0,0.0,0 | |
1184613,9006,111,0,0.0,0 | |
773607,26932,15,0,0.0,0 | |
1070490,13938,54,0,0.0,0 | |
752259,8017,14,0,0.0,0 | |
142486,22262,3,0,0.0,0 | |
953309,8017,30,0,0.0,0 | |
1252167,26982,180,0,0.0,0 | |
259016,28402,4,0,0.0,0 | |
792768,21243,16,0,0.0,0 | |
792785,23153,16,0,0.0,0 | |
986516,9202,35,0,0.0,0 | |
1293365,26692,245,0,0.0,0 | |
192035,3050687,3,0,0.0,1 | |
1036872,8863,45,0,0.0,0 | |
1097117,7937,63,0,0.0,0 | |
550381,8095,8,0,0.0,0 | |
518078,3086023,7,0,0.0,1 | |
416,20281,2,0,0.0,0 | |
752409,27074,14,0,0.0,0 | |
582093,3306162,8,0,0.0,1 | |
431098,17350,6,0,0.0,0 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
usage: | |
python sample.py ddos_ch_200.csv hackbase_sample0.csv 123.csv | |
""" | |
import csv | |
import argparse | |
from collections import ( | |
defaultdict, | |
namedtuple | |
) | |
import jieba | |
# Column layout of one row read from the test CSV; process_data() builds
# one SourceFrame per input row, in this column order.
SourceFrame = namedtuple(
    'SourceFrame',
    'index_key postid floor sentence manual_related',
)
# Per-row scoring result; the last field holds the list of matched tokens
# and is expanded into trailing columns when the result CSV is written.
DataFrame = namedtuple(
    'DataFrame',
    'postid floor freq score manual keywords',
)
class KeyWordFrequentClient(object):
    """Score the sentences of a CSV file against a weighted keyword list.

    The keyword list ("sample file") is a CSV whose first two columns are a
    keyword and its numeric weight.  The test file is a CSV whose rows follow
    the ``SourceFrame`` column order.  Every sentence is tokenized with
    ``jieba.cut_for_search``; each token that hits a keyword adds one to the
    row's frequency and the keyword's weight to the row's score.  Results are
    written to ``result_filepath`` sorted by score, highest first.

    NOTE(review): the file I/O in ``run``/``write_result`` follows the
    Python 2 ``csv`` conventions (byte-string rows, binary-mode writer).  The
    pure matching/scoring helpers are written to be version-neutral.
    """

    # Encodings tried, in this order, when decoding a byte-string keyword.
    encodings = ['utf-8', 'gbk']

    def __init__(self, sample_file, test_filepath, result_filepath,
                 strict_mode=True):
        """Remember the three file paths and the matching mode.

        :param sample_file: path of the keyword/weight CSV.
        :param test_filepath: path of the CSV with the sentences to score.
        :param result_filepath: path the scored CSV is written to.
        :param strict_mode: when true a token must equal a keyword to count
            as a hit; otherwise a token that is a substring of the keyword
            also counts.
        """
        self.sample_file = sample_file
        self.test_filepath = test_filepath
        self.result_filepath = result_filepath
        self.strict_mode = strict_mode

    def guess_encoding(self, word):
        """Return the first entry of ``encodings`` that decodes ``word``.

        :param word: a byte string.
        :returns: the encoding name, or ``None`` when none of them applies.
        """
        for encoding in self.encodings:
            try:
                word.decode(encoding)
            except UnicodeError:
                # Only a failed decode means "try the next encoding";
                # anything else is a real error and must not be hidden.
                continue
            return encoding
        return None

    def init_sample_data(self, sample_input_file):
        """Build the ``keyword -> weight`` map from the sample CSV.

        :param sample_input_file: an open file (or any iterable of lines)
            holding ``keyword,weight[,...]`` rows.  Columns after the weight
            are ignored and blank lines are skipped.
        :returns: a ``defaultdict`` mapping each keyword to ``float(weight)``.
        :raises ValueError: if a row has no weight column, the weight is not
            a number, or a byte-string keyword cannot be decoded with any of
            ``encodings``.
        """
        keyword_score_map = defaultdict(int)
        for row in csv.reader(sample_input_file):
            if not row:
                continue  # csv.reader yields [] for blank lines
            word, weight = row[:2]
            # Under Python 2 the csv module yields byte strings; decode them
            # so they compare equal to the text tokens used for matching.
            if isinstance(word, bytes):
                encoding = self.guess_encoding(word)
                if encoding is None:
                    raise ValueError(
                        'cannot decode keyword %r with any of %r'
                        % (word, self.encodings)
                    )
                word = word.decode(encoding)
            keyword_score_map[word] = float(weight)
        return keyword_score_map

    def hit_keywords(self, keyword, candidates):
        """Return the candidates that hit ``keyword``, duplicates included.

        In strict mode a candidate hits when it equals the keyword; otherwise
        it hits when it is contained in the keyword (``candidate in keyword``).

        :param keyword: the keyword to match against.
        :param candidates: iterable of tokens.
        :returns: list of the hitting tokens, in input order.
        """
        if self.strict_mode:
            return [part for part in candidates if part == keyword]
        return [part for part in candidates if part in keyword]

    def calculate_sentence_score(self, sentence, keyword_score_map):
        """Tokenize ``sentence`` and score it against every keyword.

        :param sentence: the text to score (original author's note: only
            Chinese sentences).
        :param keyword_score_map: mapping of keyword to weight.
        :returns: ``(freq, score, keywords)`` where ``freq`` is the total
            number of hits, ``score`` the sum of ``hits * weight`` over all
            keywords and ``keywords`` the distinct hitting tokens, sorted so
            that repeated runs produce identical output.
        """
        parts = list(jieba.cut_for_search(sentence))
        score = 0
        freq = 0
        matched = set()
        for keyword, weight in keyword_score_map.items():
            hit_parts = self.hit_keywords(keyword, parts)
            freq += len(hit_parts)
            score += len(hit_parts) * weight
            matched.update(hit_parts)
        return freq, score, sorted(matched)

    def process_data(self, keyword_score_map, input_file):
        """Score every usable row of the test CSV.

        Rows with fewer columns than ``SourceFrame`` are skipped; columns
        beyond the ones ``SourceFrame`` names are ignored.

        :param keyword_score_map: mapping of keyword to weight.
        :param input_file: an open file (or iterable of lines) of CSV rows.
        :returns: dict mapping each row's ``index_key`` to a ``DataFrame``.
            A later row with the same key replaces an earlier one.
        """
        res = {}
        field_count = len(SourceFrame._fields)
        for line in csv.reader(input_file):
            if len(line) < field_count:
                continue
            source_frame = SourceFrame(*line[:field_count])
            freq, score, hit_keywords = self.calculate_sentence_score(
                source_frame.sentence,
                keyword_score_map
            )
            res[source_frame.index_key] = DataFrame(
                source_frame.postid, source_frame.floor, freq, score,
                source_frame.manual_related, hit_keywords,
            )
        return res

    def write_result(self, res):
        """Write ``res`` to ``result_filepath``, highest score first.

        Each output row is the index key, every frame field except the last,
        then one column per matched keyword (UTF-8 encoded).

        :param res: dict of ``index_key -> DataFrame`` as returned by
            ``process_data``.
        """
        # sort by score default
        sorted_line_map = sorted(res.items(),
                                 key=lambda item: item[1].score, reverse=True)
        # Binary mode + encoded keywords: Python 2 csv.writer convention.
        with open(self.result_filepath, 'wb') as f:
            csv_writer = csv.writer(f)
            for idx, frame in sorted_line_map:
                keywords = [k.encode('utf-8') for k in frame[-1]]
                csv_writer.writerow([idx] + list(frame[:-1]) + keywords)

    def run(self):
        """Load the keywords, score the test file and write the result."""
        with open(self.sample_file) as sample_input:
            keyword_score_map = self.init_sample_data(sample_input)
        with open(self.test_filepath) as input_file:
            res = self.process_data(keyword_score_map, input_file)
        self.write_result(res)
def main(argv=None):
    """Command-line entry point: parse arguments and run the scoring client.

    :param argv: optional list of argument strings; when ``None`` argparse
        reads the process command line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('sample_file', help='sample file path')
    parser.add_argument('test_file', help='test file path')
    parser.add_argument('save_filepath', help='result file path')
    # store_true so that passing -s actually switches strict matching on;
    # with store_false and default=False the flag could never become True.
    parser.add_argument('-s', '--strict-mode', action='store_true',
                        default=False, help='exact equal with sample keyword')
    args = parser.parse_args(argv)
    print(args)
    client = KeyWordFrequentClient(
        args.sample_file, args.test_file, args.save_filepath,
        args.strict_mode
    )
    client.run()
    print('done')
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment