joest67 · May 23, 2016 14:31
diff --git a/123.csv b/123.csv
 430972,5037,6,52,2600.0,0,用,c
 352986,4498,5,12,600.0,0,对,-
 1399015,26982,569,12,600.0,0,-
 678560,2954497,11,9,450.0,1,用,扫
 331,18779,2,9,450.0,0,用,地址,-,win
 135200,3568182,2,6,300.0,1,端口,路由,转发
 992422,21469,36,6,300.0,0,脚本,存,window,调用
 142786,26725,3,6,300.0,0,网,地址,-
 281,18243,2,5,250.0,0,屏蔽,端口,路由,路由器
 701894,22114,12,4,200.0,0,用
 175459,2953509,3,3,150.0,1,扫,端口,过
 353306,22192,5,2,100.0,0,对
 308691,3140498,4,2,100.0,1,地址,ok
 867063,3573899,20,1,50.0,1,肉鸡
 238,17821,2,1,50.0,0,会
 41083,2943788,2,1,50.0,1,放
 431330,24388,6,1,50.0,0,用户
 429021,3578509,5,1,50.0,1,入侵
 1409102,3569938,619,1,50.0,1,防护
 215532,3185471,3,1,50.0,1,点
 761486,3152196,14,0,0.0,1
 1183309,9020,110,0,0.0,0
 562406,2964500,8,0,0.0,1
 1091680,14378,61,0,0.0,0
 841050,10781,19,0,0.0,0
 1167932,9006,99,0,0.0,0
 431300,23607,6,0,0.0,0
 967507,25445,32,0,0.0,0
 333992,3416846,4,0,0.0,1
 431306,23834,6,0,0.0,0
 1369255,26982,443,0,0.0,0
 91997,3183133,2,0,0.0,1
 841013,5099,19,0,0.0,0
 1052881,24048,49,0,0.0,0
 1396025,26982,555,0,0.0,0
 1003482,5099,38,0,0.0,0
 1060222,20082,51,0,0.0,0
 1298224,9006,254,0,0.0,0
 142589,23834,3,0,0.0,0
 596679,7937,9,0,0.0,0
 1184613,9006,111,0,0.0,0
 773607,26932,15,0,0.0,0
 1070490,13938,54,0,0.0,0
 752259,8017,14,0,0.0,0
 142486,22262,3,0,0.0,0
 953309,8017,30,0,0.0,0
 1252167,26982,180,0,0.0,0
 259016,28402,4,0,0.0,0
 792768,21243,16,0,0.0,0
 792785,23153,16,0,0.0,0
 986516,9202,35,0,0.0,0
 1293365,26692,245,0,0.0,0
 192035,3050687,3,0,0.0,1
 1036872,8863,45,0,0.0,0
 1097117,7937,63,0,0.0,0
 550381,8095,8,0,0.0,0
 518078,3086023,7,0,0.0,1
 416,20281,2,0,0.0,0
 752409,27074,14,0,0.0,0
 582093,3306162,8,0,0.0,1
 431098,17350,6,0,0.0,0
diff --git a/sample.py b/sample.py
 #! /usr/bin/env python
 # -*- coding: utf-8 -*-
 """
 usage:

 python sample.py ddos_ch_200.csv hackbase_sample0.csv 123.csv
 """
 import csv
 import argparse
 from collections import (
    defaultdict,
    namedtuple
 )

 import jieba

 # Column layout of one row of the test csv (rows are unpacked into it
 # positionally by KeyWordFrequentClient.process_data).
 SourceFrame = namedtuple(
    'SourceFrame',
    'index_key postid floor sentence manual_related'
 )
 # Scoring result stored per index_key; the last field holds the hit tokens.
 DataFrame = namedtuple(
    'DataFrame',
    'postid floor freq score manual keywords'
 )


 class KeyWordFrequentClient(object):
    """Score the sentences of a test csv by hits against sample keywords.

    The sample csv supplies ``word, weight, ...`` rows and the test csv
    supplies rows laid out like ``SourceFrame``.  Every sentence is split
    with ``jieba.cut_for_search``; each token that hits a keyword adds that
    keyword's weight to the sentence score.  ``run`` writes one csv row per
    ``index_key`` to ``result_filepath``, highest score first.
    """

    # Encodings tried, in this order, when decoding a keyword byte string.
    encodings = ['utf-8', 'gbk']

    def __init__(self, sample_file, test_filepath, result_filepath,
                 strict_mode=True):
        """Store the file paths and the matching mode.

        :param sample_file: path of the keyword/weight csv
        :param test_filepath: path of the csv whose sentences are scored
        :param result_filepath: path the result csv is written to
        :param strict_mode: when true a token hits only if it equals the
            keyword; otherwise it hits if it is a substring of the keyword
        """
        self.sample_file = sample_file
        self.test_filepath = test_filepath
        self.result_filepath = result_filepath
        self.strict_mode = strict_mode

    def guess_encoding(self, word):
        """Return the first name in ``encodings`` that decodes ``word``.

        :param word: byte string to probe
        :returns: the encoding name, or ``None`` if no candidate fits
        """
        for candidate in self.encodings:
            try:
                word.decode(candidate)
            except UnicodeError:
                # Only a failed decode means "try the next encoding"; any
                # other error is a real bug and must not be swallowed.
                continue
            return candidate
        return None

    def init_sample_data(self, sample_input_file):
        """Build the keyword -> weight map from the sample csv.

        Only the first two columns (word, weight) of each row are used.
        Byte-string words are decoded to text with the encoding found by
        ``guess_encoding``.

        :param sample_input_file: iterable of csv lines (an open file)
        :returns: ``defaultdict`` mapping keyword to ``float`` weight
        :raises ValueError: if a keyword matches none of ``encodings`` or a
            weight is not a number
        """
        keyword_score_map = defaultdict(int)
        for row in csv.reader(sample_input_file):
            if len(row) < 2:
                # Blank lines come back as [] from csv.reader; rows without
                # a weight column carry nothing to score with.
                continue
            word, weight = row[0], row[1]
            if isinstance(word, bytes):
                encoding = self.guess_encoding(word)
                if encoding is None:
                    raise ValueError(
                        'cannot decode keyword %r with any of %r'
                        % (word, self.encodings)
                    )
                word = word.decode(encoding)
            keyword_score_map[word] = float(weight)
        return keyword_score_map

    def hit_keywords(self, keyword, candidates):
        """Return the candidates that hit ``keyword``, in input order.

        :param keyword: one sample keyword
        :param candidates: tokens of the sentence being scored
        :returns: list of matching tokens (duplicates are kept, so its
            length is the hit count)
        """
        if self.strict_mode:
            return [c for c in candidates if c == keyword]
        # Non-strict: a token hits when it occurs inside the keyword.
        return [c for c in candidates if c in keyword]

    def calculate_sentence_score(self, sentence, keyword_score_map):
        """Tokenise ``sentence`` and total its keyword hits.

        :param sentence: text to score
        :param keyword_score_map: keyword -> weight mapping
        :returns: ``(freq, score, hit_tokens)`` where ``freq`` is the number
            of hits, ``score`` the weighted sum and ``hit_tokens`` the
            distinct tokens that hit at least one keyword
        """
        # Original author's note: only Chinese sentences are expected here.
        parts = list(jieba.cut_for_search(sentence))
        score = 0
        freq = 0
        hit_keywords = set()
        for keyword, weight in keyword_score_map.items():
            hit_parts = self.hit_keywords(keyword, parts)
            freq += len(hit_parts)
            score += len(hit_parts) * weight
            hit_keywords.update(hit_parts)
        return freq, score, list(hit_keywords)

    def process_data(self, keyword_score_map, input_file):
        """Score every row of the test csv.

        Rows with fewer columns than ``SourceFrame`` has fields are skipped.
        A later row with the same ``index_key`` replaces an earlier one.

        :param keyword_score_map: keyword -> weight mapping
        :param input_file: iterable of csv lines (an open file)
        :returns: dict mapping ``index_key`` to ``DataFrame``
        """
        field_count = len(SourceFrame._fields)
        res = {}
        for line in csv.reader(input_file):
            if len(line) < field_count:
                continue
            source_frame = SourceFrame(*line)
            freq, score, hit_keywords = self.calculate_sentence_score(
                source_frame.sentence,
                keyword_score_map
            )
            res[source_frame.index_key] = DataFrame(
                source_frame.postid, source_frame.floor, freq, score,
                source_frame.manual_related, hit_keywords,
            )
        return res

    def write_result(self, res):
        """Write ``res`` to ``result_filepath`` as csv, highest score first.

        Each output row is ``index_key`` followed by every ``DataFrame``
        field except the last, then one utf-8 encoded column per hit token.

        :param res: dict mapping ``index_key`` to ``DataFrame``
        """
        ranked = sorted(res.items(),
                        key=lambda item: item[1].score, reverse=True)
        # Binary mode is what the Python 2 csv writer expects.
        with open(self.result_filepath, 'wb') as f:
            csv_writer = csv.writer(f)
            for idx, frame in ranked:
                keywords = [k.encode('utf-8') for k in frame.keywords]
                csv_writer.writerow([idx] + list(frame[:-1]) + keywords)

    def run(self):
        """Load the keywords, score the test file and write the result."""
        with open(self.sample_file) as sample_input:
            keyword_score_map = self.init_sample_data(sample_input)
        with open(self.test_filepath) as input_file:
            res = self.process_data(keyword_score_map, input_file)
        self.write_result(res)


 def main():
    """Parse the command line and run the keyword scoring client.

    Positional arguments are the sample (keyword) csv, the test csv and the
    path of the result csv.  ``-s/--strict-mode`` switches token matching
    to exact equality; without it substring matching is used.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('sample_file', help='sample file path')
    parser.add_argument('test_file', help='test file path')
    parser.add_argument('save_filepath', help='result file path')
    # store_true: with store_false and default=False the option could never
    # evaluate to True, so strict mode was unreachable from the command line.
    parser.add_argument('-s', '--strict-mode', action='store_true',
                        default=False, help='exact equal with sample keyword')
    args = parser.parse_args()
    print(args)
    client = KeyWordFrequentClient(
        args.sample_file, args.test_file, args.save_filepath,
        args.strict_mode
    )
    client.run()
    print('done')


 # Run the command-line entry point only when executed as a script.
 if __name__ == "__main__":
    main()
	430972,5037,6,52,2600.0,0,用,c
	352986,4498,5,12,600.0,0,对,-
	1399015,26982,569,12,600.0,0,-
	678560,2954497,11,9,450.0,1,用,扫
	331,18779,2,9,450.0,0,用,地址,-,win
	135200,3568182,2,6,300.0,1,端口,路由,转发
	992422,21469,36,6,300.0,0,脚本,存,window,调用
	142786,26725,3,6,300.0,0,网,地址,-
	281,18243,2,5,250.0,0,屏蔽,端口,路由,路由器
	701894,22114,12,4,200.0,0,用
	175459,2953509,3,3,150.0,1,扫,端口,过
	353306,22192,5,2,100.0,0,对
	308691,3140498,4,2,100.0,1,地址,ok
	867063,3573899,20,1,50.0,1,肉鸡
	238,17821,2,1,50.0,0,会
	41083,2943788,2,1,50.0,1,放
	431330,24388,6,1,50.0,0,用户
	429021,3578509,5,1,50.0,1,入侵
	1409102,3569938,619,1,50.0,1,防护
	215532,3185471,3,1,50.0,1,点
	761486,3152196,14,0,0.0,1
	1183309,9020,110,0,0.0,0
	562406,2964500,8,0,0.0,1
	1091680,14378,61,0,0.0,0
	841050,10781,19,0,0.0,0
	1167932,9006,99,0,0.0,0
	431300,23607,6,0,0.0,0
	967507,25445,32,0,0.0,0
	333992,3416846,4,0,0.0,1
	431306,23834,6,0,0.0,0
	1369255,26982,443,0,0.0,0
	91997,3183133,2,0,0.0,1
	841013,5099,19,0,0.0,0
	1052881,24048,49,0,0.0,0
	1396025,26982,555,0,0.0,0
	1003482,5099,38,0,0.0,0
	1060222,20082,51,0,0.0,0
	1298224,9006,254,0,0.0,0
	142589,23834,3,0,0.0,0
	596679,7937,9,0,0.0,0
	1184613,9006,111,0,0.0,0
	773607,26932,15,0,0.0,0
	1070490,13938,54,0,0.0,0
	752259,8017,14,0,0.0,0
	142486,22262,3,0,0.0,0
	953309,8017,30,0,0.0,0
	1252167,26982,180,0,0.0,0
	259016,28402,4,0,0.0,0
	792768,21243,16,0,0.0,0
	792785,23153,16,0,0.0,0
	986516,9202,35,0,0.0,0
	1293365,26692,245,0,0.0,0
	192035,3050687,3,0,0.0,1
	1036872,8863,45,0,0.0,0
	1097117,7937,63,0,0.0,0
	550381,8095,8,0,0.0,0
	518078,3086023,7,0,0.0,1
	416,20281,2,0,0.0,0
	752409,27074,14,0,0.0,0
	582093,3306162,8,0,0.0,1
	431098,17350,6,0,0.0,0
	#! /usr/bin/env python
	# -*- coding: utf-8 -*-
	"""
	usage:

	python sample.py ddos_ch_200.csv hackbase_sample0.csv 123.csv
	"""
	import csv
	import argparse
	from collections import (
	defaultdict,
	namedtuple
	)

	import jieba

	# Column layout of one row of the test csv (rows are unpacked into it
	# positionally by KeyWordFrequentClient.process_data).
	SourceFrame = namedtuple(
	    'SourceFrame',
	    'index_key postid floor sentence manual_related'
	)
	# Scoring result stored per index_key; the last field holds the hit tokens.
	DataFrame = namedtuple(
	    'DataFrame',
	    'postid floor freq score manual keywords'
	)


	class KeyWordFrequentClient(object):
	    """Score the sentences of a test csv by hits against sample keywords.

	    The sample csv supplies ``word, weight, ...`` rows and the test csv
	    supplies rows laid out like ``SourceFrame``.  Every sentence is split
	    with ``jieba.cut_for_search``; each token that hits a keyword adds that
	    keyword's weight to the sentence score.  ``run`` writes one csv row per
	    ``index_key`` to ``result_filepath``, highest score first.
	    """

	    # Encodings tried, in this order, when decoding a keyword byte string.
	    encodings = ['utf-8', 'gbk']

	    def __init__(self, sample_file, test_filepath, result_filepath,
	                 strict_mode=True):
	        """Store the file paths and the matching mode.

	        :param sample_file: path of the keyword/weight csv
	        :param test_filepath: path of the csv whose sentences are scored
	        :param result_filepath: path the result csv is written to
	        :param strict_mode: when true a token hits only if it equals the
	            keyword; otherwise it hits if it is a substring of the keyword
	        """
	        self.sample_file = sample_file
	        self.test_filepath = test_filepath
	        self.result_filepath = result_filepath
	        self.strict_mode = strict_mode

	    def guess_encoding(self, word):
	        """Return the first name in ``encodings`` that decodes ``word``.

	        :param word: byte string to probe
	        :returns: the encoding name, or ``None`` if no candidate fits
	        """
	        for candidate in self.encodings:
	            try:
	                word.decode(candidate)
	            except UnicodeError:
	                # Only a failed decode means "try the next encoding"; any
	                # other error is a real bug and must not be swallowed.
	                continue
	            return candidate
	        return None

	    def init_sample_data(self, sample_input_file):
	        """Build the keyword -> weight map from the sample csv.

	        Only the first two columns (word, weight) of each row are used.
	        Byte-string words are decoded to text with the encoding found by
	        ``guess_encoding``.

	        :param sample_input_file: iterable of csv lines (an open file)
	        :returns: ``defaultdict`` mapping keyword to ``float`` weight
	        :raises ValueError: if a keyword matches none of ``encodings`` or a
	            weight is not a number
	        """
	        keyword_score_map = defaultdict(int)
	        for row in csv.reader(sample_input_file):
	            if len(row) < 2:
	                # Blank lines come back as [] from csv.reader; rows without
	                # a weight column carry nothing to score with.
	                continue
	            word, weight = row[0], row[1]
	            if isinstance(word, bytes):
	                encoding = self.guess_encoding(word)
	                if encoding is None:
	                    raise ValueError(
	                        'cannot decode keyword %r with any of %r'
	                        % (word, self.encodings)
	                    )
	                word = word.decode(encoding)
	            keyword_score_map[word] = float(weight)
	        return keyword_score_map

	    def hit_keywords(self, keyword, candidates):
	        """Return the candidates that hit ``keyword``, in input order.

	        :param keyword: one sample keyword
	        :param candidates: tokens of the sentence being scored
	        :returns: list of matching tokens (duplicates are kept, so its
	            length is the hit count)
	        """
	        if self.strict_mode:
	            return [c for c in candidates if c == keyword]
	        # Non-strict: a token hits when it occurs inside the keyword.
	        return [c for c in candidates if c in keyword]

	    def calculate_sentence_score(self, sentence, keyword_score_map):
	        """Tokenise ``sentence`` and total its keyword hits.

	        :param sentence: text to score
	        :param keyword_score_map: keyword -> weight mapping
	        :returns: ``(freq, score, hit_tokens)`` where ``freq`` is the number
	            of hits, ``score`` the weighted sum and ``hit_tokens`` the
	            distinct tokens that hit at least one keyword
	        """
	        # Original author's note: only Chinese sentences are expected here.
	        parts = list(jieba.cut_for_search(sentence))
	        score = 0
	        freq = 0
	        hit_keywords = set()
	        for keyword, weight in keyword_score_map.items():
	            hit_parts = self.hit_keywords(keyword, parts)
	            freq += len(hit_parts)
	            score += len(hit_parts) * weight
	            hit_keywords.update(hit_parts)
	        return freq, score, list(hit_keywords)

	    def process_data(self, keyword_score_map, input_file):
	        """Score every row of the test csv.

	        Rows with fewer columns than ``SourceFrame`` has fields are skipped.
	        A later row with the same ``index_key`` replaces an earlier one.

	        :param keyword_score_map: keyword -> weight mapping
	        :param input_file: iterable of csv lines (an open file)
	        :returns: dict mapping ``index_key`` to ``DataFrame``
	        """
	        field_count = len(SourceFrame._fields)
	        res = {}
	        for line in csv.reader(input_file):
	            if len(line) < field_count:
	                continue
	            source_frame = SourceFrame(*line)
	            freq, score, hit_keywords = self.calculate_sentence_score(
	                source_frame.sentence,
	                keyword_score_map
	            )
	            res[source_frame.index_key] = DataFrame(
	                source_frame.postid, source_frame.floor, freq, score,
	                source_frame.manual_related, hit_keywords,
	            )
	        return res

	    def write_result(self, res):
	        """Write ``res`` to ``result_filepath`` as csv, highest score first.

	        Each output row is ``index_key`` followed by every ``DataFrame``
	        field except the last, then one utf-8 encoded column per hit token.

	        :param res: dict mapping ``index_key`` to ``DataFrame``
	        """
	        ranked = sorted(res.items(),
	                        key=lambda item: item[1].score, reverse=True)
	        # Binary mode is what the Python 2 csv writer expects.
	        with open(self.result_filepath, 'wb') as f:
	            csv_writer = csv.writer(f)
	            for idx, frame in ranked:
	                keywords = [k.encode('utf-8') for k in frame.keywords]
	                csv_writer.writerow([idx] + list(frame[:-1]) + keywords)

	    def run(self):
	        """Load the keywords, score the test file and write the result."""
	        with open(self.sample_file) as sample_input:
	            keyword_score_map = self.init_sample_data(sample_input)
	        with open(self.test_filepath) as input_file:
	            res = self.process_data(keyword_score_map, input_file)
	        self.write_result(res)


	def main():
	    """Parse the command line and run the keyword scoring client.

	    Positional arguments are the sample (keyword) csv, the test csv and the
	    path of the result csv.  ``-s/--strict-mode`` switches token matching
	    to exact equality; without it substring matching is used.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('sample_file', help='sample file path')
	    parser.add_argument('test_file', help='test file path')
	    parser.add_argument('save_filepath', help='result file path')
	    # store_true: with store_false and default=False the option could never
	    # evaluate to True, so strict mode was unreachable from the command line.
	    parser.add_argument('-s', '--strict-mode', action='store_true',
	                        default=False, help='exact equal with sample keyword')
	    args = parser.parse_args()
	    print(args)
	    client = KeyWordFrequentClient(
	        args.sample_file, args.test_file, args.save_filepath,
	        args.strict_mode
	    )
	    client.run()
	    print('done')


	# Run the command-line entry point only when executed as a script.
	if __name__ == "__main__":
	    main()