yuchen · October 16, 2013 11:35
diff --git a/BeyesianAvg.py b/BeyesianAvg.py
 # -*- coding=utf-8 -*-
 import collections

 # Usage:
 #   Write the output of WordsDetector.py to files,
 #   then put the file names in the `names` list below and run this program.

 # Files holding WordsDetector.py output: one "<word> <count>" pair per line.
 names = ['name0',
          'name1',
          'name2',
          'name3']

 # Per-file word counts, keyed by file name.
 words = {name: collections.Counter() for name in names}
 # Word counts summed over all files.
 total_words = collections.Counter()

 for name in names:
    # `with` closes the file even if a line fails to parse.
    with open(name) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of failing to unpack them.
                continue
            word, freq = fields
            words[name][word] += int(freq)
    total_words += words[name]

 # Bayesian-averaged score of every word, per file, keyed by file name.
 ps = {name: collections.defaultdict(int) for name in names}

 for name in names:
    # A single parenthesised argument prints identically whether `print`
    # is a statement (Python 2) or a function (Python 3).
    print(name)
    counts = words[name]
    if not counts:
        # Nothing was counted for this file; the averages below would
        # otherwise divide by zero.
        continue
    cnt = float(len(counts))
    # Prior weight: mean all-files count of the words seen in this file.
    total = sum(total_words[word] for word in counts) / cnt
    # Prior mean: average fraction of a word's occurrences found in this file.
    avg = sum(float(freq) / total_words[word]
              for word, freq in counts.items()) / cnt
    avg_times_total = total * avg
    for word, freq in counts.items():
        ps[name][word] = (float(freq) + avg_times_total) / (total_words[word] + total)
    # Highest score first; only the top ten are reported.
    word_list = sorted(counts, key=ps[name].get, reverse=True)
    for word in word_list[:10]:
        print('*  %s %s' % (word, ps[name][word]))
diff --git a/WordsDetector.py b/WordsDetector.py
 # -*- coding=utf-8 -*-
 import feedparser
 import re
 import collections
 import math

 def info_entropy(words):
    """Return the Shannon entropy (natural logarithm) of a count mapping.

    words: mapping from item to occurrence count, e.g. a
        collections.Counter.

    Returns 0 for an empty mapping. Entries with a zero count are skipped:
    they contribute nothing to the entropy and log(0) is undefined.
    """
    total = sum(words.values())
    result = 0
    for cnt in words.values():
        if not cnt:
            continue
        p = float(cnt) / total
        result -= p * math.log(p)
    return result

 # Longest substring, in characters, that is counted as a candidate word.
 max_word_len = 5
 # Neighbour-entropy cut-off compared against info_entropy() results.
 entropy_threshold = 1

 content = []
 articles = feedparser.parse('http://www.liyaos.com/blog/feed')
 for article in articles.entries:
    content.append(article.title)
    # Cut the description at HTML tags and &nbsp; entities, keeping the text.
    content.extend(re.split(r'<.*?>|&nbsp;', article.description, flags=re.UNICODE))
 # replace above line with this if using ATOM:
 #   try:
 #       s = article.content[0]['value']
 #   except AttributeError:
 #       try:
 #           s = article.summary
 #       except AttributeError:
 #           s = ''
 #   content.extend(re.split(r'<.*?>|&nbsp;', s, flags=re.UNICODE))
 # NOTE(review): fragments are joined with no separator, so text on either
 # side of a removed tag becomes adjacent and can form n-grams across it.
 content = u''.join(content)
 # Sentences are the pieces left after cutting at runs of non-word
 # characters and at runs of ASCII letters/digits.
 sentences = re.split(r"\W+|[a-zA-Z0-9]+", content, flags=re.UNICODE)
 # Count every substring of length 1..max_word_len of every sentence.
 freq = collections.Counter()
 for sentence in sentences:
    length = len(sentence)
    for size in range(1, min(length, max_word_len) + 1):
        for start in range(length - size + 1):
            freq[sentence[start:start + size]] += 1
 total = sum(freq.values())
 # Relative frequency of each counted substring; unknown keys read as 0.
 ps = collections.defaultdict(int)
 for word, val in freq.items():
    ps[word] = float(val) / total

 # Candidate words: substrings of two or more characters, seen at least
 # 3 times, whose relative frequency is more than 100 times the largest
 # product of the relative frequencies of any two-part split.
 words = set()
 for word, word_p in ps.items():
    if len(word) < 2 or freq[word] < 3:
        continue
    best_split_p = max(ps[word[:cut]] * ps[word[cut:]]
                       for cut in range(1, len(word)))
    if word_p / best_split_p > 100:
        words.add(word)

 # Drop candidates whose neighbouring characters are too predictable.
 final_words = set()
 for word in words:
    # lf / rf stay True only while the word has never been found flush
    # against the left / right end of a sentence.
    lf = rf = True
    left_words = collections.Counter()
    right_words = collections.Counter()
    # Capture the optional neighbour on each side in its own group, so a
    # neighbour equal to the word's own first/last character is still
    # recognised as a neighbour. The word is escaped to match literally.
    pattern = re.compile(u'(.?)' + re.escape(word) + u'(.?)')
    for sentence in sentences:
        # Only the first occurrence in each sentence is inspected.
        match = pattern.search(sentence)
        if match is None:
            continue
        left, right = match.groups()
        if left:
            left_words[left] += 1
        else:
            lf = False
        if right:
            right_words[right] += 1
        else:
            rf = False
    left_info_entropy = info_entropy(left_words)
    right_info_entropy = info_entropy(right_words)
    if lf and left_words and left_info_entropy < entropy_threshold:
        continue
    if rf and right_words and right_info_entropy < entropy_threshold:
        continue
    final_words.add(word)
 # Most frequent words first.
 words_list = sorted(final_words, key=freq.get, reverse=True)
 for word in words_list:
    line = u'%s %d' % (word, freq[word])
    # Python 2 (str is bytes) gets explicit UTF-8 bytes, as before;
    # Python 3 prints the text directly.
    print(line if str is not bytes else line.encode('utf8'))
	# -*- coding=utf-8 -*-
	import collections

	# Usage:
	#   Write the output of WordsDetector.py to files,
	#   then put the file names in the `names` list below and run this program.

	# Files holding WordsDetector.py output: one "<word> <count>" pair per line.
	names = ['name0',
	         'name1',
	         'name2',
	         'name3']

	# Per-file word counts, keyed by file name.
	words = {name: collections.Counter() for name in names}
	# Word counts summed over all files.
	total_words = collections.Counter()

	for name in names:
	    # `with` closes the file even if a line fails to parse.
	    with open(name) as f:
	        for line in f:
	            fields = line.split()
	            if not fields:
	                # Skip blank lines instead of failing to unpack them.
	                continue
	            word, freq = fields
	            words[name][word] += int(freq)
	    total_words += words[name]

	# Bayesian-averaged score of every word, per file, keyed by file name.
	ps = {name: collections.defaultdict(int) for name in names}

	for name in names:
	    # A single parenthesised argument prints identically whether `print`
	    # is a statement (Python 2) or a function (Python 3).
	    print(name)
	    counts = words[name]
	    if not counts:
	        # Nothing was counted for this file; the averages below would
	        # otherwise divide by zero.
	        continue
	    cnt = float(len(counts))
	    # Prior weight: mean all-files count of the words seen in this file.
	    total = sum(total_words[word] for word in counts) / cnt
	    # Prior mean: average fraction of a word's occurrences found in this file.
	    avg = sum(float(freq) / total_words[word]
	              for word, freq in counts.items()) / cnt
	    avg_times_total = total * avg
	    for word, freq in counts.items():
	        ps[name][word] = (float(freq) + avg_times_total) / (total_words[word] + total)
	    # Highest score first; only the top ten are reported.
	    word_list = sorted(counts, key=ps[name].get, reverse=True)
	    for word in word_list[:10]:
	        print('*  %s %s' % (word, ps[name][word]))
	# -*- coding=utf-8 -*-
	import feedparser
	import re
	import collections
	import math

	def info_entropy(words):
	    """Return the Shannon entropy (natural logarithm) of a count mapping.

	    words: mapping from item to occurrence count, e.g. a
	        collections.Counter.

	    Returns 0 for an empty mapping. Entries with a zero count are skipped:
	    they contribute nothing to the entropy and log(0) is undefined.
	    """
	    total = sum(words.values())
	    result = 0
	    for cnt in words.values():
	        if not cnt:
	            continue
	        p = float(cnt) / total
	        result -= p * math.log(p)
	    return result

	# Longest substring, in characters, that is counted as a candidate word.
	max_word_len = 5
	# Neighbour-entropy cut-off compared against info_entropy() results.
	entropy_threshold = 1

	content = []
	articles = feedparser.parse('http://www.liyaos.com/blog/feed')
	for article in articles.entries:
	    content.append(article.title)
	    # Cut the description at HTML tags and &nbsp; entities, keeping the
	    # text. The alternation bar must be unescaped: `\|` would only match
	    # a literal pipe character.
	    content.extend(re.split(r'<.*?>|&nbsp;', article.description, flags=re.UNICODE))
	# replace above line with this if using ATOM:
	#   try:
	#       s = article.content[0]['value']
	#   except AttributeError:
	#       try:
	#           s = article.summary
	#       except AttributeError:
	#           s = ''
	#   content.extend(re.split(r'<.*?>|&nbsp;', s, flags=re.UNICODE))
	# NOTE(review): fragments are joined with no separator, so text on either
	# side of a removed tag becomes adjacent and can form n-grams across it.
	content = u''.join(content)
	# Sentences are the pieces left after cutting at runs of non-word
	# characters and at runs of ASCII letters/digits.
	sentences = re.split(r"\W+|[a-zA-Z0-9]+", content, flags=re.UNICODE)
	# Count every substring of length 1..max_word_len of every sentence.
	freq = collections.Counter()
	for sentence in sentences:
	    length = len(sentence)
	    for size in range(1, min(length, max_word_len) + 1):
	        for start in range(length - size + 1):
	            freq[sentence[start:start + size]] += 1
	total = sum(freq.values())
	# Relative frequency of each counted substring; unknown keys read as 0.
	ps = collections.defaultdict(int)
	for word, val in freq.items():
	    ps[word] = float(val) / total

	# Candidate words: substrings of two or more characters, seen at least
	# 3 times, whose relative frequency is more than 100 times the largest
	# product of the relative frequencies of any two-part split.
	words = set()
	for word, word_p in ps.items():
	    if len(word) < 2 or freq[word] < 3:
	        continue
	    best_split_p = max(ps[word[:cut]] * ps[word[cut:]]
	                       for cut in range(1, len(word)))
	    if word_p / best_split_p > 100:
	        words.add(word)

	# Drop candidates whose neighbouring characters are too predictable.
	final_words = set()
	for word in words:
	    # lf / rf stay True only while the word has never been found flush
	    # against the left / right end of a sentence.
	    lf = rf = True
	    left_words = collections.Counter()
	    right_words = collections.Counter()
	    # Capture the optional neighbour on each side in its own group, so a
	    # neighbour equal to the word's own first/last character is still
	    # recognised as a neighbour. The word is escaped to match literally.
	    pattern = re.compile(u'(.?)' + re.escape(word) + u'(.?)')
	    for sentence in sentences:
	        # Only the first occurrence in each sentence is inspected.
	        match = pattern.search(sentence)
	        if match is None:
	            continue
	        left, right = match.groups()
	        if left:
	            left_words[left] += 1
	        else:
	            lf = False
	        if right:
	            right_words[right] += 1
	        else:
	            rf = False
	    left_info_entropy = info_entropy(left_words)
	    right_info_entropy = info_entropy(right_words)
	    if lf and left_words and left_info_entropy < entropy_threshold:
	        continue
	    if rf and right_words and right_info_entropy < entropy_threshold:
	        continue
	    final_words.add(word)
	# Most frequent words first.
	words_list = sorted(final_words, key=freq.get, reverse=True)
	for word in words_list:
	    line = u'%s %d' % (word, freq[word])
	    # Python 2 (str is bytes) gets explicit UTF-8 bytes, as before;
	    # Python 3 prints the text directly.
	    print(line if str is not bytes else line.encode('utf8'))