Created
December 19, 2012 21:35
-
-
Save alexeyproskuryakov/4340741 to your computer and use it in GitHub Desktop.
Нужно будет ещё переделать этот грёбаный рандомайзер. Он многое портит.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:utf-8 | |
import re | |
from random import uniform | |
from collections import defaultdict | |
from pymongo import MongoClient, ASCENDING | |
# Tokens that are attached to the preceding word without a space when a
# sentence is assembled; '$' is the sentence-boundary marker of the model.
sent_end = '.!?,;:$'
# Punctuation marks that separate clauses inside a sentence.
comma = ',;:'
# A token is either a run of Latin/Cyrillic letters, digits and hyphens,
# or a run of punctuation marks.
r_alphabet = re.compile(u'[a-zA-Zа-яёА-ЯЁ0-9-]+|[.,:;?!]+')
__doc__ = """
"""
class model(object):
    """Trigram store kept in the ``gramms`` collection of a MongoDB database.

    Documents written by this class have the form
    ``{'1': t0, '2': t1, 'sib': [{'3': t2, 'w': weight}, ...]}``: a bigram
    ``(t0, t1)`` plus the tokens recorded after it with their weights.
    """

    def __init__(self, host='localhost', port=27017, db_name='sentences'):
        """Connect to MongoDB and create the bigram index when it is missing.

        :param host: MongoDB host name.
        :param port: MongoDB port.
        :param db_name: database holding the ``gramms`` collection.
        """
        db = MongoClient(host, port)[db_name]
        self.gramms = db['gramms']
        # One index or none (presumably just the default ``_id`` index):
        # add the unique compound index over the two bigram fields.
        if len(self.gramms.index_information()) <= 1:
            self.gramms.create_index(
                [('1', ASCENDING), ('2', ASCENDING)], unique=True)

    def init_sub_model(self, words):
        """Load the trigrams that mention *words* into an in-memory model.

        :param words: iterable of tokens; every stored document whose first
            token, second token or any follower equals one of them is loaded.
        :returns: dict mapping ``(t0, t1)`` to a list of
            ``{'3': token, 'w': weight}`` follower entries.
        :raises Exception: when the loaded data has no ``('$', '$')`` start
            state or no follower that is listed in ``sent_end``.
        """
        sub_model = {}
        for word in words:
            query = {'$or': [{'1': word}, {'2': word}, {'sib.3': word}]}
            for el in self.gramms.find(query):
                sub_model[el['1'], el['2']] = el['sib']

        # Sanitize: keep a follower only when the bigram it leads to was
        # loaded as well.
        res_sub_model = {}
        is_good_data = False
        for key, followers in sub_model.items():
            for sib_el in followers:
                next_key = key[1], sib_el['3']
                if next_key in sub_model:
                    res_sub_model.setdefault(key, []).append(sib_el)
                # NOTE(review): the source arrived without indentation, so
                # whether this check was nested under the one above could
                # not be confirmed -- verify against the author's intent.
                if sib_el['3'] in sent_end and not is_good_data:
                    # First sentence-end follower seen: remember it and add
                    # a terminal state that leads to the '$' marker.
                    is_good_data = True
                    res_sub_model[sib_el['3'], '$'] = [{'3': '$', 'w': 1}]
        if ('$', '$') not in sub_model or not is_good_data:
            raise Exception('bad train data for this words')
        return res_sub_model

    def get_seq(self, t0, t1):
        """Return the follower list stored for bigram ``(t0, t1)``.

        :returns: the document's ``sib`` list, or ``None`` when no document
            exists for the bigram.
        """
        res = self.gramms.find_one({'1': t0, '2': t1})
        if res:
            return res['sib']
        return None

    def add_to_seq(self, t0, t1, token, weight):
        """Record *token* with *weight* as a follower of bigram ``(t0, t1)``.

        A single upsert appends to the existing document or creates it, so
        no separate lookup is needed and the write is atomic.
        """
        # NOTE(review): ``Collection.update`` is the legacy PyMongo call
        # this file already relied on; PyMongo 3+ offers ``update_one`` with
        # the same arguments -- switch when the driver is upgraded.
        self.gramms.update(
            {'1': t0, '2': t1},
            {'$push': {'sib': {'3': token, 'w': weight}}},
            upsert=True)
def gen_lines(corpus):
    """Yield the lines of the UTF-8 text file *corpus*, lower-cased.

    :param corpus: path of the corpus file.
    :returns: generator of decoded text lines (line endings are kept).
    """
    import io  # local import: only this function needs it

    # io.open decodes while reading on both Python 2 and Python 3, and the
    # ``with`` block closes the file once the generator finishes or is closed.
    with io.open(corpus, encoding='utf-8') as data:
        for line in data:
            yield line.lower()
def gen_tokens(lines):
    """Yield, in order, every token that ``r_alphabet`` finds in each line.

    :param lines: iterable of text lines.
    """
    for text in lines:
        found = r_alphabet.findall(text)
        for piece in found:
            yield piece
def gen_trigrams(tokens):
    """Yield ``(t0, t1, t2)`` trigrams from a token stream.

    Every sentence is padded with the boundary marker ``'$'``: it starts from
    the state ``('$', '$')``, and after a sentence-ending token two closing
    trigrams ``(t1, end, '$')`` and ``(end, '$', '$')`` are emitted before the
    state is reset.

    :param tokens: iterable of string tokens.
    """
    boundary = '$'
    t0, t1 = boundary, boundary
    for t2 in tokens:
        yield t0, t1, t2
        # ``endswith`` rather than ``t2 in '.!?'``: the latter is a substring
        # test, so punctuation runs such as '...' or '?!' never ended a
        # sentence.
        if t2.endswith(('.', '!', '?')):
            yield t1, t2, boundary
            yield t2, boundary, boundary
            t0, t1 = boundary, boundary
        else:
            t0, t1 = t1, t2
def train(corpus):
    """Count the trigrams of the text file *corpus* and store them in a model.

    The corpus is streamed through ``gen_lines``, ``gen_tokens`` and
    ``gen_trigrams``. Each distinct trigram is then written to a newly
    constructed ``model`` with the weight
    ``count(t0, t1, t2) / count(t0, t1)``.

    :param corpus: path of the corpus file.
    :returns: the ``model`` instance the trigrams were written to.
    """
    bigram_counts = defaultdict(float)
    trigram_counts = defaultdict(float)
    for t0, t1, t2 in gen_trigrams(gen_tokens(gen_lines(corpus))):
        bigram_counts[t0, t1] += 1
        trigram_counts[t0, t1, t2] += 1

    storage = model()
    for trigram, freq in trigram_counts.items():
        t0, t1, t2 = trigram
        storage.add_to_seq(t0, t1, t2, freq / bigram_counts[t0, t1])
    return storage
def generate_sentence(model):
    """Generate one sentence by a random walk over a trigram model.

    Starting from the state ``('$', '$')``, the next token is drawn with
    ``unirand`` from the followers of the current bigram until the ``'$'``
    marker is drawn, no token can be drawn, or the bigram is not in *model*.

    :param model: mapping ``(t0, t1) -> [{'3': token, 'w': weight}, ...]``.
    :returns: the generated text after ``capitalize()``; an empty string
        when *model* has no ``('$', '$')`` entry.
    """
    pieces = []
    t0, t1 = '$', '$'
    while (t0, t1) in model:
        # Each iteration shifts the window and draws the new last token.
        t0, t1 = t1, unirand(model[t0, t1])
        # ``None`` means nothing could be drawn (e.g. an empty follower
        # list); treat it like the end marker instead of failing below.
        if t1 is None or t1 == '$':
            break
        # No leading space for the first word or for punctuation. Checking
        # every character also covers runs such as '...', which a plain
        # ``t1 in sent_end`` substring test misses.
        if t0 == '$' or all(ch in sent_end for ch in t1):
            pieces.append(t1)
        else:
            pieces.append(' ' + t1)
    return ''.join(pieces).capitalize()
def unirand(seq):
    """Draw a token from *seq* at random, proportionally to its weight.

    :param seq: sequence of ``{'3': token, 'w': weight}`` entries.
    :returns: the ``'3'`` value of the drawn entry, or ``None`` when *seq*
        is empty.
    """
    total = sum(el['w'] for el in seq)
    rnd = uniform(0, total)
    cumulative = 0
    token = None
    for el in seq:
        token = el['3']
        cumulative += el['w']
        if rnd < cumulative:
            return token
    # ``uniform`` may return its upper end-point, and all weights may be
    # zero; fall back to the last token rather than returning None for a
    # non-empty sequence.
    return token
if __name__ == '__main__':
    # To (re)build the stored trigrams first, run e.g.
    # train('../data/data.txt').
    # The instance gets its own name so the ``model`` class is not rebound.
    storage = model()
    words = [u'любовь', u'деньги', u'я', u'шеф']
    sub_model = storage.init_sub_model(words)
    print(generate_sentence(sub_model))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
При изменении рандомайзера следует также учесть связи слов во входной выборке, чтобы предложение получалось связным, а не как сейчас, когда генерируется предложение из одного слова. Кроме того, стоит прикрутить pymorphy, но с более высокими n: вариантов всё равно не так много, как слов.