alexeyproskuryakov · December 19, 2012 21:35 · alexeyproskuryakov · Dec 19, 2012
diff --git a/Sentence_generator b/Sentence_generator
 #coding:utf-8
 import re
 from random import uniform
 from collections import defaultdict
 from pymongo import MongoClient, ASCENDING

 # Tokens found in this string are glued to the preceding text without a
 # space by generate_sentence; '$' is the sentence-boundary marker used in
 # the n-gram keys.  This is a plain string, so `in` is a substring test.
 sent_end = '.!?,;:$'
 # Punctuation marks that separate parts of a sentence.
 comma = ',;:'
 # A token is either a run of Latin/Cyrillic letters, digits and hyphens,
 # or a run of punctuation marks.
 r_alphabet = re.compile(
    u'[a-zA-Zа-яёА-ЯЁ0-9-]+|[.,:;?!]+'
 )

 __doc__ = """

 """

 class model(object):
    """Trigram store backed by a MongoDB collection.

    Every document of the ``gramms`` collection has the shape
    ``{'1': t0, '2': t1, 'sib': [{'3': t2, 'w': weight}, ...]}``: a bigram
    ``(t0, t1)`` together with the tokens that may follow it.
    """

    def __init__(self, host='localhost', port=27017, db_name='sentences'):
        """Open the ``gramms`` collection and create its bigram index if needed.

        :param host: MongoDB host name.
        :param port: MongoDB port.
        :param db_name: name of the database holding ``gramms``.
        """
        db = MongoClient(host, port)[db_name]
        self.gramms = db['gramms']
        # At most one index listed: presumably only the default one exists,
        # so the unique (t0, t1) index still has to be created.
        if len(self.gramms.index_information()) <= 1:
            self.gramms.ensure_index([('1', ASCENDING), ('2', ASCENDING)], unique=True)

    def init_sub_model(self, words):
        """Build an in-memory sub-model from the bigrams related to *words*.

        :param words: iterable of tokens; every stored bigram in which a word
            is the first token, the second token or a follower is loaded.
        :returns: dict mapping ``(t0, t1)`` to a list of
            ``{'3': token, 'w': weight}`` followers.
        :raises Exception: when the start bigram ``('$', '$')`` was not loaded
            or no kept follower is listed in ``sent_end``.
        """
        sub_model = {}
        for word in words:
            query = {'$or': [{'1': word}, {'2': word}, {'sib.3': word}]}
            for el in self.gramms.find(query):
                sub_model[(el['1'], el['2'])] = el['sib']

        res_sub_model = {}
        is_good_data = False
        # .items() (not .iteritems()) keeps this working on Python 2 and 3.
        for key, followers in sub_model.items():
            for sib_el in followers:
                # Keep a follower only when the bigram it leads to was loaded
                # as well, so a walk over the result never hits a dead key.
                next_key = (key[1], sib_el['3'])
                if next_key not in sub_model:
                    continue
                res_sub_model.setdefault(key, []).append(sib_el)

                # Only the first follower found in sent_end receives an
                # explicit terminating transition.
                # NOTE(review): later ones get none - confirm this is intended.
                if sib_el['3'] in sent_end and not is_good_data:
                    is_good_data = True
                    res_sub_model[sib_el['3'], '$'] = [{'3': '$', 'w': 1}]

        # NOTE(review): the start bigram is checked in the loaded data, not in
        # the filtered result that is returned - confirm this is intended.
        if ('$', '$') not in sub_model or not is_good_data:
            raise Exception('bad train data for this words')

        return res_sub_model

    def get_seq(self, t0, t1):
        """Return the follower list stored for bigram ``(t0, t1)``.

        :returns: list of ``{'3': token, 'w': weight}`` dicts, or ``None``
            when the bigram is not stored.
        """
        res = self.gramms.find_one({'1': t0, '2': t1})
        if res:
            return res['sib']
        return None

    def add_to_seq(self, t0, t1, token, weight):
        """Append follower *token* with *weight* to bigram ``(t0, t1)``.

        A single upsert keyed on the bigram replaces the former
        find-then-write sequence: the document is created when missing and
        extended otherwise, in one atomic server-side operation.
        """
        self.gramms.update(
            {'1': t0, '2': t1},
            {'$push': {'sib': {'3': token, 'w': weight}}},
            upsert=True
        )


 def gen_lines(corpus):
    """Yield the lines of a UTF-8 encoded text file, lower-cased.

    :param corpus: path of the corpus file.
    :returns: generator of unicode lines; the trailing newline is kept.
    """
    # io.open decodes while reading and behaves the same on Python 2 and 3.
    import io
    # The with-block closes the file when the generator is exhausted or closed.
    with io.open(corpus, encoding='utf-8') as data:
        for line in data:
            yield line.lower()


 def gen_tokens(lines):
    """Split every line into tokens and yield them one at a time.

    A token is whatever the module-level ``r_alphabet`` pattern matches;
    tokens come out in order of appearance.
    """
    for text in lines:
        found = r_alphabet.findall(text)
        for tok in found:
            yield tok


 def gen_trigrams(tokens):
    """Yield ``(t0, t1, t2)`` trigrams over a stream of tokens.

    Each sentence starts from the padding bigram ``('$', '$')``.  After a
    terminating token two closing trigrams ``(t1, t2, '$')`` and
    ``(t2, '$', '$')`` are emitted and the context is reset.

    :param tokens: iterable of token strings.
    """
    terminators = '.!?'
    t0, t1 = '$', '$'
    for t2 in tokens:
        yield t0, t1, t2
        # A token ends the sentence when it consists solely of terminator
        # characters.  The former substring test (`t2 in '.!?'`) missed
        # tokens such as '...', '?!' or '??'.
        if t2 and all(ch in terminators for ch in t2):
            yield t1, t2, '$'
            yield t2, '$', '$'
            t0, t1 = '$', '$'
        else:
            t0, t1 = t1, t2


 def train(corpus):
    """Count the trigrams of a corpus file and store them in a new model.

    Each stored weight is the trigram count divided by the count of its
    leading bigram.

    :param corpus: path of the corpus file.
    :returns: the ``model`` instance the trigrams were written to.
    """
    trigram_stream = gen_trigrams(gen_tokens(gen_lines(corpus)))

    bigram_count = defaultdict(float)
    trigram_count = defaultdict(float)

    for first, second, third in trigram_stream:
        bigram_count[first, second] += 1
        trigram_count[first, second, third] += 1

    storage = model()
    for trigram, freq in trigram_count.iteritems():
        first, second, third = trigram
        weight = freq / bigram_count[first, second]
        storage.add_to_seq(first, second, third, weight)
    return storage


 def generate_sentence(model):
    """Generate one random sentence by walking a trigram mapping.

    Starting from the bigram ``('$', '$')``, the next token is drawn with
    ``unirand`` until ``'$'`` is drawn, nothing can be drawn, or the current
    bigram is not in the mapping.  The partial phrase is printed after every
    appended token.

    :param model: mapping ``(t0, t1) -> [{'3': token, 'w': weight}, ...]``.
    :returns: the generated phrase, capitalized ('' when nothing was drawn).
    """
    phrase = ''
    t0, t1 = '$', '$'
    while (t0, t1) in model:
        following = unirand(model[t0, t1])
        # None means nothing could be drawn; '$' is the sentence end.
        if following is None or following == '$':
            break
        t0, t1 = t1, following
        # Test the first character so that multi-character punctuation tokens
        # ('...', '?!') attach without a space, like single marks do.
        if t1[:1] in sent_end or t0 == '$':
            phrase += t1
        else:
            phrase += ' ' + t1
        print(phrase)
    return phrase.capitalize()


 def unirand(seq):
    """Draw a follower token with probability proportional to its weight.

    :param seq: list of ``{'3': token, 'w': weight}`` dicts.
    :returns: the drawn token, or ``None`` when *seq* is empty or its total
        weight is not positive.
    """
    total = sum(el['w'] for el in seq)
    if not seq or total <= 0:
        return None
    # One draw over the whole weight range (previously re-drawn per element).
    rnd = uniform(0, total)
    running = 0
    for el in seq:
        running += el['w']
        if rnd < running:
            return el['3']
    # Floating-point rounding can leave rnd equal to the total; fall back to
    # the last element rather than returning None.
    return seq[-1]['3']

 if __name__ == '__main__':
    # To fill the database from a corpus first, call for example:
    #     train('../data/data.txt')

    # The instance is named `storage`, not `model`: rebinding the class name
    # would break a later train() call, which instantiates model() itself.
    storage = model()
    words = [u'любовь', u'деньги', u'я', u'шеф']
    sub_model = storage.init_sub_model(words)
    print(generate_sentence(sub_model))
	#coding:utf-8
	import re
	from random import uniform
	from collections import defaultdict
	from pymongo import MongoClient, ASCENDING

	# Tokens found in this string are glued to the preceding text without a
	# space by generate_sentence; '$' is the sentence-boundary marker used in
	# the n-gram keys.  This is a plain string, so `in` is a substring test.
	sent_end = '.!?,;:$'
	# Punctuation marks that separate parts of a sentence.
	comma = ',;:'
	# A token is either a run of Latin/Cyrillic letters, digits and hyphens,
	# or a run of punctuation marks.  The '|' must be a bare alternation: an
	# escaped '\|' would demand a literal pipe between the two runs.
	r_alphabet = re.compile(u'[a-zA-Zа-яёА-ЯЁ0-9-]+|[.,:;?!]+')

	__doc__ = """

	"""

	class model(object):
	    """Trigram store backed by a MongoDB collection.

	    Every document of the ``gramms`` collection has the shape
	    ``{'1': t0, '2': t1, 'sib': [{'3': t2, 'w': weight}, ...]}``: a bigram
	    ``(t0, t1)`` together with the tokens that may follow it.
	    """

	    def __init__(self, host='localhost', port=27017, db_name='sentences'):
	        """Open the ``gramms`` collection and create its bigram index if needed.

	        :param host: MongoDB host name.
	        :param port: MongoDB port.
	        :param db_name: name of the database holding ``gramms``.
	        """
	        db = MongoClient(host, port)[db_name]
	        self.gramms = db['gramms']
	        # At most one index listed: presumably only the default one exists,
	        # so the unique (t0, t1) index still has to be created.
	        if len(self.gramms.index_information()) <= 1:
	            self.gramms.ensure_index([('1', ASCENDING), ('2', ASCENDING)], unique=True)

	    def init_sub_model(self, words):
	        """Build an in-memory sub-model from the bigrams related to *words*.

	        :param words: iterable of tokens; every stored bigram in which a word
	            is the first token, the second token or a follower is loaded.
	        :returns: dict mapping ``(t0, t1)`` to a list of
	            ``{'3': token, 'w': weight}`` followers.
	        :raises Exception: when the start bigram ``('$', '$')`` was not loaded
	            or no kept follower is listed in ``sent_end``.
	        """
	        sub_model = {}
	        for word in words:
	            query = {'$or': [{'1': word}, {'2': word}, {'sib.3': word}]}
	            for el in self.gramms.find(query):
	                sub_model[(el['1'], el['2'])] = el['sib']

	        res_sub_model = {}
	        is_good_data = False
	        # .items() (not .iteritems()) keeps this working on Python 2 and 3.
	        for key, followers in sub_model.items():
	            for sib_el in followers:
	                # Keep a follower only when the bigram it leads to was loaded
	                # as well, so a walk over the result never hits a dead key.
	                next_key = (key[1], sib_el['3'])
	                if next_key not in sub_model:
	                    continue
	                res_sub_model.setdefault(key, []).append(sib_el)

	                # Only the first follower found in sent_end receives an
	                # explicit terminating transition.
	                # NOTE(review): later ones get none - confirm this is intended.
	                if sib_el['3'] in sent_end and not is_good_data:
	                    is_good_data = True
	                    res_sub_model[sib_el['3'], '$'] = [{'3': '$', 'w': 1}]

	        # NOTE(review): the start bigram is checked in the loaded data, not in
	        # the filtered result that is returned - confirm this is intended.
	        if ('$', '$') not in sub_model or not is_good_data:
	            raise Exception('bad train data for this words')

	        return res_sub_model

	    def get_seq(self, t0, t1):
	        """Return the follower list stored for bigram ``(t0, t1)``.

	        :returns: list of ``{'3': token, 'w': weight}`` dicts, or ``None``
	            when the bigram is not stored.
	        """
	        res = self.gramms.find_one({'1': t0, '2': t1})
	        if res:
	            return res['sib']
	        return None

	    def add_to_seq(self, t0, t1, token, weight):
	        """Append follower *token* with *weight* to bigram ``(t0, t1)``.

	        A single upsert keyed on the bigram replaces the former
	        find-then-write sequence: the document is created when missing and
	        extended otherwise, in one atomic server-side operation.
	        """
	        self.gramms.update(
	            {'1': t0, '2': t1},
	            {'$push': {'sib': {'3': token, 'w': weight}}},
	            upsert=True
	        )


	def gen_lines(corpus):
	    """Yield the lines of a UTF-8 encoded text file, lower-cased.

	    :param corpus: path of the corpus file.
	    :returns: generator of unicode lines; the trailing newline is kept.
	    """
	    # io.open decodes while reading and behaves the same on Python 2 and 3.
	    import io
	    # The with-block closes the file when the generator is exhausted or closed.
	    with io.open(corpus, encoding='utf-8') as data:
	        for line in data:
	            yield line.lower()


	def gen_tokens(lines):
	    """Split every line into tokens and yield them one at a time.

	    A token is whatever the module-level ``r_alphabet`` pattern matches;
	    tokens come out in order of appearance.
	    """
	    for text in lines:
	        found = r_alphabet.findall(text)
	        for tok in found:
	            yield tok


	def gen_trigrams(tokens):
	    """Yield ``(t0, t1, t2)`` trigrams over a stream of tokens.

	    Each sentence starts from the padding bigram ``('$', '$')``.  After a
	    terminating token two closing trigrams ``(t1, t2, '$')`` and
	    ``(t2, '$', '$')`` are emitted and the context is reset.

	    :param tokens: iterable of token strings.
	    """
	    terminators = '.!?'
	    t0, t1 = '$', '$'
	    for t2 in tokens:
	        yield t0, t1, t2
	        # A token ends the sentence when it consists solely of terminator
	        # characters.  The former substring test (`t2 in '.!?'`) missed
	        # tokens such as '...', '?!' or '??'.
	        if t2 and all(ch in terminators for ch in t2):
	            yield t1, t2, '$'
	            yield t2, '$', '$'
	            t0, t1 = '$', '$'
	        else:
	            t0, t1 = t1, t2


	def train(corpus):
	    """Count the trigrams of a corpus file and store them in a new model.

	    Each stored weight is the trigram count divided by the count of its
	    leading bigram.

	    :param corpus: path of the corpus file.
	    :returns: the ``model`` instance the trigrams were written to.
	    """
	    trigram_stream = gen_trigrams(gen_tokens(gen_lines(corpus)))

	    bigram_count = defaultdict(float)
	    trigram_count = defaultdict(float)

	    for first, second, third in trigram_stream:
	        bigram_count[first, second] += 1
	        trigram_count[first, second, third] += 1

	    storage = model()
	    for trigram, freq in trigram_count.iteritems():
	        first, second, third = trigram
	        weight = freq / bigram_count[first, second]
	        storage.add_to_seq(first, second, third, weight)
	    return storage


	def generate_sentence(model):
	    """Generate one random sentence by walking a trigram mapping.

	    Starting from the bigram ``('$', '$')``, the next token is drawn with
	    ``unirand`` until ``'$'`` is drawn, nothing can be drawn, or the current
	    bigram is not in the mapping.  The partial phrase is printed after every
	    appended token.

	    :param model: mapping ``(t0, t1) -> [{'3': token, 'w': weight}, ...]``.
	    :returns: the generated phrase, capitalized ('' when nothing was drawn).
	    """
	    phrase = ''
	    t0, t1 = '$', '$'
	    while (t0, t1) in model:
	        following = unirand(model[t0, t1])
	        # None means nothing could be drawn; '$' is the sentence end.
	        if following is None or following == '$':
	            break
	        t0, t1 = t1, following
	        # Test the first character so that multi-character punctuation tokens
	        # ('...', '?!') attach without a space, like single marks do.
	        if t1[:1] in sent_end or t0 == '$':
	            phrase += t1
	        else:
	            phrase += ' ' + t1
	        print(phrase)
	    return phrase.capitalize()


	def unirand(seq):
	    """Draw a follower token with probability proportional to its weight.

	    :param seq: list of ``{'3': token, 'w': weight}`` dicts.
	    :returns: the drawn token, or ``None`` when *seq* is empty or its total
	        weight is not positive.
	    """
	    total = sum(el['w'] for el in seq)
	    if not seq or total <= 0:
	        return None
	    # One draw over the whole weight range (previously re-drawn per element).
	    rnd = uniform(0, total)
	    running = 0
	    for el in seq:
	        running += el['w']
	        if rnd < running:
	            return el['3']
	    # Floating-point rounding can leave rnd equal to the total; fall back to
	    # the last element rather than returning None.
	    return seq[-1]['3']

	if __name__ == '__main__':
	    # To fill the database from a corpus first, call for example:
	    #     train('../data/data.txt')

	    # The instance is named `storage`, not `model`: rebinding the class name
	    # would break a later train() call, which instantiates model() itself.
	    storage = model()
	    words = [u'любовь', u'деньги', u'я', u'шеф']
	    sub_model = storage.init_sub_model(words)
	    print(generate_sentence(sub_model))