Created
October 31, 2008 07:09
-
-
Save hitode909/21249 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/ruby
# -*- coding: utf-8 -*-
# A small experiment that generates text with a Markov chain.
# $KCODE only matters on Ruby 1.8, where it makes split(//) split on
# multibyte characters instead of bytes. Ruby 1.9+ ignores the assignment
# and prints a warning, so it is set only on interpreters that need it.
$KCODE = 'UTF8' if RUBY_VERSION < '1.9'
# Adds Array#choise.
class Array
  # Returns one element picked at random, or nil when the array is empty.
  # The receiver is not modified.
  def choise
    # Without this guard rand(0) would return a Float in [0, 1) and the
    # nil result would only come from the index being truncated to 0.
    return nil if empty?

    self[rand(size)]
  end
end
# Adds Hash#choise.
class Hash
  # Returns one key (not a value) picked at random, or nil when the hash is
  # empty. The pick itself is delegated to Array#choise on the key list.
  def choise
    return nil if empty?

    keys.choise
  end
end
# Module that builds character N-gram transition tables.
module Ngram
  # Expose the methods on the module itself (Ngram.create_table) while
  # keeping them available as public instance methods for `include Ngram`.
  extend self

  # Builds the table of which N-gram follows which.
  # str: String to analyse (CR and LF characters are removed first)
  # n: number of characters per token; must be at least 1
  # Returns: { token => [tokens seen immediately after that token] }
  #          Successors are listed once per occurrence, in text order.
  #          The table is empty when str holds fewer than n + 1 characters.
  # Raises ArgumentError when n is smaller than 1.
  def create_table(str, n)
    raise ArgumentError, 'n must be at least 1' if n < 1

    chars = str.gsub(/\n|\r/, '').split(//)
    table = {}
    # Slide an n-character window over the text, then pair every window
    # with the window that starts one character later.
    chars.each_cons(n).map(&:join).each_cons(2) do |token, following|
      (table[token] ||= []) << following
    end
    table
  end
end
# Module that generates text with a Markov chain over N-gram tokens.
module Marcov
  include Ngram
  # Make Marcov.generate_from_random callable directly on the module, without
  # first mixing it into Object; `include Marcov` keeps working as before.
  extend self

  # Generates text from the given String.
  # str: input String
  # n: the N of the N-gram (characters per token)
  # ssize: number of sentences to generate (sentence size)
  # texp: Regexp matching a sentence-ending character (terminal expression)
  # Returns the generated String; it is empty when the N-gram table built
  # from str is empty or when ssize is not positive.
  # Raises ArgumentError when no character that can be generated matches texp.
  def generate_from_random(str, n, ssize, texp)
    # Implicit receiver: create_table comes from the included Ngram module.
    index = create_table(str, n)
    from = lambda { index.choise }
    struct(index, from, ssize, texp)
  end

  private

  # Generates text from an already analysed table.
  # index: the transition table ({ token => [following tokens] })
  # from: callable that picks a start token (must return a key of index)
  # ssize: number of sentences to generate (sentence size)
  # texp: Regexp matching a sentence-ending character (terminal expression)
  # NOTE(review): the loop ends only once ssize terminal characters have been
  # emitted. The guard below rejects tables that contain no terminal at all,
  # but a table whose terminals are unreachable from the chosen start (a
  # cycle without dead ends) can still loop forever.
  def struct(index, from, ssize, texp)
    result = ''.dup
    # Nothing to walk: from.call would return nil for an empty table.
    return result if index.empty? || ssize <= 0

    unless terminal_token?(index, texp)
      raise ArgumentError, 'texp matches no character that can be generated'
    end

    cur = from.call
    sentence_counter = 0
    while sentence_counter < ssize
      # Each step contributes only the last character of the current token.
      char = cur.split(//).last
      result << char
      sentence_counter += 1 if char =~ texp
      # Reached the end of the source text (token has no successors):
      # quietly pick a new start point and continue from there.
      cur = from.call unless index.key?(cur)
      cur = index[cur].choise
    end
    result
  end

  # True when at least one token in the table (as a key or as a successor)
  # ends with a character matching texp.
  def terminal_token?(index, texp)
    index.any? do |token, successors|
      ([token] + successors).any? { |t| t.split(//).last =~ texp }
    end
  end
end
if $0 == __FILE__
  # Print 10 sentences generated with a 3-gram model.
  # Mix the generator into the top-level object; the module-level call below
  # relies on this when Marcov does not expose module-level methods itself.
  include Marcov
  str = ARGF.read
  ngram_size = 3
  num_sentence = 10
  texp = /。| /
  generated = Marcov.generate_from_random(str, ngram_size, num_sentence, texp)
  # Put every sentence on its own line, ending each with "。".
  puts generated.gsub(texp, "。\n")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment