Skip to content

Instantly share code, notes, and snippets.

@mattn
Created October 18, 2017 03:50
Show Gist options
  • Save mattn/bc0f8009350c3db5ea1c3ccd0a6e0dbf to your computer and use it in GitHub Desktop.
Save mattn/bc0f8009350c3db5ea1c3ccd0a6e0dbf to your computer and use it in GitHub Desktop.
package main
import (
"fmt"
"log"
"math/rand"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/ikawaha/kagome/tokenizer"
)
// reIgnoreText matches the bracket and quotation characters that are removed
// from input text before it is tokenized.
var reIgnoreText = regexp.MustCompile(`[\[\]「」『』()]`)
// Markov is a second-order word chain: for every pair of consecutive words it
// remembers which words have been observed to follow that pair.
type Markov struct {
	// tbl is keyed by the first word, then by the second word; the value is
	// the list of third words seen after that pair (duplicates are kept).
	tbl map[string]map[string][]string
}

// NewMarkov returns a Markov with an empty, ready-to-write table.
func NewMarkov() *Markov {
	m := new(Markov)
	m.tbl = map[string]map[string][]string{}
	return m
}
// Update strips the characters matched by reIgnoreText from text, tokenizes
// the result, and records every run of three consecutive words in the table
// as first -> second -> [third...].
//
// Texts that yield fewer than three words add nothing to the table.
func (m *Markov) Update(text string) {
	cleaned := reIgnoreText.ReplaceAllString(text, "")

	// NOTE(review): only tokens whose surface is "EOS" are dropped; any other
	// marker tokens the tokenizer may emit are kept as words — confirm this is
	// intended.
	var words []string
	for _, tok := range tokenizer.New().Tokenize(cleaned) {
		if tok.Surface != "EOS" {
			words = append(words, tok.Surface)
		}
	}

	// Slide a three-word window over the sequence; i indexes the third word.
	for i := 2; i < len(words); i++ {
		w1, w2, w3 := words[i-2], words[i-1], words[i]
		inner, found := m.tbl[w1]
		if !found {
			inner = map[string][]string{}
			m.tbl[w1] = inner
		}
		inner[w2] = append(inner[w2], w3)
	}
}
// First returns a uniformly random first-position word from the table.
//
// It returns the empty string when the table holds no words, rather than
// panicking on a modulo by zero.
func (m *Markov) First() string {
	if len(m.tbl) == 0 {
		return ""
	}
	keys := make([]string, 0, len(m.tbl))
	for k := range m.tbl {
		keys = append(keys, k)
	}
	return keys[rand.Intn(len(keys))]
}
// Chain generates text starting from the word first.
//
// A second word is picked at random from those recorded after first; from
// then on each next word is picked at random from the words recorded after
// the current (first, second) pair, and the pair slides forward by one word.
// The words are concatenated without separators.
//
// If no second word is recorded for first (including an empty table), first
// is returned unchanged instead of panicking on a modulo by zero.
//
// NOTE(review): the walk stops only when it reaches a pair with no recorded
// follower (or an empty word). A table in which every reachable pair has a
// follower would keep this loop running indefinitely.
func (m *Markov) Chain(first string) string {
	seconds := m.tbl[first]
	if len(seconds) == 0 {
		return first
	}

	keys := make([]string, 0, len(seconds))
	for k := range seconds {
		keys = append(keys, k)
	}
	second := keys[rand.Intn(len(keys))]

	// Builder avoids re-copying the whole text on every appended word.
	var text strings.Builder
	text.WriteString(first)
	text.WriteString(second)

	for second != "" {
		// Indexing a missing first-level key yields a nil map, and indexing
		// that yields a nil slice, so unknown pairs simply end the walk.
		followers := m.tbl[first][second]
		if len(followers) == 0 {
			break
		}
		next := followers[rand.Intn(len(followers))]
		text.WriteString(next)
		first, second = second, next
	}
	return text.String()
}
// main downloads a fixed page, feeds the trimmed text of every element
// matching ".headline" into a Markov table, and prints one generated line.
func main() {
	rand.Seed(time.Now().UnixNano())

	doc, err := goquery.NewDocument("https://fushinsha-joho.co.jp/serif.cgi")
	if err != nil {
		log.Fatal(err)
	}

	m := NewMarkov()
	doc.Find(".headline").Each(func(i int, s *goquery.Selection) {
		m.Update(strings.TrimSpace(s.Text()))
	})

	// With no recorded word triples there is no starting word to generate
	// from, so stop with a clear message instead of a runtime panic.
	if len(m.tbl) == 0 {
		log.Fatal("no headline text found to build the Markov table")
	}

	fmt.Println(m.Chain(m.First()))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment