Created
November 4, 2015 12:38
-
-
Save quarnster/3d9e8d590b27cf15c494 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Download http://downloads.tatoeba.org/exports/sentences.tar.bz2 and
// http://downloads.tatoeba.org/exports/links.tar.bz2 to this directory.
//
// Then run "go run tatoeba.go".
//
// Tweak "havestring" in main to change the set of 汉字 (Chinese characters)
// that count as already learned.
package main | |
import ( | |
"archive/tar" | |
"compress/bzip2" | |
"encoding/gob" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"os" | |
"sort" | |
"strings" | |
) | |
// runes is a set of runes kept in a slice. Index and Contains use binary
// search, so their answers are only meaningful while the slice is in
// ascending order. The Len/Less/Swap methods let sort.Sort establish that
// order.
type runes []rune

// Len returns the number of runes in the slice.
func (r runes) Len() int { return len(r) }

// Less reports whether the rune at position i is smaller than the one at j.
func (r runes) Less(i, j int) bool { return r[i] < r[j] }

// Swap exchanges the runes at positions i and j.
func (r runes) Swap(i, j int) {
	tmp := r[i]
	r[i] = r[j]
	r[j] = tmp
}

// Index returns the first position whose rune is greater than or equal to
// r2, or len(r) when every rune is smaller. It does not say whether r2 is
// actually present; use Contains for that.
func (r runes) Index(r2 rune) int {
	atOrPast := func(i int) bool { return r[i] >= r2 }
	return sort.Search(len(r), atOrPast)
}

// Contains reports whether r2 is a member of the sorted slice.
func (r runes) Contains(r2 rune) bool {
	pos := r.Index(r2)
	return pos < len(r) && r[pos] == r2
}
// skiprunes is the set of runes that are ignored: (*runes).Add refuses to
// insert them and main skips over them when checking a sentence. The literal
// holds ASCII letters and digits plus punctuation and assorted symbols; it is
// not in sorted order as written (and repeats some runes), so Contains is only
// reliable after init has sorted it.
// NOTE(review): the trailing " | |" on the lines below looks like table-cell
// residue from a web export and is not valid Go - confirm and strip.
var skiprunes = runes("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,.!?()!,·。?「」 “”﹣\":;、-/\\…≦≧▽《》﹐();⋯"Q:ßí⋯ 『』%%+[]_«»àáèìòóùāęłǎαβа—‘’‧→-013568& ₣") | |
// init sorts skiprunes in place so that the binary search behind
// Index/Contains works on it.
func init() { | |
sort.Sort(skiprunes) | |
} | |
func (r *runes) Add(r2 rune) { | |
if skiprunes.Contains(r2) { | |
return | |
} | |
i := r.Index(r2) | |
if i >= len(*r) || (*r)[i] != r2 { | |
end := append([]rune{r2}, (*r)[i:]...) | |
*r = append((*r)[:i], end...) | |
} | |
} | |
// loadBzip2 opens the bzip2-compressed tar archive fn and returns the
// contents of its first entry; any further entries are ignored.
//
// Every failure is wrapped with %w so callers can still inspect the cause
// (for example with errors.Is(err, os.ErrNotExist) when the archive is
// missing). When the archive cannot be opened, the message additionally
// carries a hint about where to download it. data is nil whenever err is
// non-nil.
func loadBzip2(fn string) (data []byte, err error) {
	f, err := os.Open(fn)
	if err != nil {
		return nil, fmt.Errorf("%w\nplease download http://downloads.tatoeba.org/exports/%s to this directory", err, fn)
	}
	defer f.Close()

	tr := tar.NewReader(bzip2.NewReader(f))
	// Position the reader on the first (and only expected) archive member.
	if _, err = tr.Next(); err != nil {
		return nil, fmt.Errorf("reading first tar entry of %s: %w", fn, err)
	}

	data, err = ioutil.ReadAll(tr)
	if err != nil {
		return nil, fmt.Errorf("decompressing %s: %w", fn, err)
	}
	return data, nil
}
func load(fn string) (data []byte, err error) { | |
data, err = ioutil.ReadFile(fn + ".csv") | |
if err != nil { | |
data, err = loadBzip2(fn + ".tar.bz2") | |
if err == nil { | |
ioutil.WriteFile(fn+".csv", data, 0644) | |
} | |
return | |
} | |
return | |
} | |
// load_cached_pairs reads the gob-encoded map stored in "pairs.gob" in the
// current directory. It returns a nil map and the error when the file cannot
// be opened or decoded.
func load_cached_pairs() (map[string]string, error) {
	cache, err := os.Open("pairs.gob")
	if err != nil {
		return nil, err
	}
	defer cache.Close()

	pairs := map[string]string{}
	if err := gob.NewDecoder(cache).Decode(&pairs); err != nil {
		return nil, err
	}
	return pairs, nil
}
func load_pairs() (map[string]string, error) { | |
ret, err := load_cached_pairs() | |
if err == nil { | |
return ret, nil | |
} | |
var ( | |
english = make(map[string]string) | |
chinese = make(map[string]string) | |
cn_en_pairs = make(map[string]string) | |
) | |
log.Println("load") | |
data, err := load("sentences") | |
if err != nil { | |
return nil, err | |
} | |
log.Println("split") | |
lines := strings.Split(string(data), "\n") | |
log.Println("proc") | |
for _, line := range lines { | |
fields := strings.Split(line, "\t") | |
if len(fields) != 3 { | |
continue | |
} | |
switch fields[1] { | |
case "eng": | |
english[fields[0]] = fields[2] | |
case "cmn": | |
chinese[fields[0]] = fields[2] | |
} | |
} | |
log.Println("loading links") | |
data, err = load("links") | |
if err != nil { | |
return nil, err | |
} | |
log.Println("processing links") | |
for _, line := range strings.Split(string(data), "\n") { | |
fields := strings.Split(line, "\t") | |
if len(fields) != 2 { | |
continue | |
} | |
c, ok := chinese[fields[0]] | |
if !ok { | |
continue | |
} | |
e, ok := english[fields[1]] | |
if !ok { | |
continue | |
} | |
cn_en_pairs[c] = e | |
} | |
log.Printf("have %d sentence pairs", len(cn_en_pairs)) | |
f, err := os.Create("pairs.gob") | |
if err != nil { | |
return nil, err | |
} | |
defer f.Close() | |
enc := gob.NewEncoder(f) | |
enc.Encode(cn_en_pairs) | |
return cn_en_pairs, nil | |
} | |
func main() { | |
pairs, err := load_pairs() | |
if err != nil { | |
log.Fatalln(err) | |
} | |
havestring := "你好你好我叫呢名字什么很高兴认识龙猫鱼不一是吗的条只二三四五六七八九十二十十二百两千万在家人有哪儿和也家人北京火车火车站后面住今年今年多大多大了几岁口爸爸她哪个我们医生去学学习学生学校同学大学北大太医院妈妈他个做工作汉语老师小下面电视机前面买东西睡觉里这儿找它前后电视电脑下对不起上午没看看见见椅子上卧室桌子银行网上用对卡绿皮书打电话给喂叔叔请问谢谢客气不客气等一下等一下会说想下午今天电影星期日明天星期日没关系再见现在写能书中文本汉字谁这样啊哦什么时候要可以吧怎么吃几个儿子天气冷下雨怎么样坐出租车苹果中午时候回来回来回家饭馆点几点那商店些请吃饭衣服钱喜欢爱漂亮这白件色贵那个哪您好零这个块多少都朋友位美国人八月欢迎热月上个月下个月昨天女儿中国觉得嗨美国飞机半刻但但是喝菜还听说好吃还是送水果听米饭杯子为什么茶水雪碧可口可乐那里" | |
var haverunes runes | |
for _, r := range []rune(havestring) { | |
haverunes.Add(r) | |
} | |
var sentencecount = 0 | |
outer: | |
for cn, en := range pairs { | |
for _, r := range []rune(cn) { | |
if skiprunes.Contains(r) { | |
continue | |
} | |
if !haverunes.Contains(r) { | |
continue outer | |
} | |
} | |
log.Printf("You can say: %s: %s", cn, en) | |
sentencecount++ | |
} | |
log.Printf("A total of %d sentences", sentencecount) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment