Skip to content

Instantly share code, notes, and snippets.

@kariyayo
Created February 4, 2019 11:10
Show Gist options
  • Select an option

  • Save kariyayo/428f60519b1c2d0d76eda6a007fe92be to your computer and use it in GitHub Desktop.

Select an option

Save kariyayo/428f60519b1c2d0d76eda6a007fe92be to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strings"
mecab "github.com/shogo82148/go-mecab"
)
// main counts how often selected nouns occur in the wiki_* files found under
// the directory given as the first command-line argument, then prints every
// token together with its count to standard output.
//
// Progress messages are written to standard error. A failure while creating
// the tagger, walking the tree, reading a file, or tokenizing ends the program
// with a panic.
func main() {
	if len(os.Args) < 2 {
		fmt.Println("Usage: go run word_frequency.go INPUT_DIR")
		return
	}
	inputDir := os.Args[1]

	// Upper bound for a single input line; the bufio.Scanner default of 64 KiB
	// would make long lines fail with bufio.ErrTooLong.
	const maxLineBytes = 16 * 1024 * 1024

	frequency := make(map[string]int)
	var countProcessed int

	tagger, err := mecab.New(map[string]string{"output-format-type": "wakati"})
	if err != nil {
		panic(err)
	}
	defer tagger.Destroy()
	// avoid GC problem with MeCab 0.996 (see https://github.com/taku910/mecab/pull/24)
	tagger.Parse("")

	err = filepath.Walk(inputDir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		// Only regular entries named wiki_* are processed; directories are
		// still descended into by Walk.
		if info.IsDir() || !strings.HasPrefix(info.Name(), "wiki_") {
			return nil
		}

		// Use the path supplied by Walk: it is already the full path to the
		// entry, including any intermediate directories and separators.
		fmt.Fprintf(os.Stderr, "Processing %v...\n", path)
		file, err := os.Open(path)
		if err != nil {
			return err
		}
		// The callback runs once per entry, so the file is closed when this
		// entry has been processed.
		defer file.Close()

		scanner := bufio.NewScanner(file)
		scanner.Buffer(make([]byte, 0, 64*1024), maxLineBytes)
		for {
			// extractContent returns "" once the scanner is exhausted.
			// NOTE: it also returns "" for a document without any text, which
			// ends processing of the current file early.
			content := extractContent(scanner)
			if content == "" {
				break
			}
			tokens, err := getTokens(&tagger, content)
			if err != nil {
				return err
			}
			for _, token := range tokens {
				frequency[token]++
			}
			countProcessed++
			if countProcessed%10000 == 0 {
				fmt.Fprintf(os.Stderr, "%v documents were processed.\n", countProcessed)
			}
		}
		// Scan reports both end of input and failures as "no more lines";
		// surface real read errors instead of silently truncating the file.
		if err := scanner.Err(); err != nil {
			return fmt.Errorf("reading %s: %v", path, err)
		}
		return nil
	})
	if err != nil {
		panic(err)
	}

	for token, count := range frequency {
		fmt.Println(token, count)
	}
}
// extractContent reads lines from scanner until it meets a line starting with
// "</doc>" and returns the text collected so far, concatenated without any
// separator. A line starting with "<doc " discards whatever was collected
// before it. When the scanner runs out of lines before a closing line is seen,
// the empty string is returned.
func extractContent(scanner *bufio.Scanner) string {
	var body strings.Builder
	for scanner.Scan() {
		text := scanner.Text()
		switch {
		case strings.HasPrefix(text, "<doc "):
			// A new document begins: drop anything gathered before it.
			body.Reset()
		case strings.HasPrefix(text, "</doc>"):
			return body.String()
		default:
			body.WriteString(text)
		}
	}
	return ""
}
// subCates lists the noun sub-categories (the second feature field) that
// getTokens keeps: "固有名詞" (proper noun) and "一般" (general).
var subCates = []string{"固有名詞", "一般"}
// getTokens parses content with tagger and returns the surface forms of the
// nodes whose feature string has "名詞" (noun) as its first comma-separated
// field and one of the values in subCates as its second field.
//
// When parsing fails it returns a nil slice together with the error. Nodes
// whose feature string has fewer than two fields are skipped.
func getTokens(tagger *mecab.MeCab, content string) ([]string, error) {
	node, err := tagger.ParseToNode(content)
	if err != nil {
		return nil, err
	}

	var results []string
	for ; !node.IsZero(); node = node.Next() {
		// Only the first two fields are inspected, so stop splitting after them.
		fields := strings.SplitN(node.Feature(), ",", 3)
		if len(fields) < 2 {
			// A short feature string would otherwise panic on fields[1].
			continue
		}
		if fields[0] == "名詞" && contains(subCates, fields[1]) {
			results = append(results, node.Surface())
		}
	}
	return results, nil
}
// contains reports whether target is equal to at least one element of ss.
// It returns false for a nil or empty slice.
func contains(ss []string, target string) bool {
	for i := 0; i < len(ss); i++ {
		if ss[i] != target {
			continue
		}
		return true
	}
	return false
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment