Created
February 4, 2019 11:10
-
-
Save kariyayo/428f60519b1c2d0d76eda6a007fe92be to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package main | |
| import ( | |
| "bufio" | |
| "fmt" | |
| "os" | |
| "path/filepath" | |
| "strings" | |
| mecab "github.com/shogo82148/go-mecab" | |
| ) | |
| func main() { | |
| if len(os.Args) < 2 { | |
| fmt.Println("Usage: go run word_frequency.go INPUT_DIR") | |
| return | |
| } | |
| inputDir := os.Args[1] | |
| frequency := make(map[string]int) | |
| var countProcessed int | |
| tagger, err := mecab.New(map[string]string{"output-format-type": "wakati"}) | |
| if err != nil { | |
| panic(err) | |
| } | |
| defer tagger.Destroy() | |
| // avoid GC problem with MeCab 0.996 (see https://github.com/taku910/mecab/pull/24) | |
| tagger.Parse("") | |
| err = filepath.Walk(inputDir, func(path string, info os.FileInfo, err error) error { | |
| if err != nil { | |
| return err | |
| } | |
| if strings.HasPrefix(info.Name(), "wiki_") { | |
| fmt.Fprintf(os.Stderr, "Processing %v...\n", inputDir+info.Name()) | |
| file, err := os.Open(inputDir + info.Name()) | |
| if err != nil { | |
| return err | |
| } | |
| defer file.Close() | |
| scanner := bufio.NewScanner(file) | |
| for { | |
| content := extractContent(scanner) | |
| if content == "" { | |
| break | |
| } | |
| tokens, err := getTokens(&tagger, content) | |
| if err != nil { | |
| return err | |
| } | |
| for _, token := range tokens { | |
| frequency[token] = frequency[token] + 1 | |
| } | |
| countProcessed += 1 | |
| if countProcessed % 10000 == 0 { | |
| fmt.Fprintf(os.Stderr, "%v documents were processed.\n", countProcessed) | |
| } | |
| } | |
| } | |
| return nil | |
| }) | |
| if err != nil { | |
| panic(err) | |
| } | |
| for token, count := range frequency { | |
| fmt.Println(token, count) | |
| } | |
| } | |
// extractContent reads lines from scanner until it has consumed one complete
// non-empty document and returns that document's body.
//
// A document starts at a line beginning with "<doc " and ends at a line
// beginning with "</doc>"; the lines in between are concatenated without any
// separator. Documents whose body is empty are skipped, so the empty string
// is returned only when the scanner has no more input. Scanner errors are not
// reported here; callers should check scanner.Err() afterwards.
func extractContent(scanner *bufio.Scanner) string {
	var body strings.Builder
	for scanner.Scan() {
		line := scanner.Text()
		switch {
		case strings.HasPrefix(line, "<doc "):
			// A new document begins: discard anything collected so far.
			body.Reset()
		case strings.HasPrefix(line, "</doc>"):
			if body.Len() > 0 {
				return body.String()
			}
			// Empty document: keep scanning rather than returning "",
			// which callers interpret as end of input.
			body.Reset()
		default:
			body.WriteString(line)
		}
	}
	return ""
}
| var subCates = []string{"固有名詞", "一般"} | |
| func getTokens(tagger *mecab.MeCab, content string) ([]string, error) { | |
| var results []string | |
| node, err := tagger.ParseToNode(content) | |
| if err != nil { | |
| return results, err | |
| } | |
| for ; !node.IsZero(); node = node.Next() { | |
| feature := node.Feature() | |
| categories := strings.Split(feature, ",") | |
| category := categories[0] | |
| subCategory := categories[1] | |
| if category == "名詞" && contains(subCates, subCategory) { | |
| results = append(results, node.Surface()) | |
| } | |
| } | |
| return results, nil | |
| } | |
// contains reports whether target is equal to at least one element of ss.
// A nil or empty ss never contains anything.
func contains(ss []string, target string) bool {
	found := false
	// Stop at the first match; later elements cannot change the answer.
	for i := 0; i < len(ss) && !found; i++ {
		found = ss[i] == target
	}
	return found
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment