Created
October 4, 2015 14:32
-
-
Save llibra/1973edabad62b6886760 to your computer and use it in GitHub Desktop.
Extract nouns from Japanese text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns kuro.core | |
(:import (com.atilika.kuromoji.ipadic Token Tokenizer)) | |
(:gen-class)) | |
(defn noun?
  "Returns true when the first entry of the token's :part-of-speech
  sequence is \"名詞\", false otherwise (including when the key is absent)."
  [token]
  (= "名詞" (-> token :part-of-speech first)))
(defn ->part-of-speech
  "Collects the four part-of-speech levels of a token, in order from
  level 1 to level 4, and drops every level whose value equals \"*\".
  Returns a lazy sequence of the remaining values."
  [token]
  ;; A plain vector literal replaces the former syntax-quote/unquote
  ;; template: syntax-quote is meant for building code forms in macros,
  ;; not for assembling runtime data.
  (remove #(= % "*")
          [(.getPartOfSpeechLevel1 token)
           (.getPartOfSpeechLevel2 token)
           (.getPartOfSpeechLevel3 token)
           (.getPartOfSpeechLevel4 token)]))
(defn ->token
  "Converts a tokenizer result object into a plain map with two keys:
  :surface (the value of .getSurface) and :part-of-speech (the value
  returned by ->part-of-speech for the same object)."
  [obj]
  (let [surface (.getSurface obj)
        pos     (->part-of-speech obj)]
    {:surface        surface
     :part-of-speech pos}))
;; Shared tokenizer, created on first use. Previously a new Tokenizer was
;; constructed on every call, repeating its setup work for each text.
;; NOTE(review): this relies on Tokenizer.tokenize being safe to call from
;; several threads on one instance — confirm against the Kuromoji Javadoc
;; for the version in use.
(def ^:private shared-tokenizer
  (delay (Tokenizer.)))

(defn morphological-analysis
  "Tokenizes text with the Kuromoji IPADIC tokenizer and returns a lazy
  sequence of maps as produced by ->token, in token order."
  [text]
  ;; The type hint keeps the .tokenize call non-reflective, as it was when
  ;; the tokenizer was constructed inline.
  (let [^Tokenizer tokenizer @shared-tokenizer]
    (map ->token (.tokenize tokenizer text))))
(defn extract-nouns
  "Returns a lazy sequence of the :surface values of every token in text
  for which noun? is true, in the order the tokens occur."
  [text]
  (->> (morphological-analysis text)
       (filter noun?)
       (map :surface)))
(defn -main
  "Entry point. Prints the nouns extracted from each command-line
  argument, one result per line. With no arguments it analyses the
  built-in sample sentence, which matches the previous behaviour."
  [& args]
  ;; (seq args) is nil when no arguments were supplied, so `or` falls
  ;; through to the sample sentence.
  (doseq [text (or (seq args) ["お寿司が食べたい。"])]
    (println (extract-nouns text))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment