Last active
May 24, 2020 14:13
-
-
Save k0f1sh/7e1507b50e261c215caaccdc7d98e2ff to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns gengo05 | |
(:require [clojure.java.io :as io] | |
[incanter.core :as i-core] | |
[incanter.charts :as i-charts] | |
[dorothy.core :as dot] | |
[dorothy.jvm :refer (render save! show!)])) | |
;; https://nlp100.github.io/ja/ch05.html | |
;; $ cat neko.txt | cabocha -f1 > neko.txt.cabocha | |
;; 40 | |
(defrecord Morph [surface base pos pos1]) | |
(defn to-morph [line] | |
(let [[surface details-str] (clojure.string/split line #"\t") | |
[pos pos1 _ _ _ _ base _ _] (clojure.string/split details-str #",")] | |
(->Morph surface base pos pos1))) | |
(def a (with-open [r (io/reader (io/resource "neko.txt.cabocha"))] | |
(doall | |
(->> | |
(line-seq r) | |
(filter #(not (clojure.string/starts-with? % "*"))) | |
(partition-by #(clojure.string/starts-with? % "EOS")) | |
(filter #(not (= "EOS" (first %)))) | |
(map #(map to-morph %)))))) | |
;; 41 | |
(defrecord Chunk [morphs dst srcs]) | |
(defrecord NoSrcsChunk [morphs idx dst]) | |
(defn to-no-srcs-chunk [dependency-line morph-lines] | |
(let [[_ idx dst-d _ _] (clojure.string/split dependency-line #" ") | |
dst (clojure.string/replace dst-d #"D" "")] | |
(->NoSrcsChunk (map to-morph morph-lines) | |
(Integer. idx) | |
(Integer. dst) | |
))) | |
;; idx-dst-pairsはその文全体での係り受け元と係り受け先のペアの配列 | |
(defn to-chunk [no-srcs-chunk idx-dst-pairs] | |
(->Chunk (:morphs no-srcs-chunk) | |
(:dst no-srcs-chunk) | |
(->> idx-dst-pairs | |
(filter #(= (second %) (:idx no-srcs-chunk))) | |
(map first)))) | |
(def sentences (doall | |
(->> | |
(line-seq (io/reader (io/resource "neko.txt.cabocha"))) | |
(partition-by #(clojure.string/starts-with? % "EOS")) | |
(filter #(not (= "EOS" (first %)))) | |
(map (fn [lines] | |
(loop [coll [] | |
lines lines] | |
(if (not (seq lines)) | |
coll | |
(let [dependency-line (first lines) | |
[morph-lines rest-lines] (split-with #(not (clojure.string/starts-with? % "* ")) (rest lines))] | |
(recur (conj coll {:dependency-line dependency-line | |
:morph-lines morph-lines}) | |
rest-lines)))))) | |
(map (fn [raw-chunks] | |
(map (fn [{:keys [dependency-line morph-lines]}] | |
(to-no-srcs-chunk dependency-line morph-lines)) | |
raw-chunks))) | |
(map (fn [no-srcs-chunks] | |
(let [idx-dst-pairs (->> no-srcs-chunks | |
(map (fn [no-srcs-chunk] | |
[(:idx no-srcs-chunk) (:dst no-srcs-chunk)])) | |
)] | |
(map (fn [no-srcs-chunk] | |
(to-chunk no-srcs-chunk idx-dst-pairs)) | |
no-srcs-chunks) | |
)))))) | |
(clojure.pprint/pprint (nth sentences 7)) | |
;; 42 | |
(->> sentences | |
(map (fn [chunks] | |
(->> chunks | |
(map (fn [chunk] | |
(->> | |
(:morphs chunk) | |
(filter #(not= (:pos %) "記号")) | |
(map :surface) | |
(apply str)))) | |
(clojure.string/join "\t")))) | |
(clojure.string/join "\n") | |
(spit "42.txt")) | |
;; 43 | |
(defn contains-noun? [chunk] | |
(->> (:morphs chunk) | |
(some #(= (:pos %) "名詞")) | |
(boolean))) | |
(defn contains-verb? [chunk] | |
(->> (:morphs chunk) | |
(some #(= (:pos %) "動詞")) | |
(boolean))) | |
(contains-noun? (first (first (take 1 sentences)))) | |
(contains-verb? (first (first (take 1 sentences)))) | |
(defn check43 [chunks] | |
(->> chunks | |
(filter contains-noun?) | |
(filter #(not= (:dst %) -1)) | |
(filter (fn [chunk] | |
(let [dst (:dst chunk)] | |
(contains-verb? (nth chunks dst))))) | |
(count) | |
(< 0))) | |
(->> sentences | |
(filter check43) | |
(map (fn [chunks] | |
(->> chunks | |
(map (fn [chunk] | |
(->> | |
(:morphs chunk) | |
(filter #(not= (:pos %) "記号")) | |
(map :surface) | |
(apply str)))) | |
(clojure.string/join "\t")))) | |
(clojure.string/join "\n") | |
(spit "43.txt")) | |
;; 44 | |
;; 与えられた文の係り受け木を有向グラフとして可視化せよ.可視化には,係り受け木をDOT言語に変換し,Graphvizを用いるとよい.また,Pythonから有向グラフを直接的に可視化するには,pydotを使うとよい. | |
(defn sentence-digraph [chunks] | |
(->> chunks | |
(filter (fn [chunk] | |
(not= -1 (:dst chunk)))) | |
(map (fn [chunk] | |
(let [from (->> (:morphs chunk) | |
(filter #(not= (:pos %) "記号")) | |
(map :surface) | |
(apply str)) | |
to (let [chunk (nth chunks (:dst chunk))] | |
(->> | |
(:morphs chunk) | |
(filter #(not= (:pos %) "記号")) | |
(map :surface) | |
(apply str)))] | |
[from to]))))) | |
(->> (sentence-digraph (nth sentences 6)) | |
(dot/digraph) | |
(dot/dot) | |
(show!)) | |
;; 45 | |
;; for debug | |
(defn to-string [chunks] | |
(->> chunks | |
(map (fn [chunk] | |
(let [morphs (:morphs chunk)] | |
(clojure.string/join "" (map :surface morphs))))) | |
(clojure.string/join "/"))) | |
(defn include-verb? [chunk] | |
(let [morphs (:morphs chunk)] | |
(boolean (some #(= (:pos %) "動詞") morphs)))) | |
(defn include-particles? [chunk] | |
(let [morphs (:morphs chunk)] | |
(boolean (some #(= (:pos %) "助詞") morphs)))) | |
(defn get-particles [chunk] | |
(let [morphs (:morphs chunk) | |
particles (filter #(= "助詞" (:pos %)) morphs)] | |
(map :surface particles))) | |
(defn get-verb-and-partices [chunk chunks] | |
(if (include-verb? chunk) | |
(when (not (empty? (:srcs chunk))) | |
(let [particle-chunks (->> (:srcs chunk) | |
(map #(nth chunks %)) | |
(filter include-particles?))] | |
[chunk particle-chunks])))) | |
(defn get-verb-chunk-base [verb-chunk] | |
(if-let [morph (some (fn [morph] | |
(when (= (:pos morph) "動詞") | |
morph)) | |
(:morphs verb-chunk))] | |
(:base morph))) | |
(defn to-str-45 [[verb-chunk particle-chunks]] | |
(let [verb-str (get-verb-chunk-base verb-chunk) | |
particle-str (->> (map (fn [particle-chunk] | |
(some (fn [morph] | |
(when (= (:pos morph) "助詞") | |
morph)) | |
(:morphs particle-chunk))) particle-chunks) | |
(map :surface))] | |
(str verb-str "\t" (clojure.string/join " " particle-str)))) | |
(->> sentences | |
(map (fn [chunks] | |
(->> chunks | |
(map #(get-verb-and-partices % chunks)) | |
(filter (comp not nil?)) | |
(map to-str-45)))) | |
(flatten) | |
(clojure.string/join "\n") | |
(spit "45.txt")) | |
;; 以下の事項をUNIXコマンドを用いて確認せよ. | |
;; コーパス中で頻出する述語と格パターンの組み合わせ | |
;; 「する」「見る」「与える」という動詞の格パターン(コーパス中で出現頻度の高い順に並べよ) | |
;; せっかくなのでclojureでやる | |
(def predicate-case-combinations | |
(with-open [r (io/reader "45.txt")] | |
(doall | |
(->> (line-seq r) | |
(map (fn [line] | |
(let [[verb cases-str] (clojure.string/split line #"\t")] | |
(when cases-str | |
(let [cases (clojure.string/split cases-str #" ")] | |
(->> cases | |
(map (fn [case] | |
[verb case])))))))) | |
(filter boolean) | |
(apply concat))))) | |
;; コーパス中で頻出する述語と格パターンの組み合わせ | |
(clojure.pprint/pprint | |
(take 10 (->> predicate-case-combinations | |
(frequencies) | |
(sort-by val) | |
(reverse)))) | |
;; 「する」「見る」「与える」という動詞の格パターン(コーパス中で出現頻度の高い順に並べよ) | |
(clojure.pprint/pprint | |
(->> predicate-case-combinations | |
(filter (fn [[verb _]] | |
(or (= verb "する") | |
(= verb "見る") | |
(= verb "与える")))) | |
(map second) | |
(frequencies) | |
(sort-by val) | |
(reverse))) | |
;; 46 | |
(defn to-str-46 [[verb-chunk particle-chunks]] | |
(let [verb-str (get-verb-chunk-base verb-chunk) | |
particle-str (->> (map (fn [particle-chunk] | |
(some (fn [morph] | |
(when (= (:pos morph) "助詞") | |
morph)) | |
(:morphs particle-chunk))) particle-chunks) | |
(map :surface)) | |
particle-all-str (->> particle-chunks | |
(filter (fn [particle-chunk] | |
(boolean (some (fn [morph] | |
(= (:pos morph) "助詞")) (:morphs particle-chunk))))) | |
(map (fn [particle-chunk] | |
(let [morphs (:morphs particle-chunk)] | |
(->> morphs | |
(map :surface) | |
(clojure.string/join ""))))))] | |
(str verb-str "\t" (clojure.string/join " " particle-str) "\t" (clojure.string/join " " particle-all-str)))) | |
(->> sentences | |
(map (fn [chunks] | |
(->> chunks | |
(map #(get-verb-and-partices % chunks)) | |
(filter (comp not nil?)) | |
(map to-str-46)))) | |
(flatten) | |
(clojure.string/join "\n") | |
(spit "46.txt")) | |
;; 47 | |
(defn sa? [morph] | |
(and | |
(= (:pos morph) "名詞") | |
(= (:pos1 morph) "サ変接続"))) | |
(defn wo? [morph] | |
(and (= (:pos morph) "助詞") | |
(= (:surface morph) "を"))) | |
(defn has-sa? [chunk] | |
(let [morphs (:morphs chunk)] | |
(boolean (some sa? morphs)))) | |
(defn has-wo? [chunk] | |
(let [morphs (:morphs chunk)] | |
(boolean (some wo? morphs)))) | |
(defn has-sa-and-wo? [chunk] | |
(and (has-sa? chunk) | |
(has-wo? chunk))) | |
(defn chunk-to-string [chunk] | |
(->> (:morphs chunk) | |
(filter (fn [morph] | |
(not= (:pos morph) "記号"))) | |
(map :surface) | |
(clojure.string/join))) | |
(defn get-first-verb-chunk [chunks] | |
"文中で動詞が含まれている文節の最初の一つを返す" | |
(->> chunks | |
(some (fn [chunk] | |
(when (include-verb? chunk) | |
chunk))))) | |
(defn get-all-verb-chunks [chunks] | |
"文中で動詞が含まれている文節をすべて返す" | |
(->> chunks | |
(filter include-verb?))) | |
(defn verb-chunk-to-base-str [verb-chunk] | |
(let [verb-morph (some #(when (= (:pos %) "動詞") %) (:morphs verb-chunk))] | |
(:base verb-morph))) | |
(defn sa-chunk-to-str [sa-chunk] | |
(let [sa-morph (some #(when (sa? %) %) (:morphs sa-chunk))] | |
(:surface sa-morph))) | |
(defn particle-chunks-to-only-particle-str [particle-chunks] | |
(->> particle-chunks | |
(map :morphs) | |
(map (fn [morphs] | |
;;(some #(when (= (:pos %) "助詞") %) morphs) | |
(->> morphs | |
(filter (fn [morph] | |
(= (:pos morph) "助詞"))) | |
last))) | |
(map (fn [particle-morph] | |
(:surface particle-morph))) | |
(clojure.string/join " "))) | |
(defn particle-chunks-to-str [particle-chunks] | |
(->> particle-chunks | |
(map chunk-to-string) | |
(clojure.string/join " "))) | |
(defn to-str-47 [{:keys [verb-chunk sa-chunk particle-chunks]}] | |
(let [verb-str (verb-chunk-to-base-str verb-chunk) | |
jutugo (str (sa-chunk-to-str sa-chunk) "を" verb-str) | |
only-particle-str (particle-chunks-to-only-particle-str particle-chunks) | |
particle-chunk-str (particle-chunks-to-str particle-chunks)] | |
(str jutugo "\t" only-particle-str "\t" particle-chunk-str))) | |
(->> sentences | |
(map (fn [chunks] | |
(let [verb-chunks (get-all-verb-chunks chunks)] | |
(->> verb-chunks | |
;; 動詞に係るサ変接続名詞の文節があるもののみfilter | |
(filter (fn [verb-chunk] | |
(let [srcs (:srcs verb-chunk) | |
src-chunks (map #(nth chunks %) srcs)] | |
(some has-sa-and-wo? src-chunks)))) | |
(map (fn [verb-chunk] | |
(let [srcs (:srcs verb-chunk) | |
src-chunks (map #(nth chunks %) srcs)] | |
(let [particle-chunks (->> src-chunks | |
(filter #(not (has-sa-and-wo? %))) | |
(filter include-particles?) | |
)] | |
{:verb-chunk verb-chunk | |
:sa-chunk (some #(when (has-sa-and-wo? %) %) src-chunks) | |
:particle-chunks particle-chunks})))) | |
)))) | |
(apply concat) | |
(filter (comp not empty?)) | |
(map to-str-47) | |
(clojure.string/join "\n") | |
(spit "47.txt")) | |
;; ;; このプログラムの出力をファイルに保存し,以下の事項をUNIXコマンドを用いて確認せよ. | |
;; ;; コーパス中で頻出する述語(サ変接続名詞+を+動詞) | |
(def sa-verbs | |
(with-open [r (io/reader "47.txt")] | |
(doall | |
(->> (line-seq r) | |
(map (fn [line] | |
(let [[verb] (clojure.string/split line #"\t")] | |
verb))))))) | |
(clojure.pprint/pprint | |
(take 10 (->> sa-verbs | |
(frequencies) | |
(sort-by val) | |
(reverse)))) | |
;; ;; コーパス中で頻出する述語と助詞パターン | |
(def sa-combinations | |
(with-open [r (io/reader "47.txt")] | |
(doall | |
(->> (line-seq r) | |
(map (fn [line] | |
(let [[verb particles-str] (clojure.string/split line #"\t")] | |
(when (not (nil? particles-str)) | |
(let [particles (clojure.string/split particles-str #" ")] | |
(map (fn [particle] | |
[verb particle]) | |
particles))) | |
))) | |
(filter (comp not nil?)) | |
(apply concat) | |
)))) | |
(clojure.pprint/pprint (take 10 (->> sa-combinations | |
(frequencies) | |
(sort-by val) | |
(reverse)))) | |
;; 48 | |
(defn make-chain [chunk chunks] | |
(lazy-seq | |
(if (= (:dst chunk) -1) | |
[chunk] | |
(lazy-seq (cons chunk (make-chain (nth chunks (:dst chunk)) chunks)))))) | |
(defn chain-to-string [chain] | |
(when (not (empty? chain)) | |
(->> chain | |
(map chunk-to-string) | |
(clojure.string/join " -> ")))) | |
(->> sentences | |
(filter (fn [chunks] | |
;; 名詞をふくむ | |
(some contains-noun? chunks))) | |
(map (fn [chunks] | |
(->> (map (fn [chunk] | |
(when (contains-noun? chunk) | |
(make-chain chunk chunks))) chunks) | |
(map chain-to-string) | |
(filter (comp not nil?))))) | |
(apply concat) | |
(clojure.string/join "\n") | |
(spit "48.txt")) | |
;; 49 | |
;; 吾輩はここで始めて人間というものを見た。 | |
(def debug-chunks (nth sentences 5)) | |
(defn to-indexed-chunk [idx chunk] | |
(assoc chunk :idx idx)) | |
(defn to-indexed-chunks [chunks] | |
(map-indexed | |
(fn [idx chunk] | |
(to-indexed-chunk idx chunk)) | |
chunks)) | |
;; すべての名詞のペアを抽出 [i j] idx付き | |
(defn all-noun-combinations [chunks] | |
(let [indexed-noun-chunks (->> chunks | |
(to-indexed-chunks) | |
(filter contains-noun?))] | |
(->> (for [i indexed-noun-chunks | |
j indexed-noun-chunks] | |
(vector i j)) | |
(filter (fn [[i j]] (not= i j))) | |
(map (fn [v] | |
(sort-by :idx v))) | |
(distinct) | |
(map (fn [[i j]] | |
[(dissoc i :idx) (dissoc j :idx)]))))) | |
;; iから構文木の根に至る経路にjがあるかどうか | |
(defn exists-route? [i j chunks] | |
(let [route (make-chain i chunks)] | |
(->> route | |
(some #(= j %)) | |
(boolean)))) | |
(defn to-x-str [chunk] | |
(let [morphs (:morphs chunk)] | |
(->> morphs | |
(map (fn [morph] | |
(if (= (:pos morph) "名詞") | |
"X" | |
(:surface morph)))) | |
(clojure.string/join)))) | |
(defn to-49-string-exists-route [i j chunks] | |
(let [route (make-chain i chunks) | |
middle-of-route (rest (take-while #(not= j %) route)) | |
i-str (to-x-str i)] | |
(str i-str " -> " (chain-to-string middle-of-route) " -> " "Y"))) | |
;; kを探す | |
;; 文節iと文節jから構文木の根に至る経路上で共通の文節kで交わる | |
(defn find-k [i j chunks] | |
(let [i-route-without-head (rest (make-chain i chunks)) | |
j-route-without-head (rest (make-chain j chunks))] | |
(loop [coll i-route-without-head] | |
(when (seq coll) | |
(let [k (first coll)] | |
(if (->> j-route-without-head | |
(some (fn [chunk] | |
(= k chunk)))) | |
k | |
(recur (rest coll)))))))) | |
;; 文節iから文節kに至る直前のパスを文字列化 | |
(defn left-str [i k chunks] | |
(let [route (->> (make-chain i chunks) | |
(take-while #(not= k %)))] | |
(->> route | |
(map (fn [chunk] | |
(if (= chunk i) | |
(to-x-str chunk) | |
(chunk-to-string chunk)))) | |
(clojure.string/join)))) | |
(defn to-y-str [chunk] | |
(let [morphs (:morphs chunk)] | |
(->> morphs | |
(map (fn [morph] | |
(if (= (:pos morph) "名詞") | |
"Y" | |
(:surface morph)))) | |
(clojure.string/join)))) | |
;; 文節jから文節kに至る直前のパスを文字列化 | |
(defn middle-str [j k chunks] | |
(let [route (->> (make-chain j chunks) | |
(take-while #(not= k %)))] | |
(->> route | |
(map (fn [chunk] | |
(if (= chunk j) | |
(to-y-str chunk) | |
(chunk-to-string chunk)))) | |
(clojure.string/join " -> ")))) | |
(defn to-49-string-not-exists-route [i j chunks] | |
(let [k (find-k i j chunks)] | |
(str (left-str i k chunks) " | " (middle-str j k chunks) " | " (chunk-to-string k)))) | |
(defmulti to-49-string exists-route?) | |
(defmethod to-49-string true [i j chunks] | |
(to-49-string-exists-route i j chunks)) | |
(defmethod to-49-string false [i j chunks] | |
(to-49-string-not-exists-route i j chunks)) | |
(->> sentences | |
(map (fn [chunks] | |
(->> (all-noun-combinations chunks) | |
(map (fn [[i j]] | |
(to-49-string i j chunks)))))) | |
(flatten) | |
(clojure.string/join "\n") | |
(spit "49.txt")) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment