Skip to content

Instantly share code, notes, and snippets.

@k0f1sh
Last active May 24, 2020 14:13
Show Gist options
  • Save k0f1sh/7e1507b50e261c215caaccdc7d98e2ff to your computer and use it in GitHub Desktop.
Save k0f1sh/7e1507b50e261c215caaccdc7d98e2ff to your computer and use it in GitHub Desktop.
(ns gengo05
(:require [clojure.java.io :as io]
[incanter.core :as i-core]
[incanter.charts :as i-charts]
[dorothy.core :as dot]
[dorothy.jvm :refer (render save! show!)]))
;; https://nlp100.github.io/ja/ch05.html
;; $ cat neko.txt | cabocha -f1 > neko.txt.cabocha
;; 40
(defrecord Morph [surface base pos pos1])
(defn to-morph [line]
(let [[surface details-str] (clojure.string/split line #"\t")
[pos pos1 _ _ _ _ base _ _] (clojure.string/split details-str #",")]
(->Morph surface base pos pos1)))
(def a (with-open [r (io/reader (io/resource "neko.txt.cabocha"))]
(doall
(->>
(line-seq r)
(filter #(not (clojure.string/starts-with? % "*")))
(partition-by #(clojure.string/starts-with? % "EOS"))
(filter #(not (= "EOS" (first %))))
(map #(map to-morph %))))))
;; 41
(defrecord Chunk [morphs dst srcs])
(defrecord NoSrcsChunk [morphs idx dst])
(defn to-no-srcs-chunk [dependency-line morph-lines]
(let [[_ idx dst-d _ _] (clojure.string/split dependency-line #" ")
dst (clojure.string/replace dst-d #"D" "")]
(->NoSrcsChunk (map to-morph morph-lines)
(Integer. idx)
(Integer. dst)
)))
;; idx-dst-pairsはその文全体での係り受け元と係り受け先のペアの配列
(defn to-chunk [no-srcs-chunk idx-dst-pairs]
(->Chunk (:morphs no-srcs-chunk)
(:dst no-srcs-chunk)
(->> idx-dst-pairs
(filter #(= (second %) (:idx no-srcs-chunk)))
(map first))))
(def sentences (doall
(->>
(line-seq (io/reader (io/resource "neko.txt.cabocha")))
(partition-by #(clojure.string/starts-with? % "EOS"))
(filter #(not (= "EOS" (first %))))
(map (fn [lines]
(loop [coll []
lines lines]
(if (not (seq lines))
coll
(let [dependency-line (first lines)
[morph-lines rest-lines] (split-with #(not (clojure.string/starts-with? % "* ")) (rest lines))]
(recur (conj coll {:dependency-line dependency-line
:morph-lines morph-lines})
rest-lines))))))
(map (fn [raw-chunks]
(map (fn [{:keys [dependency-line morph-lines]}]
(to-no-srcs-chunk dependency-line morph-lines))
raw-chunks)))
(map (fn [no-srcs-chunks]
(let [idx-dst-pairs (->> no-srcs-chunks
(map (fn [no-srcs-chunk]
[(:idx no-srcs-chunk) (:dst no-srcs-chunk)]))
)]
(map (fn [no-srcs-chunk]
(to-chunk no-srcs-chunk idx-dst-pairs))
no-srcs-chunks)
))))))
(clojure.pprint/pprint (nth sentences 7))
;; 42
(->> sentences
(map (fn [chunks]
(->> chunks
(map (fn [chunk]
(->>
(:morphs chunk)
(filter #(not= (:pos %) "記号"))
(map :surface)
(apply str))))
(clojure.string/join "\t"))))
(clojure.string/join "\n")
(spit "42.txt"))
;; 43
(defn contains-noun? [chunk]
(->> (:morphs chunk)
(some #(= (:pos %) "名詞"))
(boolean)))
(defn contains-verb? [chunk]
(->> (:morphs chunk)
(some #(= (:pos %) "動詞"))
(boolean)))
(contains-noun? (first (first (take 1 sentences))))
(contains-verb? (first (first (take 1 sentences))))
(defn check43 [chunks]
(->> chunks
(filter contains-noun?)
(filter #(not= (:dst %) -1))
(filter (fn [chunk]
(let [dst (:dst chunk)]
(contains-verb? (nth chunks dst)))))
(count)
(< 0)))
(->> sentences
(filter check43)
(map (fn [chunks]
(->> chunks
(map (fn [chunk]
(->>
(:morphs chunk)
(filter #(not= (:pos %) "記号"))
(map :surface)
(apply str))))
(clojure.string/join "\t"))))
(clojure.string/join "\n")
(spit "43.txt"))
;; 44
;; 与えられた文の係り受け木を有向グラフとして可視化せよ.可視化には,係り受け木をDOT言語に変換し,Graphvizを用いるとよい.また,Pythonから有向グラフを直接的に可視化するには,pydotを使うとよい.
(defn sentence-digraph [chunks]
(->> chunks
(filter (fn [chunk]
(not= -1 (:dst chunk))))
(map (fn [chunk]
(let [from (->> (:morphs chunk)
(filter #(not= (:pos %) "記号"))
(map :surface)
(apply str))
to (let [chunk (nth chunks (:dst chunk))]
(->>
(:morphs chunk)
(filter #(not= (:pos %) "記号"))
(map :surface)
(apply str)))]
[from to])))))
(->> (sentence-digraph (nth sentences 6))
(dot/digraph)
(dot/dot)
(show!))
;; 45
;; for debug
(defn to-string [chunks]
(->> chunks
(map (fn [chunk]
(let [morphs (:morphs chunk)]
(clojure.string/join "" (map :surface morphs)))))
(clojure.string/join "/")))
(defn include-verb? [chunk]
(let [morphs (:morphs chunk)]
(boolean (some #(= (:pos %) "動詞") morphs))))
(defn include-particles? [chunk]
(let [morphs (:morphs chunk)]
(boolean (some #(= (:pos %) "助詞") morphs))))
(defn get-particles [chunk]
(let [morphs (:morphs chunk)
particles (filter #(= "助詞" (:pos %)) morphs)]
(map :surface particles)))
(defn get-verb-and-partices [chunk chunks]
(if (include-verb? chunk)
(when (not (empty? (:srcs chunk)))
(let [particle-chunks (->> (:srcs chunk)
(map #(nth chunks %))
(filter include-particles?))]
[chunk particle-chunks]))))
(defn get-verb-chunk-base [verb-chunk]
(if-let [morph (some (fn [morph]
(when (= (:pos morph) "動詞")
morph))
(:morphs verb-chunk))]
(:base morph)))
(defn to-str-45 [[verb-chunk particle-chunks]]
(let [verb-str (get-verb-chunk-base verb-chunk)
particle-str (->> (map (fn [particle-chunk]
(some (fn [morph]
(when (= (:pos morph) "助詞")
morph))
(:morphs particle-chunk))) particle-chunks)
(map :surface))]
(str verb-str "\t" (clojure.string/join " " particle-str))))
(->> sentences
(map (fn [chunks]
(->> chunks
(map #(get-verb-and-partices % chunks))
(filter (comp not nil?))
(map to-str-45))))
(flatten)
(clojure.string/join "\n")
(spit "45.txt"))
;; 以下の事項をUNIXコマンドを用いて確認せよ.
;; コーパス中で頻出する述語と格パターンの組み合わせ
;; 「する」「見る」「与える」という動詞の格パターン(コーパス中で出現頻度の高い順に並べよ)
;; せっかくなのでclojureでやる
(def predicate-case-combinations
(with-open [r (io/reader "45.txt")]
(doall
(->> (line-seq r)
(map (fn [line]
(let [[verb cases-str] (clojure.string/split line #"\t")]
(when cases-str
(let [cases (clojure.string/split cases-str #" ")]
(->> cases
(map (fn [case]
[verb case]))))))))
(filter boolean)
(apply concat)))))
;; コーパス中で頻出する述語と格パターンの組み合わせ
(clojure.pprint/pprint
(take 10 (->> predicate-case-combinations
(frequencies)
(sort-by val)
(reverse))))
;; 「する」「見る」「与える」という動詞の格パターン(コーパス中で出現頻度の高い順に並べよ)
(clojure.pprint/pprint
(->> predicate-case-combinations
(filter (fn [[verb _]]
(or (= verb "する")
(= verb "見る")
(= verb "与える"))))
(map second)
(frequencies)
(sort-by val)
(reverse)))
;; 46
(defn to-str-46 [[verb-chunk particle-chunks]]
(let [verb-str (get-verb-chunk-base verb-chunk)
particle-str (->> (map (fn [particle-chunk]
(some (fn [morph]
(when (= (:pos morph) "助詞")
morph))
(:morphs particle-chunk))) particle-chunks)
(map :surface))
particle-all-str (->> particle-chunks
(filter (fn [particle-chunk]
(boolean (some (fn [morph]
(= (:pos morph) "助詞")) (:morphs particle-chunk)))))
(map (fn [particle-chunk]
(let [morphs (:morphs particle-chunk)]
(->> morphs
(map :surface)
(clojure.string/join ""))))))]
(str verb-str "\t" (clojure.string/join " " particle-str) "\t" (clojure.string/join " " particle-all-str))))
(->> sentences
(map (fn [chunks]
(->> chunks
(map #(get-verb-and-partices % chunks))
(filter (comp not nil?))
(map to-str-46))))
(flatten)
(clojure.string/join "\n")
(spit "46.txt"))
;; 47
(defn sa? [morph]
(and
(= (:pos morph) "名詞")
(= (:pos1 morph) "サ変接続")))
(defn wo? [morph]
(and (= (:pos morph) "助詞")
(= (:surface morph) "を")))
(defn has-sa? [chunk]
(let [morphs (:morphs chunk)]
(boolean (some sa? morphs))))
(defn has-wo? [chunk]
(let [morphs (:morphs chunk)]
(boolean (some wo? morphs))))
(defn has-sa-and-wo? [chunk]
(and (has-sa? chunk)
(has-wo? chunk)))
(defn chunk-to-string [chunk]
(->> (:morphs chunk)
(filter (fn [morph]
(not= (:pos morph) "記号")))
(map :surface)
(clojure.string/join)))
(defn get-first-verb-chunk [chunks]
"文中で動詞が含まれている文節の最初の一つを返す"
(->> chunks
(some (fn [chunk]
(when (include-verb? chunk)
chunk)))))
(defn get-all-verb-chunks [chunks]
"文中で動詞が含まれている文節をすべて返す"
(->> chunks
(filter include-verb?)))
(defn verb-chunk-to-base-str [verb-chunk]
(let [verb-morph (some #(when (= (:pos %) "動詞") %) (:morphs verb-chunk))]
(:base verb-morph)))
(defn sa-chunk-to-str [sa-chunk]
(let [sa-morph (some #(when (sa? %) %) (:morphs sa-chunk))]
(:surface sa-morph)))
(defn particle-chunks-to-only-particle-str [particle-chunks]
(->> particle-chunks
(map :morphs)
(map (fn [morphs]
;;(some #(when (= (:pos %) "助詞") %) morphs)
(->> morphs
(filter (fn [morph]
(= (:pos morph) "助詞")))
last)))
(map (fn [particle-morph]
(:surface particle-morph)))
(clojure.string/join " ")))
(defn particle-chunks-to-str [particle-chunks]
(->> particle-chunks
(map chunk-to-string)
(clojure.string/join " ")))
(defn to-str-47 [{:keys [verb-chunk sa-chunk particle-chunks]}]
(let [verb-str (verb-chunk-to-base-str verb-chunk)
jutugo (str (sa-chunk-to-str sa-chunk) "を" verb-str)
only-particle-str (particle-chunks-to-only-particle-str particle-chunks)
particle-chunk-str (particle-chunks-to-str particle-chunks)]
(str jutugo "\t" only-particle-str "\t" particle-chunk-str)))
(->> sentences
(map (fn [chunks]
(let [verb-chunks (get-all-verb-chunks chunks)]
(->> verb-chunks
;; 動詞に係るサ変接続名詞の文節があるもののみfilter
(filter (fn [verb-chunk]
(let [srcs (:srcs verb-chunk)
src-chunks (map #(nth chunks %) srcs)]
(some has-sa-and-wo? src-chunks))))
(map (fn [verb-chunk]
(let [srcs (:srcs verb-chunk)
src-chunks (map #(nth chunks %) srcs)]
(let [particle-chunks (->> src-chunks
(filter #(not (has-sa-and-wo? %)))
(filter include-particles?)
)]
{:verb-chunk verb-chunk
:sa-chunk (some #(when (has-sa-and-wo? %) %) src-chunks)
:particle-chunks particle-chunks}))))
))))
(apply concat)
(filter (comp not empty?))
(map to-str-47)
(clojure.string/join "\n")
(spit "47.txt"))
;; ;; このプログラムの出力をファイルに保存し,以下の事項をUNIXコマンドを用いて確認せよ.
;; ;; コーパス中で頻出する述語(サ変接続名詞+を+動詞)
(def sa-verbs
(with-open [r (io/reader "47.txt")]
(doall
(->> (line-seq r)
(map (fn [line]
(let [[verb] (clojure.string/split line #"\t")]
verb)))))))
(clojure.pprint/pprint
(take 10 (->> sa-verbs
(frequencies)
(sort-by val)
(reverse))))
;; ;; コーパス中で頻出する述語と助詞パターン
(def sa-combinations
(with-open [r (io/reader "47.txt")]
(doall
(->> (line-seq r)
(map (fn [line]
(let [[verb particles-str] (clojure.string/split line #"\t")]
(when (not (nil? particles-str))
(let [particles (clojure.string/split particles-str #" ")]
(map (fn [particle]
[verb particle])
particles)))
)))
(filter (comp not nil?))
(apply concat)
))))
(clojure.pprint/pprint (take 10 (->> sa-combinations
(frequencies)
(sort-by val)
(reverse))))
;; 48
(defn make-chain [chunk chunks]
(lazy-seq
(if (= (:dst chunk) -1)
[chunk]
(lazy-seq (cons chunk (make-chain (nth chunks (:dst chunk)) chunks))))))
(defn chain-to-string [chain]
(when (not (empty? chain))
(->> chain
(map chunk-to-string)
(clojure.string/join " -> "))))
(->> sentences
(filter (fn [chunks]
;; 名詞をふくむ
(some contains-noun? chunks)))
(map (fn [chunks]
(->> (map (fn [chunk]
(when (contains-noun? chunk)
(make-chain chunk chunks))) chunks)
(map chain-to-string)
(filter (comp not nil?)))))
(apply concat)
(clojure.string/join "\n")
(spit "48.txt"))
;; 49
;; 吾輩はここで始めて人間というものを見た。
(def debug-chunks (nth sentences 5))
(defn to-indexed-chunk [idx chunk]
(assoc chunk :idx idx))
(defn to-indexed-chunks [chunks]
(map-indexed
(fn [idx chunk]
(to-indexed-chunk idx chunk))
chunks))
;; すべての名詞のペアを抽出 [i j] idx付き
(defn all-noun-combinations [chunks]
(let [indexed-noun-chunks (->> chunks
(to-indexed-chunks)
(filter contains-noun?))]
(->> (for [i indexed-noun-chunks
j indexed-noun-chunks]
(vector i j))
(filter (fn [[i j]] (not= i j)))
(map (fn [v]
(sort-by :idx v)))
(distinct)
(map (fn [[i j]]
[(dissoc i :idx) (dissoc j :idx)])))))
;; iから構文木の根に至る経路にjがあるかどうか
(defn exists-route? [i j chunks]
(let [route (make-chain i chunks)]
(->> route
(some #(= j %))
(boolean))))
(defn to-x-str [chunk]
(let [morphs (:morphs chunk)]
(->> morphs
(map (fn [morph]
(if (= (:pos morph) "名詞")
"X"
(:surface morph))))
(clojure.string/join))))
(defn to-49-string-exists-route [i j chunks]
(let [route (make-chain i chunks)
middle-of-route (rest (take-while #(not= j %) route))
i-str (to-x-str i)]
(str i-str " -> " (chain-to-string middle-of-route) " -> " "Y")))
;; kを探す
;; 文節iと文節jから構文木の根に至る経路上で共通の文節kで交わる
(defn find-k [i j chunks]
(let [i-route-without-head (rest (make-chain i chunks))
j-route-without-head (rest (make-chain j chunks))]
(loop [coll i-route-without-head]
(when (seq coll)
(let [k (first coll)]
(if (->> j-route-without-head
(some (fn [chunk]
(= k chunk))))
k
(recur (rest coll))))))))
;; 文節iから文節kに至る直前のパスを文字列化
(defn left-str [i k chunks]
(let [route (->> (make-chain i chunks)
(take-while #(not= k %)))]
(->> route
(map (fn [chunk]
(if (= chunk i)
(to-x-str chunk)
(chunk-to-string chunk))))
(clojure.string/join))))
(defn to-y-str [chunk]
(let [morphs (:morphs chunk)]
(->> morphs
(map (fn [morph]
(if (= (:pos morph) "名詞")
"Y"
(:surface morph))))
(clojure.string/join))))
;; 文節jから文節kに至る直前のパスを文字列化
(defn middle-str [j k chunks]
(let [route (->> (make-chain j chunks)
(take-while #(not= k %)))]
(->> route
(map (fn [chunk]
(if (= chunk j)
(to-y-str chunk)
(chunk-to-string chunk))))
(clojure.string/join " -> "))))
(defn to-49-string-not-exists-route [i j chunks]
(let [k (find-k i j chunks)]
(str (left-str i k chunks) " | " (middle-str j k chunks) " | " (chunk-to-string k))))
(defmulti to-49-string exists-route?)
(defmethod to-49-string true [i j chunks]
(to-49-string-exists-route i j chunks))
(defmethod to-49-string false [i j chunks]
(to-49-string-not-exists-route i j chunks))
(->> sentences
(map (fn [chunks]
(->> (all-noun-combinations chunks)
(map (fn [[i j]]
(to-49-string i j chunks))))))
(flatten)
(clojure.string/join "\n")
(spit "49.txt"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment