behrica · December 17, 2020 22:29
diff --git a/bow.clj b/bow.clj
 (ns sciloj.smile-nlp-example
  (:require [clojure.string :as str]
            [pppmap.core :as ppp]
            [tablecloth.api :as tc]
            [tablecloth.api.split :as split])
  (:import smile.classification.Maxent
           smile.nlp.normalizer.SimpleNormalizer
           smile.nlp.stemmer.PorterStemmer
           [smile.nlp.tokenizer SimpleSentenceSplitter SimpleTokenizer]
           [smile.validation Accuracy ConfusionMatrix]))



 ;; Converts a bag-of-words map (token -> count) into the sparse format that
 ;; is passed to Maxent: a native int array holding the vocabulary index of
 ;; every token of `bow` that is also a key of `vocab->index-map`.
 ;; Tokens that are not in the vocabulary are dropped and the counts are
 ;; ignored; an index being present only means the token occurs in the text.
 ;; The indices are returned in ascending order so the result is deterministic.
 ;; An empty or nil `bow` yields an empty int array.
 (defn bow->sparse-indices [bow vocab->index-map]
   ;; Look up only the tokens of this document instead of merging the whole
   ;; vocabulary map into it, so the cost depends on the document size.
   (->> (keys bow)
        (keep #(get vocab->index-map %))
        sort
        (into-array Integer/TYPE)))


 ;; Turns a text into a bag of words: a map from stemmed, lower-cased token
 ;; to the number of times it occurs in the text.
 (defn default-text->bow [text]
   (let [normalizer        (SimpleNormalizer/getInstance)
         tokenizer         (SimpleTokenizer.)
         sentence-splitter (SimpleSentenceSplitter/getInstance)
         stemmer           (PorterStemmer.)
         sentences         (.split sentence-splitter (.normalize normalizer text))
         ;; tokenize sentence by sentence and concatenate all tokens
         tokens            (mapcat #(.split tokenizer %) sentences)]
     (frequencies
      (for [token tokens
            :when (some? token)]
        ;; stem first, then lower-case the stem
        (str/lower-case (.stem stemmer token))))))

 ;;; Builds a global vocabulary from the bag-of-words maps stored under
 ;;; `bow-col` in `ds`: the per-document token counts are summed and the
 ;;; `n` tokens with the highest total count are kept.
 ;;; Returns a map with
 ;;;   :vocab            - the kept tokens, most frequent first
 ;;;   :vocab->index-map - token -> position in :vocab
 ;;;   :index->vocab-map - position in :vocab -> token
 (defn ->vocabulary-top-n [ds bow-col n]
   (let [total-counts     (apply merge-with + (get ds bow-col))
         vocabulary       (->> total-counts
                               (sort-by val)
                               reverse
                               (take n)
                               keys)
         vocab->index-map (zipmap vocabulary (range))]
     {:vocab            vocabulary
      :vocab->index-map vocab->index-map
      ;; Inverted with core functions only, so this does not depend on the
      ;; clojure.set namespace having been loaded.
      :index->vocab-map (zipmap (vals vocab->index-map)
                                (keys vocab->index-map))}))


 ;; Adds (or replaces) column `bow-col` of `ds` with the result of mapping
 ;; `text->bow-fn` over the values of column `text-col`.
 (defn count-vectorize [ds text-col bow-col text->bow-fn]
   (let [texts->bows (fn [dataset]
                       (ppp/ppmap-with-progress "text->bow"
                                                1000
                                                text->bow-fn
                                                (get dataset text-col)))]
     (tc/add-or-replace-column ds bow-col texts->bows)))

 ;; Adds (or replaces) column `indices-col` of `ds`; each value is the
 ;; sparse index array computed by bow->sparse-indices from the matching
 ;; value of column `bow-col` and `vocab->index-map`.
 (defn bow->sparse-array [ds bow-col indices-col vocab->index-map]
   (letfn [(->indices [bow]
             (bow->sparse-indices bow vocab->index-map))
           (indices-column [dataset]
             (ppp/ppmap-with-progress "bow->sparse"
                                      1000
                                      ->indices
                                      (get dataset bow-col)))]
     (tc/add-or-replace-column ds indices-col indices-column)))


 ;; Builds the native arrays handed to Maxent from a train/test split.
 ;; `train-test-split` is called with :train and :test; from each part the
 ;; column `bow-sparse` (expected to hold one int array per row) becomes an
 ;; int[][] and the column `target` becomes an int[].
 ;; Returns {:x-train int[][], :x-test int[][], :y-train int[], :y-test int[]}.
 (defn train-test-arrays [train-test-split bow-sparse target]
   (let [;; Class of int[], given to into-array as the explicit component
         ;; type: the result is then an int[][] even for a part without rows.
         ;; A type hint on the collection would not select the array type.
         int-array-class (Class/forName "[I")
         features        (fn [part]
                           (into-array int-array-class
                                       (get (train-test-split part) bow-sparse)))
         labels          (fn [part]
                           (into-array Integer/TYPE
                                       (get (train-test-split part) target)))]
     {:x-train (features :train)
      :x-test  (features :test)
      :y-train (labels :train)
      :y-test  (labels :test)}))


 ;;; library code above
 ;; -----------------------------------------
 ;; user code


 ;; Load the :Text and :Score columns of the reviews, drop rows with missing
 ;; values, keep the first 10000 rows and add a :bow column of token counts.
 (def reviews
   (-> (tc/dataset "./Reviews.csv" {:key-fn keyword})
       (tc/select-columns [:Text :Score])
       (tc/drop-missing)
       (tc/head 10000)
       (count-vectorize :Text :bow default-text->bow)))

 ;; Vocabulary of the (at most) 10000 most frequent tokens.
 (def vocabulary
   (->vocabulary-top-n reviews :bow 10000))

 ;; Re-binds `reviews`: same data plus the :bow-sparse column.
 (def reviews
   (bow->sparse-array reviews :bow :bow-sparse (:vocab->index-map vocabulary)))

 (def train-test-split (first (split/split reviews :holdout {:ratio 0.3})))

 ;; Number of features = size of the vocabulary.
 (def p (count (vocabulary :vocab->index-map)))

 (def t-t-arrays (train-test-arrays train-test-split :bow-sparse :Score))

 (def maxent (Maxent/multinomial p (:x-train t-t-arrays) (:y-train t-t-arrays)))

 (def predictions (.predict maxent (:x-test t-t-arrays)))

 ;; Smile's validation metrics take the ground truth first and the
 ;; predictions second; with the arguments swapped the confusion matrix
 ;; would be transposed.
 (println
  (ConfusionMatrix/of (:y-test t-t-arrays) predictions))

 (println
  (Accuracy/of (:y-test t-t-arrays) predictions))
	(ns sciloj.smile-nlp-example
	(:require [clojure.string :as str]
	[pppmap.core :as ppp]
	[tablecloth.api :as tc]
	[tablecloth.api.split :as split])
	(:import smile.classification.Maxent
	smile.nlp.normalizer.SimpleNormalizer
	smile.nlp.stemmer.PorterStemmer
	[smile.nlp.tokenizer SimpleSentenceSplitter SimpleTokenizer]
	[smile.validation Accuracy ConfusionMatrix]))



	;; Converts a bag-of-words map (token -> count) into the sparse format that
	;; is passed to Maxent: a native int array holding the vocabulary index of
	;; every token of `bow` that is also a key of `vocab->index-map`.
	;; Tokens that are not in the vocabulary are dropped and the counts are
	;; ignored; an index being present only means the token occurs in the text.
	;; The indices are returned in ascending order so the result is deterministic.
	;; An empty or nil `bow` yields an empty int array.
	(defn bow->sparse-indices [bow vocab->index-map]
	  ;; Look up only the tokens of this document instead of merging the whole
	  ;; vocabulary map into it, so the cost depends on the document size.
	  (->> (keys bow)
	       (keep #(get vocab->index-map %))
	       sort
	       (into-array Integer/TYPE)))


	;; Turns a text into a bag of words: a map from stemmed, lower-cased token
	;; to the number of times it occurs in the text.
	(defn default-text->bow [text]
	  (let [normalizer        (SimpleNormalizer/getInstance)
	        tokenizer         (SimpleTokenizer.)
	        sentence-splitter (SimpleSentenceSplitter/getInstance)
	        stemmer           (PorterStemmer.)
	        sentences         (.split sentence-splitter (.normalize normalizer text))
	        ;; tokenize sentence by sentence and concatenate all tokens
	        tokens            (mapcat #(.split tokenizer %) sentences)]
	    (frequencies
	     (for [token tokens
	           :when (some? token)]
	       ;; stem first, then lower-case the stem
	       (str/lower-case (.stem stemmer token))))))

	;;; Builds a global vocabulary from the bag-of-words maps stored under
	;;; `bow-col` in `ds`: the per-document token counts are summed and the
	;;; `n` tokens with the highest total count are kept.
	;;; Returns a map with
	;;;   :vocab            - the kept tokens, most frequent first
	;;;   :vocab->index-map - token -> position in :vocab
	;;;   :index->vocab-map - position in :vocab -> token
	(defn ->vocabulary-top-n [ds bow-col n]
	  (let [total-counts     (apply merge-with + (get ds bow-col))
	        vocabulary       (->> total-counts
	                              (sort-by val)
	                              reverse
	                              (take n)
	                              keys)
	        vocab->index-map (zipmap vocabulary (range))]
	    {:vocab            vocabulary
	     :vocab->index-map vocab->index-map
	     ;; Inverted with core functions only, so this does not depend on the
	     ;; clojure.set namespace having been loaded.
	     :index->vocab-map (zipmap (vals vocab->index-map)
	                               (keys vocab->index-map))}))


	;; Adds (or replaces) column `bow-col` of `ds` with the result of mapping
	;; `text->bow-fn` over the values of column `text-col`.
	(defn count-vectorize [ds text-col bow-col text->bow-fn]
	  (let [texts->bows (fn [dataset]
	                      (ppp/ppmap-with-progress "text->bow"
	                                               1000
	                                               text->bow-fn
	                                               (get dataset text-col)))]
	    (tc/add-or-replace-column ds bow-col texts->bows)))

	;; Adds (or replaces) column `indices-col` of `ds`; each value is the
	;; sparse index array computed by bow->sparse-indices from the matching
	;; value of column `bow-col` and `vocab->index-map`.
	(defn bow->sparse-array [ds bow-col indices-col vocab->index-map]
	  (letfn [(->indices [bow]
	            (bow->sparse-indices bow vocab->index-map))
	          (indices-column [dataset]
	            (ppp/ppmap-with-progress "bow->sparse"
	                                     1000
	                                     ->indices
	                                     (get dataset bow-col)))]
	    (tc/add-or-replace-column ds indices-col indices-column)))


	;; Builds the native arrays handed to Maxent from a train/test split.
	;; `train-test-split` is called with :train and :test; from each part the
	;; column `bow-sparse` (expected to hold one int array per row) becomes an
	;; int[][] and the column `target` becomes an int[].
	;; Returns {:x-train int[][], :x-test int[][], :y-train int[], :y-test int[]}.
	(defn train-test-arrays [train-test-split bow-sparse target]
	  (let [;; Class of int[], given to into-array as the explicit component
	        ;; type: the result is then an int[][] even for a part without rows.
	        ;; A type hint on the collection would not select the array type.
	        int-array-class (Class/forName "[I")
	        features        (fn [part]
	                          (into-array int-array-class
	                                      (get (train-test-split part) bow-sparse)))
	        labels          (fn [part]
	                          (into-array Integer/TYPE
	                                      (get (train-test-split part) target)))]
	    {:x-train (features :train)
	     :x-test  (features :test)
	     :y-train (labels :train)
	     :y-test  (labels :test)}))


	;;; library code above
	;; -----------------------------------------
	;; user code


	;; Load the :Text and :Score columns of the reviews, drop rows with missing
	;; values, keep the first 10000 rows and add a :bow column of token counts.
	(def reviews
	  (-> (tc/dataset "./Reviews.csv" {:key-fn keyword})
	      (tc/select-columns [:Text :Score])
	      (tc/drop-missing)
	      (tc/head 10000)
	      (count-vectorize :Text :bow default-text->bow)))

	;; Vocabulary of the (at most) 10000 most frequent tokens.
	(def vocabulary
	  (->vocabulary-top-n reviews :bow 10000))

	;; Re-binds `reviews`: same data plus the :bow-sparse column.
	(def reviews
	  (bow->sparse-array reviews :bow :bow-sparse (:vocab->index-map vocabulary)))

	(def train-test-split (first (split/split reviews :holdout {:ratio 0.3})))

	;; Number of features = size of the vocabulary.
	(def p (count (vocabulary :vocab->index-map)))

	(def t-t-arrays (train-test-arrays train-test-split :bow-sparse :Score))

	(def maxent (Maxent/multinomial p (:x-train t-t-arrays) (:y-train t-t-arrays)))

	(def predictions (.predict maxent (:x-test t-t-arrays)))

	;; Smile's validation metrics take the ground truth first and the
	;; predictions second; with the arguments swapped the confusion matrix
	;; would be transposed.
	(println
	 (ConfusionMatrix/of (:y-test t-t-arrays) predictions))

	(println
	 (Accuracy/of (:y-test t-t-arrays) predictions))