Last active
December 17, 2020 22:29
-
-
Save behrica/1c532dc122c68b5d831a6bacd7c944d6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns sciloj.smile-nlp-example | |
(:require [clojure.string :as str] | |
[pppmap.core :as ppp] | |
[tablecloth.api :as tc] | |
[tablecloth.api.split :as split]) | |
(:import smile.classification.Maxent | |
smile.nlp.normalizer.SimpleNormalizer | |
smile.nlp.stemmer.PorterStemmer | |
[smile.nlp.tokenizer SimpleSentenceSplitter SimpleTokenizer] | |
[smile.validation Accuracy ConfusionMatrix])) | |
;; Converts the token counts to the format the Maxent functions want:
;; a native int array in which every int is an index into the vocabulary.
;; If an index is present, the corresponding token is present in the text.
(defn bow->sparse-indices
  "Converts one bag-of-words map into a native `int[]` of vocabulary indices.

  bow              - map of token -> count for a single text (may be nil/empty).
  vocab->index-map - map of token -> integer index in the vocabulary.

  Returns an `int[]` holding the index of every token of `bow` that is also a
  key of `vocab->index-map`. Counts are ignored (only presence matters) and
  tokens that are not in the vocabulary are dropped. Indices are returned in
  ascending order so the result does not depend on map iteration order."
  [bow vocab->index-map]
  (->> (keys bow)
       ;; Look up only this document's tokens rather than merging the whole
       ;; vocabulary map per document; `keep` drops out-of-vocabulary tokens
       ;; (lookup returns nil) while retaining index 0.
       (keep #(get vocab->index-map %))
       sort
       (into-array Integer/TYPE)))
;; converts text to token counts (a map token -> count) | |
(defn default-text->bow
  "Turns `text` into a bag of words: a map of token -> number of occurrences.

  The text is normalized (SimpleNormalizer), split into sentences
  (SimpleSentenceSplitter) and each sentence is tokenized (SimpleTokenizer).
  Every token is then stemmed with the PorterStemmer and lower-cased before
  the occurrences are counted."
  [text]
  (let [norm      (SimpleNormalizer/getInstance)
        splitter  (SimpleSentenceSplitter/getInstance)
        tok       (SimpleTokenizer.)
        porter    (PorterStemmer.)
        sentences (.split splitter (.normalize norm text))
        ;; Concatenate the per-sentence token arrays into one flat sequence.
        tokens    (->> sentences
                       (mapcat (fn [sentence] (.split tok sentence)))
                       (remove nil?))]
    (frequencies
     (map (fn [token] (str/lower-case (.stem porter token)))
          tokens))))
;;; Takes the top n most frequently occurring terms from all token-frequency tables.
;;; Builds a global vocabulary first.
(defn ->vocabulary-top-n
  "Builds a vocabulary from the `n` most frequent tokens of a dataset.

  ds      - anything supporting `(get ds bow-col)`, yielding a sequence of
            bag-of-words maps (token -> count).
  bow-col - name of the column holding the bag-of-words maps.
  n       - maximum number of tokens to keep.

  All maps are summed into one global token -> count table, which is ranked
  by descending count.

  Returns a map with:
    :vocab            - vector of the kept tokens, most frequent first
    :vocab->index-map - token -> position of the token in :vocab
    :index->vocab-map - position -> token (inverse of the above)"
  [ds bow-col n]
  (let [vocabulary       (->> (get ds bow-col)
                              (apply merge-with +)
                              (sort-by val)
                              reverse
                              (take n)
                              (mapv key))
        vocab->index-map (zipmap vocabulary (range))]
    {:vocab            vocabulary
     :vocab->index-map vocab->index-map
     ;; Tokens are unique (they were map keys), so zipping the other way
     ;; round is the exact inverse and needs no `clojure.set` require.
     :index->vocab-map (zipmap (range) vocabulary)}))
;; vectorizes the text via text->bow-fn | |
(defn count-vectorize
  "Adds (or replaces) column `bow-col` of dataset `ds` with the result of
  applying `text->bow-fn` to every value of column `text-col`.

  The mapping goes through `ppp/ppmap-with-progress`, labelled \"text->bow\"
  and called with 1000 as its second argument.
  NOTE(review): 1000 is presumably a batch/partition size - confirm against
  the pppmap documentation."
  [ds text-col bow-col text->bow-fn]
  (letfn [(bow-column [dataset]
            (->> (get dataset text-col)
                 (ppp/ppmap-with-progress "text->bow" 1000 text->bow-fn)))]
    (tc/add-or-replace-column ds bow-col bow-column)))
;; converts the bow column to a sparse-indices column
(defn bow->sparse-array
  "Adds (or replaces) column `indices-col` of dataset `ds` with the result of
  calling `bow->sparse-indices` on every value of column `bow-col`, using
  `vocab->index-map` as the vocabulary.

  The mapping goes through `ppp/ppmap-with-progress`, labelled \"bow->sparse\"
  and called with 1000 as its second argument.
  NOTE(review): 1000 is presumably a batch/partition size - confirm against
  the pppmap documentation."
  [ds bow-col indices-col vocab->index-map]
  (let [->indices      (fn [bow]
                         (bow->sparse-indices bow vocab->index-map))
        indices-column (fn [dataset]
                         (ppp/ppmap-with-progress
                          "bow->sparse"
                          1000
                          ->indices
                          (get dataset bow-col)))]
    (tc/add-or-replace-column ds indices-col indices-column)))
;; create final native arrays for Maxent | |
(defn train-test-arrays
  "Builds the native arrays used for training and evaluation.

  train-test-split - callable with :train and :test (e.g. a map), each
                     yielding something that supports `get` by column name.
  bow-sparse       - name of the column whose values are `int[]` index arrays.
  target           - name of the column holding the integer class labels.

  Returns a map with:
    :x-train / :x-test - `int[][]`, one `int[]` per row of `bow-sparse`
    :y-train / :y-test - `int[]` of the `target` values

  The feature arrays are created with an explicit `int[]` component type, so
  an empty partition still yields an (empty) `int[][]` instead of `Object[]`."
  [train-test-split bow-sparse target]
  (let [int-array-class (Class/forName "[I")
        ;; int[][] of the sparse feature rows of one partition
        features (fn [part]
                   (into-array int-array-class
                               (get (train-test-split part) bow-sparse)))
        ;; int[] of the labels of one partition
        labels   (fn [part]
                   (into-array Integer/TYPE
                               (get (train-test-split part) target)))]
    {:x-train (features :train)
     :x-test  (features :test)
     :y-train (labels :train)
     :y-test  (labels :test)}))
;;; a library | |
;; ----------------------------------------- | |
;; user code | |
;; Load the reviews, keep only the :Text and :Score columns, drop rows with
;; missing values, keep the first 10000 rows and add a :bow column holding
;; the token counts of each text.
(def reviews
  (->
   (tc/dataset "./Reviews.csv" {:key-fn keyword})
   (tc/select-columns [:Text :Score])
   (tc/drop-missing)
   (tc/head 10000)
   (count-vectorize :Text :bow default-text->bow)))

;; Global vocabulary: at most 10000 of the most frequent tokens.
(def vocabulary
  (->vocabulary-top-n reviews :bow 10000))

;; Intentional redefinition: same dataset plus the :bow-sparse column of
;; vocabulary indices.
(def reviews
  (bow->sparse-array reviews :bow :bow-sparse (:vocab->index-map vocabulary)))

;; First split produced by the :holdout strategy.
(def train-test-split (first (split/split reviews :holdout {:ratio 0.3})))

;; Number of features = size of the vocabulary.
(def p (count (vocabulary :vocab->index-map)))

(def t-t-arrays (train-test-arrays train-test-split :bow-sparse :Score))

(def maxent (Maxent/multinomial p (:x-train t-t-arrays) (:y-train t-t-arrays)))

(def predictions (.predict maxent (:x-test t-t-arrays)))

;; Smile's validation helpers take the ground truth first and the prediction
;; second; passing them the other way round transposes the confusion matrix.
(println
 (ConfusionMatrix/of (:y-test t-t-arrays) predictions))

(println
 (Accuracy/of (:y-test t-t-arrays) predictions))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment