Created
March 6, 2020 11:43
-
-
Save genmeblog/9da71e2750f95cbc724baefbbfc813dd to your computer and use it in GitHub Desktop.
Tablesaw join test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns tablesaw-test
  (:require [clojure.java.io]
            [clojure.string])
  (:import [tech.tablesaw.io.csv CsvReadOptions CsvReader]))
;; Single CsvReader instance, created once and reused by `load-csv-data`.
(def ^CsvReader csv-reader (new CsvReader))
(defn load-csv-data
  "Reads the CSV file at path `file` through the shared `csv-reader` and
  returns the result of its `.read` call.

  The optional second argument is a map of reader options:
    :separator   - separator character handed to the options builder
                   (default is a comma)
    :line-ending - line-ending string handed to the options builder
                   (default is a newline)
    :header?     - value handed to the builder's `.header` (default true)

  Calling with only `file` uses all defaults."
  ([file]
   (load-csv-data file nil))
  ([^String file {:keys [separator line-ending header?]
                  :or   {separator \, line-ending "\n" header? true}}]
   ;; `doto` configures the builder in place and yields that same builder,
   ;; which is then built into the options object given to the reader.
   (let [configured (doto (CsvReadOptions/builder file)
                      (.separator separator)
                      (.lineEnding line-ending)
                      (.header header?))
         options    (.build configured)]
     (.read csv-reader options))))
;; Keys of the left-hand-side rows, in the column order used for lhs.csv.
(def lhs-fields
  [:size
   :day
   :operatorid
   :notes
   :more-notes
   :even-more-notes
   :how-can-there-be-more])
(defn customers
  "Returns a lazy sequence of 100000 synthetic customer maps, one per index
  from 0 to 99999.

  Index-derived values (:address, :address-id, names, :email, :huge-field) are
  deterministic; :gender and :city are random, and :zip-code is the city
  string repeated five times. :huge-field is filler text and is not one of
  the keys listed in `rhs-fields`."
  []
  (map (fn [n]
         (let [city (str (rand-int 10))]
           {:address (str "Address" n)
            :gender (rand-nth ["m" "f" "n"])
            :address-id n
            :country-code "99"
            :first-name (str "customer_" n "first")
            :last-name (str "customer_" n "last")
            :city city
            :zip-code (clojure.string/join (repeat 5 city))
            :email (str "customer_" n "@the-net")
            ;; The literal below deliberately spans three lines.
            :huge-field (str "this is a huge field containing a lot of dumb info for
bloat which will make the file so much larger for our poor machine how
unkind of us to do so in this day and age" n)}))
       (range 100000)))
;; Keys of the right-hand-side rows, in the column order used for rhs.csv.
(def rhs-fields
  [:operatorid
   :address :gender :address-id :country-code
   :first-name :last-name
   :city :zip-code
   :email])
(defn random-lhs
  "Returns a lazy sequence of 200000 left-hand-side row maps.

  :size, :day and :operatorid are random (operator ids have the form
  \"op<0-9999>op\"); the four notes fields are fixed filler strings."
  []
  (repeatedly
   200000
   (fn []
     {:size (rand-nth ["s" "m" "l"])
      :day (str (rand-int 100000))
      :operatorid (str "op" (rand-int 10000) "op")
      :notes "THis is some bloated information we'll add in"
      :more-notes "to make the table larger"
      :even-more-notes "Also this will make things big as well"
      :how-can-there-be-more "Yet another text field will add overhead jabroni"})))
(defn random-rhs
  "Returns a lazy sequence of 500000 right-hand-side row maps.

  Each row is a customer picked at random from a realized vector of
  `(customers)`, with a random :operatorid of the form \"op<0-9999>op\"
  added to it."
  []
  (let [pool (vec (customers))
        random-operator-id (fn [] (str "op" (rand-int 10000) "op"))]
    (repeatedly 500000
                #(assoc (rand-nth pool) :operatorid (random-operator-id)))))
(defn- write-csv!
  "Writes `rows` (a sequence of maps) to the file at `path` as comma-separated
  text: first a header line with the names of `fields`, then one line per row
  holding the row's values for `fields` in that order.

  Values are joined as-is, without quoting or escaping, so they must not
  contain commas or newlines. Returns nil."
  [path fields rows]
  (let [row->line (comp (partial clojure.string/join ",")
                        (apply juxt fields))]
    ;; The type hint lets the compiler resolve `.write` statically instead of
    ;; reflecting on every one of the hundreds of thousands of rows.
    (with-open [^java.io.Writer w (clojure.java.io/writer path)]
      (.write w (str (clojure.string/join "," (map name fields)) "\n"))
      (doseq [row rows]
        (.write w (str (row->line row) "\n"))))))

;; Generate the two input files for the join benchmark.
(write-csv! "lhs.csv" lhs-fields (random-lhs))
(write-csv! "rhs.csv" rhs-fields (random-rhs))
;; Load both generated files. The figures in the comments are the timings
;; recorded by the author.
;; 435 ms
(time (def lhs (load-csv-data "lhs.csv")))
;; 469 ms
(time (def rhs (load-csv-data "rhs.csv")))

;; The operatorid column has to be converted to a String column before the
;; join (the reader originally creates it as a text column).
(def lhs
  (let [operator-col (.asStringColumn (.column lhs "operatorid"))]
    (.replaceColumn lhs "operatorid" operator-col)))
(def rhs
  (let [operator-col (.asStringColumn (.column rhs "operatorid"))]
    (.replaceColumn rhs "operatorid" operator-col)))

;; Inner join of lhs and rhs on operatorid.
;; 40 s
(time
 (def result
   (let [joiner (.joinOn lhs (into-array String ["operatorid"]))]
     (.inner joiner rhs "operatorid"))))

(.rowCount result)
;; => 9999083

;; Drop the references to the tables (presumably so their memory can be
;; reclaimed once the experiment is done).
(def lhs nil)
(def rhs nil)
(def result nil)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment