;; using lumo for ClojureScript: https://github.com/anmonteiro/lumo
;;
;; https://www.wordclouds.com/
;;    - size: 1920x1080
;;    - size is typically around -45
;;    - shape: rectangle
;;    - theme: black on white
;;    - font: Arial
;;
;; PS: Over the last 30 years, I've probably written this program over
;;     20 times in awk, perl, python, ruby... this was the first time writing
;;     it in Clojure, my new favorite language.  This time around, I used Lumo,
;;     which feels much more like a scripting language than any other ClojureScript
;;     implementation.  Highly recommended!
;;
;;     There's another ClojureScript scripting alternative called Planck, which
;;     also looks fantastic: http://planck-repl.org/


(ns split.core
  (:require [clojure.string :as str]
            [cljs.nodejs :as nodejs]
            [lumo.util :as util :refer [file-seq line-seq distinct-by]]
            [goog.string :as gstr])
  (:import [goog.string format StringBuffer]))

(nodejs/enable-util-print!)

; membership helper: linear scan, works on any seqable collection
(defn in?
  "Returns true when at least one item of coll is = to elem; otherwise
  returns nil (so the result is only meant to be used as a truthy/falsey
  test)."
  [coll elem]
  (some (partial = elem) coll))

; words to filter out of list
(def common-words (util/line-seq "commonwords.txt"))

; source text
(def lines (util/line-seq "module6.txt"))

(println "# lines: " (count lines))

; One vector of tokens per input line.  Lines are split on runs of
; whitespace, periods and commas.  str/split returns an empty-string token
; for a blank line and for a line that starts with one of the delimiters;
; those tokens are dropped here so that "" is never counted as a word.
(def wlines
  (map (fn [line]
         (into [] (remove str/blank?) (str/split line #"[\s.,]+")))
       lines))

; Every token of the text as one flat sequence, lower-cased so that the
; frequency count is case-insensitive.
(def words (map str/lower-case (flatten wlines)))

; Report tunables.
(def min-count 1)     ; minimum occurrences to keep a word (1 keeps everything)
(def max-words 250)   ; number of entries printed

; [word count] pairs, most frequent first
(def sorted-words (sort-by second > (frequencies words)))

; pairs whose count reaches the min-count threshold
(def frequent-words
  (filter (fn [[_ n]] (>= n min-count)) sorted-words))

; Stop words as a set, so each membership test is a hash lookup rather than
; a scan of the whole list.  The tokens being counted are lower-cased, so
; the stop words are lower-cased (and trimmed) as well; otherwise an entry
; written with a capital letter could never match.
(def stop-words
  (into #{} (map (comp str/lower-case str/trim)) common-words))

; frequent words that are not in the stop-word list
(def top-new-words
  (remove (fn [[word _]] (contains? stop-words word)) frequent-words))

; final selection: the max-words most frequent remaining words
(def top-words (take max-words top-new-words))

; one "<count> <word>" line per selected word
(def out
  (map (fn [[word n]] (gstr/format "%d %s" n word)) top-words))

(doseq [o out]
  (println o))