Last active
June 9, 2020 15:06
-
-
Save realgenekim/2c7bd1cb33f6ffc2b13b1ad413d2fb06 to your computer and use it in GitHub Desktop.
Create sorted word frequencies for word clouds, using Lumo and ClojureScript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; using lumo for ClojureScript: https://github.com/anmonteiro/lumo | |
;; | |
;; https://www.wordclouds.com/ | |
;; - size: 1920x1080 | |
;; - size is typically around -45 | |
;; - shape: rectangle | |
;; - theme: black on white | |
;; - font: Arial
;; | |
;; PS: Over the last 30 years, I've probably written this program over | |
;; 20 times in awk, perl, python, ruby... this was the first time writing | |
;; it in Clojure, my new favorite language. This time around, I used Lumo, | |
;; which feels much more like a scripting language than any other ClojureScript | |
;; implementation. Highly recommended! | |
;; | |
;; There's another ClojureScript scripting alternative called Planck, which | |
;; also looks fantastic: http://planck-repl.org/ | |
(ns split.core | |
(:require [clojure.string :as str] | |
[cljs.nodejs :as nodejs] | |
[lumo.util :as util :refer [file-seq line-seq distinct-by]] | |
[goog.string :as gstr]) | |
(:import [goog.string format StringBuffer])) | |
; Install a Node-backed print function so that `println` output below
; reaches the console when the script is run under Node.
(nodejs/enable-util-print!)
; membership helper: linear scan of a collection for a given value
(defn in?
  "Returns true when some item of coll is equal to elem, otherwise nil."
  [coll elem]
  (some (fn [item] (= elem item)) coll))
; Word-frequency pipeline: read the source text, tokenize it, count the
; tokens, drop common words, and print the most frequent ones as
; "<count> <word>" lines (the format pasted into the word-cloud tool).

; words to filter out of the list, one per line of the file; kept in a set
; so each membership test is a hash lookup instead of a linear scan
(def common-words (set (util/line-seq "commonwords.txt")))

; source text
(def lines (util/line-seq "module6.txt"))

(println "# lines: " (count lines))

; minimum number of occurrences a word needs to be reported
(def min-count 1)

; maximum number of words to print
(def max-words 250)

; each line split into tokens on runs of whitespace, periods and commas
(def wlines (map #(str/split % #"[\s.,]+") lines))

; all tokens, lower-cased; blank lines and lines that begin with a
; delimiter make split emit "" tokens, which must not be counted as words
(def words
  (->> wlines
       (apply concat)
       (remove str/blank?)
       (map str/lower-case)))

; [word count] pairs, most frequent first
(def sorted-words (sort-by second > (frequencies words)))

; pairs that meet the minimum count
(def frequent-words (filter #(>= (second %) min-count) sorted-words))

; frequent pairs whose word is not in the common-word list
(def top-new-words (remove #(contains? common-words (first %)) frequent-words))

; final selection, defined exactly once
(def top-words (take max-words top-new-words))

; output lines in "<count> <word>" form
(def out (map (fn [[word n]] (gstr/format "%d %s" n word)) top-words))

(doseq [o out]
  (println o))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment