Last active
December 12, 2015 08:19
-
-
Save ghoseb/4743635 to your computer and use it in GitHub Desktop.
Sample code from the Pune Clojure Dojo Meetup session held on 8 Feb 2013.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns ^{:doc "Word frequencies in a text file." | |
:author "Baishampayan Ghose <[email protected]>"} | |
meetup.freq | |
(:require [clojure.java.io :as io] | |
[clojure.string :as s])) | |
(def stop-word?
  "Set of lowercase words excluded from the frequency count.
  Being a set, it can be called as a predicate: it returns the word when
  present and nil otherwise."
  (hash-set "is" "the" "am" "i" "that" "if")) ;; fill it up!
;;; the helpers below are small pipeline stages; most are written in a "point free" style
(def get-lines
  "Passes its arguments to `io/reader` and returns the lazy `line-seq`
  over the resulting reader. The reader is not closed by this function."
  (fn [& reader-args]
    (line-seq (apply io/reader reader-args))))
(def get-words
  "Takes a collection of strings and returns one lazy sequence of every
  match of the regex \\w+ found in them, in order."
  (partial mapcat #(re-seq #"\w+" %)))
(def lowercase-words
  "Lazily maps `s/lower-case` over a collection of words."
  (partial map (fn [word] (s/lower-case word))))
(def remove-stop-words
  "Lazily drops every word for which `stop-word?` returns a truthy value."
  (partial filter (complement stop-word?)))
(defn count-freqs
  "Returns a map from each distinct item in `coll` to the number of times
  it occurs. An empty or nil `coll` yields an empty map."
  [coll]
  ;; clojure.core/frequencies is the standard-library form of the
  ;; reduce/update-in/fnil counting loop.
  (frequencies coll))
(def sort-map
  "Sorts map entries by their values, largest value first. Returns a
  sequence of entries rather than a map."
  (partial sort-by #(- (val %))))
(defn word-freqs
  "Returns the `n` most frequent non-stop words in `file` as a sequence of
  [word count] entries, most frequent first. `n` defaults to 10.

  `file` is anything `clojure.java.io/reader` accepts. The reader is closed
  before this function returns; if `file` is itself an open reader, it will
  be closed as well."
  ([file]
   (word-freqs file 10))
  ([file n]
   ;; Open the reader here so it is always closed. count-freqs and sort-map
   ;; are eager, so every line has been consumed before with-open exits;
   ;; the lazy `take` only walks the already-sorted result.
   (with-open [rdr (io/reader file)]
     (->> (line-seq rdr)
          get-words
          lowercase-words
          remove-stop-words
          count-freqs
          sort-map
          (take n)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment