ithayer · June 28, 2011 06:32 · ithayer · Jun 28, 2011 · chernyshev-alex · Dec 5, 2014
diff --git a/tf-idf.clj b/tf-idf.clj
 (ns ignacio.tfidf (:require [clojure.contrib.string  :as string])) ;; Simple tfidf in clojure, for fun.

 (def stopwords (set (string/split #"\n" (slurp "./stopwords.txt"))))

 (defn tokenize [raw-text] ;; Lowercases and splits on non-letters, non-numbers.
  (remove stopwords (string/split #"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+" (string/lower-case raw-text))))

 (defn idf2 [n-docs match] (Math/pow (Math/log (/ n-docs (count (keys match)))) 2))

 (defn index-one [fname] ;; Index for one file. Given an fname, returns a map of token -> map of (fname, count)
  (let [word-counts (frequencies (tokenize (slurp fname)))]
    (zipmap (keys word-counts) (map (fn [c] {fname c}) (vals word-counts)))))

 (defn accum-tfidf [total-doc-count match] ;; Given the total term count and a map of doc -> count, accumulate tfidf for docs.
  (map (fn [doc w-count] {doc (* w-count (idf2 total-doc-count match))}) (keys match) (vals match)))
  
 (defn search [db total-doc-count raw-text] ;; Returns accumulated tfidf for each doc.
  (let [results (keep db (tokenize raw-text))] ;; Each result is one term lookup.
    (apply merge-with + (mapcat (partial accum-tfidf total-doc-count) results))))
        
 (defn read-and-search [db total-doc-count doc-norms raw-text]
  (let [results (search db total-doc-count raw-text)
 	scores  (take 3 (reverse (sort-by second (map (fn [k] [k (/ (results k) (doc-norms k))]) (keys results)))))]
    (println "FOR: " raw-text " matched: " results)))
  
 (defn -main [& args]
  (let [db            (apply merge-with merge (map index-one args))
        doc-norms-raw (apply merge-with + (mapcat (partial accum-tfidf (count args)) (vals db)))
        doc-norms     (zipmap (keys doc-norms-raw) (map #(Math/sqrt %) (vals doc-norms-raw)))]
   (map (partial read-and-search db (count args) doc-norms) (line-seq (java.io.BufferedReader. *in*)))))
	(ns ignacio.tfidf (:require [clojure.contrib.string :as string])) ;; Simple tfidf in clojure, for fun.

	(def stopwords (set (string/split #"\n" (slurp "./stopwords.txt"))))

	(defn tokenize [raw-text] ;; Lowercases and splits on non-letters, non-numbers.
	(remove stopwords (string/split #"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+" (string/lower-case raw-text))))

	(defn idf2 [n-docs match] (Math/pow (Math/log (/ n-docs (count (keys match)))) 2))

	(defn index-one [fname] ;; Index for one file. Given an fname, returns a map of token -> map of (fname, count)
	(let [word-counts (frequencies (tokenize (slurp fname)))]
	(zipmap (keys word-counts) (map (fn [c] {fname c}) (vals word-counts)))))

	(defn accum-tfidf [total-doc-count match] ;; Given the total term count and a map of doc -> count, accumulate tfidf for docs.
	(map (fn [doc w-count] {doc (* w-count (idf2 total-doc-count match))}) (keys match) (vals match)))

	(defn search [db total-doc-count raw-text] ;; Returns accumulated tfidf for each doc.
	(let [results (keep db (tokenize raw-text))] ;; Each result is one term lookup.
	(apply merge-with + (mapcat (partial accum-tfidf total-doc-count) results))))

	(defn read-and-search [db total-doc-count doc-norms raw-text]
	(let [results (search db total-doc-count raw-text)
	scores (take 3 (reverse (sort-by second (map (fn [k] [k (/ (results k) (doc-norms k))]) (keys results)))))]
	(println "FOR: " raw-text " matched: " results)))

	(defn -main [& args]
	(let [db (apply merge-with merge (map index-one args))
	doc-norms-raw (apply merge-with + (mapcat (partial accum-tfidf (count args)) (vals db)))
	doc-norms (zipmap (keys doc-norms-raw) (map #(Math/sqrt %) (vals doc-norms-raw)))]
	(map (partial read-and-search db (count args) doc-norms) (line-seq (java.io.BufferedReader. in)))))