Created
June 28, 2011 06:32
-
-
Save ithayer/1050607 to your computer and use it in GitHub Desktop.
Simple tf-idf in 30 lines of clojure. Inspired by a nice simple scala implementation: https://github.com/felipehummel/TinySearchEngine/blob/master/scala/tinySearch.scala and matches as closely as possible the computation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns ignacio.tfidf (:require [clojure.contrib.string :as string])) ;; Simple tfidf in clojure, for fun. | |
(def stopwords (set (string/split #"\n" (slurp "./stopwords.txt")))) | |
(defn tokenize [raw-text] ;; Lowercases and splits on non-letters, non-numbers. | |
(remove stopwords (string/split #"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+" (string/lower-case raw-text)))) | |
(defn idf2 [n-docs match] (Math/pow (Math/log (/ n-docs (count (keys match)))) 2)) | |
(defn index-one [fname] ;; Index for one file. Given an fname, returns a map of token -> map of (fname, count) | |
(let [word-counts (frequencies (tokenize (slurp fname)))] | |
(zipmap (keys word-counts) (map (fn [c] {fname c}) (vals word-counts))))) | |
(defn accum-tfidf [total-doc-count match] ;; Given the total term count and a map of doc -> count, accumulate tfidf for docs. | |
(map (fn [doc w-count] {doc (* w-count (idf2 total-doc-count match))}) (keys match) (vals match))) | |
(defn search [db total-doc-count raw-text] ;; Returns accumulated tfidf for each doc. | |
(let [results (keep db (tokenize raw-text))] ;; Each result is one term lookup. | |
(apply merge-with + (mapcat (partial accum-tfidf total-doc-count) results)))) | |
(defn read-and-search [db total-doc-count doc-norms raw-text] | |
(let [results (search db total-doc-count raw-text) | |
scores (take 3 (reverse (sort-by second (map (fn [k] [k (/ (results k) (doc-norms k))]) (keys results)))))] | |
(println "FOR: " raw-text " matched: " results))) | |
(defn -main [& args] | |
(let [db (apply merge-with merge (map index-one args)) | |
doc-norms-raw (apply merge-with + (mapcat (partial accum-tfidf (count args)) (vals db))) | |
doc-norms (zipmap (keys doc-norms-raw) (map #(Math/sqrt %) (vals doc-norms-raw)))] | |
(map (partial read-and-search db (count args) doc-norms) (line-seq (java.io.BufferedReader. *in*))))) |
Author
ithayer
commented
Jun 28, 2011
via email
Thanks Scott! What are you working on?
Ignacio
…On Tue, Jun 28, 2011 at 7:54 AM, scottjad < ***@***.***>wrote:
;; Here are a few improvements:
(defn fmap-keys [f m](fmap %28fn [[k v]] [k %28f v%29]%29 m)) ;fmap from contrib
(zipmap (keys word-counts) (map (fn [c] {fname c}) (vals word-counts)))
(zipmap (keys word-counts) (map fname (vals word-counts)))
(into {} (map (fn [[k v]] [k (fname v)]) word-counts))
(fmap (fn [[k v]] [k (fname v)]) word-counts)
(fmap-keys fname word-counts)
(map (fn [doc w-count] {doc (\* w-count (idf2 total-doc-count match))})
(keys match) (vals match))
(map (fn [[doc w-count]] {doc (\* w-count (idf2 total-doc-count match))})
match)
(zipmap (keys doc-norms-raw) (map #(Math/sqrt %) (vals doc-norms-raw)))
(into {} (map (fn [[k v]] [k (Math/sqrt v)]) doc-norms-raw))
(fmap (fn [[k v]] [k (Math/sqrt v)]) doc-norms-raw)
(fmap-keys #(Math/sqrt %) doc-norms-raw)
##
Reply to this email directly or view it on GitHub:
https://gist.github.com/1050607
probably --> doall required
(doall (map (partial read-and-search db
(count args) doc-norms) (line-seq (java.io.BufferedReader. in))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment