Skip to content

Instantly share code, notes, and snippets.

@ithayer
ithayer / tf-idf.clj
Created June 28, 2011 06:32
Simple tf-idf in 30 lines of clojure. Inspired by a nice simple scala implementation: https://github.com/felipehummel/TinySearchEngine/blob/master/scala/tinySearch.scala and matches as closely as possible the computation.
(ns ignacio.tfidf (:require [clojure.contrib.string :as string])) ;; Simple tfidf in clojure, for fun.
(def stopwords (set (string/split #"\n" (slurp "./stopwords.txt"))))
(defn tokenize [raw-text] ;; Lowercases and splits on non-letters, non-numbers.
(remove stopwords (string/split #"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+" (string/lower-case raw-text))))
(defn idf2 [n-docs match] (Math/pow (Math/log (/ n-docs (count (keys match)))) 2))
(defn index-one [fname] ;; Index for one file. Given an fname, returns a map of token -> map of (fname, count)
[org.clojars.ithayer/plaid-penguin "1.0.0"]
var sys = require('sys'),
Path = require('path'),
Mu = require('../mu'),
Script = process.binding('evals').Script;
exports.compile = compile;
exports.compilePartial = compilePartial;
function RenderEventEmitter(options) {
this.chunkSize = options.chunkSize || 1024;