-
-
Save spdegabrielle/2427904 to your computer and use it in GitHub Desktop.
TF-IDF in Racket
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(require unstable/dict) | |
(provide main) | |
;; Set[String] | |
(define stopwords (list->set (file->lines "./stopwords.txt"))) | |
;; String -> List[String] | |
(define (tokenize raw-text) ;; Lowercases and splits on non-letters, non-numbers. | |
(filter-not (λ (e) (set-member? stopwords e)) | |
(regexp-split #rx"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+" (string-downcase raw-text)))) | |
;; Number Hash[Any,Any] -> Number | |
(define (idf2 n-docs match) (sqr (log (/ n-docs (hash-count match))))) | |
;; Sequence[String] -> Hash[String,Number] | |
(define (frequencies l) (for/fold ([h (hash)]) ([i l]) (hash-update h i add1 0))) | |
;; Path-String -> Hash[String,Hash[Path-String,Number]] | |
;; Index for one file. Given an fname, returns a map of token -> map of (fname -> count) | |
(define (index-one fname) | |
(for/hash ([(k c) (in-hash (frequencies (tokenize (file->string fname))))]) | |
(values k (hash fname c)))) | |
;; Number Hash[String,Number] -> List[Hash[String,Number]] | |
;; Given the total term count and a map of doc -> count, accumulate tfidf for docs. | |
(define (accum-tfidf total-doc-count match) | |
(for/list ([(doc w-count) (in-hash match)]) | |
(hash doc (* w-count (idf2 total-doc-count match))))) | |
;; Hash[String,Hash[String,Number]] Number String -> Hash[String,Number] | |
;; Returns accumulated tfidf for each doc. | |
(define (search db total-doc-count raw-text) | |
(for*/fold ([h (hash)]) ([e (tokenize raw-text)] [r (in-value (hash-ref db e #f))] #:when r) | |
(apply dict-union #:combine + h (accum-tfidf total-doc-count r)))) | |
;; Hash[String,Hash[String,Number]] Number Hash[String,Number] String -> Void | |
(define (read-and-search db total-doc-count doc-norms raw-text) | |
(define results (search db total-doc-count raw-text)) | |
(define scores (take-right (sort (for/list ([(k v) results]) | |
(list k (/ v (hash-ref doc-norms k)))) | |
< #:key second) | |
(min 3 (hash-count results)))) | |
(printf "FOR: ~a matched: ~a\n" raw-text scores)) | |
;; List[String] -> Void | |
(define (main . args) | |
(define db (apply dict-union (hash) #:combine (λ (a b) (dict-union a b #:combine (λ (a b) a))) (map index-one args))) | |
(define doc-norms-raw (apply dict-union (hash) #:combine + (append-map (λ (e) (accum-tfidf (length args) e)) (hash-values db)))) | |
(define doc-norms (for/hash ([(k v) doc-norms-raw]) (values k (sqrt v)))) | |
(for ([e (in-lines)]) (read-and-search db (length args) doc-norms e))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment