Skip to content

Instantly share code, notes, and snippets.

@MasseR
Created July 16, 2013 20:43
Show Gist options
  • Save MasseR/6012478 to your computer and use it in GitHub Desktop.
Save MasseR/6012478 to your computer and use it in GitHub Desktop.
tf-idf test
import Data.Map (Map)
import qualified Data.Map as M
import Data.Monoid
import Data.List (find, sortBy)
import Data.Maybe (fromMaybe)
import Data.Text.Lazy (Text)
import qualified Data.Text.Lazy as T
import qualified Data.Text.Lazy.IO as TI
import Data.Function
import Data.Char
type Term = Text
type Document = FilePath
data Tfidf = Tfidf (Sum Int) (Map Term [(Document, Int)]) deriving Show
instance Monoid Tfidf where
mempty = Tfidf mempty M.empty
mappend (Tfidf an am) (Tfidf bn bm) = Tfidf (mappend an bn) (M.unionWith (++) am bm)
tfidf :: Term -> Document -> Tfidf -> Double
tfidf term doc (Tfidf (Sum ndoc) t) = let
termdocs = M.lookup term t
tf = log $ fromIntegral $ fromMaybe 1 $ do
docs <- termdocs
fmap ((+1) . snd) $ find (\x -> fst x == doc) docs
idf = (fromIntegral (ndoc)) / (fromIntegral $ fromMaybe 1 $ fmap ((+1) . length) termdocs)
in tf * idf
toTfidf :: FilePath -> Text -> Tfidf
toTfidf doc content = let
hist = foldr (\t acc -> M.insertWith' (+) t 1 acc) M.empty $ map fixup $ T.words content
in Tfidf (Sum 1) $ M.map (\x -> [(doc, x)]) hist
where fixup = T.filter isAlphaNum . T.map toLower
main = do
let files = ["tfidf.hs",
"db.scm",
"43227-0.txt",
"pg43225.txt",
"pg43223.txt",
"pg43224.txt",
"pg18581.txt",
"pg14831.txt",
"pg399.txt",
"pg462.txt",
"pg528.txt",
"pg11111.txt"]
tfidfs@(Tfidf _ t) <- fmap mconcat $ mapM (\x -> TI.readFile x >>= return . toTfidf x) files
let sorted = sortBy (flip (compare `on` snd)) $ map (\x -> (x, tfidf x "pg528.txt" tfidfs)) $ M.keys t
mapM_ (\(doc, rank) -> TI.putStrLn $ T.unwords [doc, T.pack $ show rank]) $ take 15 sorted
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment