-- Last active October 14, 2015 22:06
module Main where
import Data.Time
import Debug.Trace
import Data.Char hiding (empty)
import Data.List (findIndex)
import Data.Map hiding (findIndex, foldl)
import Text.ParserCombinators.ReadP
-- | Path of the dataset file that 'main' reads.
dataset :: FilePath
dataset = "mxm_dataset_test.txt"

-- | Word that 'main' looks up in the inverted index.
target :: String
target = "devil"
-- | Build the inverted index.
--
-- Folds over every @(trackId, mxmId, stat)@ triple and, for each
-- @(wordIndex, count)@ pair in @stat@, prepends the track's @mxmId@ to the
-- list stored under @wordIndex@.  The track id and the counts are ignored.
invertedIndex tracks = foldl indexTrack empty tracks
  where
    -- Add every word of one track to the index built so far.
    indexTrack soFar (_, mxmId, stat) = foldl (indexWord mxmId) soFar stat
    -- 'insertWith' calls (++) as @new ++ old@, so the newest id goes first.
    indexWord mxmId soFar (wordIndex, _) =
      insertWith (++) wordIndex [mxmId] soFar
-- | Look up the ids of the tracks that contain @word@.
--
-- @words@ is the vocabulary list; the inverted index is keyed by the decimal
-- string of a word's 1-based position in that list.
--
-- Returns 'Nothing' both when @word@ is not in the vocabulary and when
-- nothing is indexed under its position, so every kind of miss reaches the
-- caller through the 'Maybe' result instead of aborting with 'error'.
grep word words invIndex = do
  i <- findIndex (== word) words
  Data.Map.lookup (show (i + 1)) invIndex -- index keys start from 1
-- MusiXmatch parser
-- | Parser that consumes exactly one line-feed character and returns it.
newline :: ReadP Char
newline = satisfy (== '\n')
-- | Parse one @#@ comment line and return its text (without the leading @#@).
--
-- The line terminator has to be consumed here: if the parser stopped in
-- front of the @'\n'@, a repeated @many skipComment@ could not advance past
-- the first comment line and whatever follows the comments would be
-- unreachable.  A comment that ends at end of input (no trailing newline) is
-- accepted as well.
skipComment = do
  _ <- char '#'
  text <- munch (/= '\n')
  -- '<++' commits to the newline when there is one; 'eof' only covers a
  -- comment that is the very last thing in the input.
  (char '\n' >> return ()) <++ eof
  return text
-- | Parse one vocabulary word: a non-empty run of characters other than
-- @','@ and @'\n'@, followed by zero or more commas.  Returns the word; the
-- commas are discarded.
wordName = do
  name <- munch1 (`notElem` ",\n")
  _ <- many (char ',')
  return name
-- | Parse the vocabulary line: a @'%'@ marker followed by 'wordName' entries
-- up to and including the terminating 'newline'.  Returns the words in the
-- order they appear.
topWords = do
  _ <- char '%'
  manyTill wordName newline
-- | Parse one @index:count@ entry plus any commas that follow it.
--
-- Both numbers are returned as the digit strings read from the input; they
-- are not converted to integers.
wordStat = do
  idx <- digits
  _ <- char ':'
  cnt <- digits
  _ <- many (char ',')
  return (idx, cnt)
  where
    -- A non-empty run of decimal digits.
    digits = munch1 isDigit
-- | Parse one track line: a track id (everything up to the first comma), a
-- comma-delimited run of digits (the mxm id), then 'wordStat' entries up to
-- and including the terminating 'newline'.
--
-- Returns @(trackId, mxmId, stats)@.
track = do
  tid <- munch1 (/= ',')
  mid <- between comma comma (munch1 isDigit)
  stats <- manyTill wordStat newline
  return (tid, mid, stats)
  where
    comma = char ','
-- | Parse a whole dataset: zero or more 'skipComment' parses, the 'topWords'
-- vocabulary line, then 'track' entries until end of input.
--
-- Returns @(words, tracks)@.
mxm = do
  _ <- many skipComment
  vocabulary <- topWords
  songs <- manyTill track eof
  return (vocabulary, songs)
-- | Run the 'mxm' parser over the file contents.
--
-- Exactly one parse result is expected; anything else (no result, or more
-- than one) aborts with the error @"file not parsed"@.
parseText text = expectSingle (readP_to_S mxm text)
  where
    expectSingle [(parsed, _)] = parsed
    expectSingle _ = error "file not parsed"
-- | Parse the dataset, build the inverted index and look up 'target',
-- printing the sizes and the elapsed time after each of the three stages.
main :: IO ()
main = do
  t0 <- getCurrentTime
  content <- readFile dataset
  -- Kept as a lazy 'let' pattern: the parse is only forced by the 'length'
  -- calls below.
  let (vocabulary, songs) = parseText content
  putStrLn ("Words: " ++ show (length vocabulary))
  putStrLn ("Tracks: " ++ show (length songs))
  t1 <- getCurrentTime
  elapsed "Time on parsing: " t0 t1
  let invIndex = invertedIndex songs
  putStrLn ("Indexed: " ++ show (size invIndex))
  t2 <- getCurrentTime
  elapsed "Time on indexing: " t1 t2
  putStrLn $ maybe "word not found" hitCount (grep target vocabulary invIndex)
  t3 <- getCurrentTime
  elapsed "Time on grepping: " t2 t3
  where
    -- Print a label followed by the time between two timestamps.
    elapsed label from to = putStrLn (label ++ show (diffUTCTime to from))
    -- Message for a successful lookup.
    hitCount ids = "Number of tracks with that word: " ++ show (length ids)
-export([inverted_index/1, grep/3, main/1]).
%% Path of the dataset file that main/1 passes to read_mxm:from_file/1.
-define(DATASET, "mxm_dataset_test.txt").
%% Word that main/1 looks up in the inverted index.
-define(TARGET, "devil").
%% @doc Build the inverted index from a list of raw track entries.
%%
%% Each entry is decoded with read_mxm:parse_track/1 into
%% {_, MxmId, TrackWords}. For every {WordIndex, _} pair in TrackWords the
%% MxmId is appended to the list stored under WordIndex.
%%
%% Returns a dict mapping each WordIndex to the list of MxmIds, in the order
%% the tracks were folded. An empty Tracks list yields an empty dict.
-spec inverted_index(list()) -> dict:dict().
inverted_index(Tracks) ->
    lists:foldl(
        fun(Track, Acc) ->
            {_, MxmId, TrackWords} = read_mxm:parse_track(Track),
            %% Thread the same accumulator through the words of this track.
            lists:foldl(
                fun({WordIndex, _}, Acc2) ->
                    dict:append(WordIndex, MxmId, Acc2)
                end,
                Acc,
                TrackWords
            )
        end,
        dict:new(),
        Tracks
    ).
%% @doc Look up the tracks indexed under Word.
%%
%% Words is the vocabulary list; the key used for the lookup is the 1-based
%% position of the first element of Words that is exactly equal to Word, or
%% 0 when there is none. Returns the result of dict:find/2 for that key:
%% {ok, Value} or error.
%%
%% NOTE(review): this assumes the index keys are integer positions - confirm
%% against what read_mxm:parse_track/1 produces for WordIndex.
-spec grep(term(), list(), dict:dict()) -> {ok, term()} | error.
grep(Word, Words, InvIndex) ->
    dict:find(word_position(Word, Words, 1), InvIndex).

%% 1-based position of Word in the list, or 0 when it is absent. Replaces the
%% obsolete string:str/2 call, which was being used on a list of words.
word_position(_Word, [], _Pos) ->
    0;
word_position(Word, [Word | _], Pos) ->
    Pos;
word_position(Word, [_ | Rest], Pos) ->
    word_position(Word, Rest, Pos + 1).
main(_) ->
{Words, Tracks} = read_mxm:from_file(?DATASET),
{Time, InvIndex} = timer:tc(test_seq, inverted_index, [Tracks]),
io:format("Time on indexing: ~ps~n", [Time * 0.000001]),
case grep(?TARGET, Words, InvIndex) of
{ok, Result} ->
io:format("Number of tracks with that word: ~p~n",
_ -> io:format("word not found~n")
