Skip to content

Instantly share code, notes, and snippets.

@soimort
Last active October 14, 2015 22:06
Show Gist options
  • Save soimort/d7719cceb9943c01a1e1 to your computer and use it in GitHub Desktop.
Save soimort/d7719cceb9943c01a1e1 to your computer and use it in GitHub Desktop.
module Main where
import Data.Time
import Debug.Trace
import Data.Char hiding (empty)
import Data.List (findIndex)
import Data.Map hiding (findIndex, foldl)
import Text.ParserCombinators.ReadP
dataset = "mxm_dataset_test.txt"
target = "devil"
-- Build the inverted index.
invertedIndex =
foldl (\ acc1 (_, mxmId, stat) ->
foldl (\ acc2 (wordIndex, _) ->
insertWith (++) wordIndex [mxmId] acc2) acc1 stat) empty
-- Grep a word in the inverted index.
grep word words invIndex =
case findIndex (== word) words of
Just i -> Data.Map.lookup (show $ i + 1) invIndex -- starts from 1
Nothing -> error "word not found"
-- MusiXmatch parser
newline = char '\n'
skipComment = do
char '#'
munch (/= '\n')
newline
wordName = do
x <- munch1 $ \x -> x /= ',' && x /= '\n'
many $ char ','
return x
topWords = char '%' >> manyTill wordName newline
wordStat = do
wordIndex <- munch1 isDigit
char ':'
wordCount <- munch1 isDigit
many $ char ','
return (wordIndex, wordCount)
track = do
trackId <- munch1 (/= ',')
char ','
mxmId <- munch1 isDigit
char ','
stat <- manyTill wordStat newline
return (trackId, mxmId, stat)
mxm = do
many skipComment
words <- topWords
tracks <- manyTill track eof
return (words, tracks)
parseText text = do
case readP_to_S mxm text of
[(v, _)] -> v
_ -> error "file not parsed"
main = do
timeStart <- getCurrentTime
content <- readFile dataset
let (words, tracks) = parseText content
putStrLn $ "Words: " ++ show (length words)
putStrLn $ "Tracks: " ++ show (length tracks)
timeFinishParsing <- getCurrentTime
putStrLn $ "Time on parsing: " ++
show (diffUTCTime timeFinishParsing timeStart)
let invIndex = invertedIndex tracks
putStrLn $ "Indexed: " ++ show (size invIndex)
timeFinishIndexing <- getCurrentTime
putStrLn $ "Time on indexing: " ++
show (diffUTCTime timeFinishIndexing timeFinishParsing)
case grep target words invIndex of
Nothing -> putStrLn "word not found"
Just result -> putStrLn $ "Number of tracks with that word: " ++
show (length result)
timeFinishGrepping <- getCurrentTime
putStrLn $ "Time on grepping: " ++
show (diffUTCTime timeFinishGrepping timeFinishIndexing)
-module(test_seq).
-mode(compile).
-export([inverted_index/1, grep/3, main/1]).
-define(DATASET, "mxm_dataset_test.txt").
-define(TARGET, "devil").
inverted_index(Tracks) ->
lists:foldl
(fun(Track, Acc) ->
{_, MxmId, TrackWords} = read_mxm:parse_track(Track),
lists:foldl(fun({WordIndex, _}, Acc2) ->
dict:append(WordIndex, MxmId, Acc2)
end,
Acc, TrackWords)
end,
dict:new(), Tracks).
grep(Word, Words, InvIndex) ->
dict:find(string:str(Words, [Word]), InvIndex).
main(_) ->
compile:file(read_mxm),
{Words, Tracks} = read_mxm:from_file(?DATASET),
{Time, InvIndex} = timer:tc(test_seq, inverted_index, [Tracks]),
io:format("Time on indexing: ~ps~n", [Time * 0.000001]),
case grep(?TARGET, Words, InvIndex) of
{ok, Result} ->
io:format("Number of tracks with that word: ~p~n",
[length(Result)]);
_ -> io:format("word not found~n")
end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment