Created
May 24, 2009 21:21
-
-
Save wilkes/117261 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import Network.Curl | |
--import Network.URI | |
import Data.Char(isAlphaNum, toLower) | |
import Data.List(transpose,sort,foldl') | |
import Data.Maybe(fromMaybe) | |
import Control.Monad --(foldM) | |
import System.Environment(getArgs) | |
import qualified System.Random as Random | |
import qualified Data.Map as Map | |
import qualified Text.HTML.TagSoup as TagSoup | |
import qualified Text.Feed.Import as Feed | |
import qualified Text.Feed.Types as Feed | |
import qualified Text.RSS.Syntax as RSS | |
import qualified Text.Atom.Feed as Atom | |
import qualified Text.Atom.Feed as Atom | |
feedlist :: [String] | |
feedlist = [ "http://news.google.com/?output=rss" | |
, "http://feeds.nytimes.com/nyt/rss/HomePage" | |
, "http://feeds.salon.com/salon/news" | |
, "http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss" | |
, "http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss" | |
, "http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss" | |
, "http://rss.cnn.com/rss/edition.rss" | |
, "http://rss.cnn.com/rss/edition_world.rss" | |
, "http://rss.cnn.com/rss/edition_us.rss" | |
, "http://rss.msnbc.msn.com/id/3032091/device/rss/rss.xml" | |
, "http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml" | |
, "http://rss.msnbc.msn.com/id/3032506/device/rss/rss.xml" | |
] | |
type Title = String | |
type CountMap = Map.Map String Int | |
type CountMatrix = [[Int]] | |
data WordCounts = WordCounts CountMap [CountMap] [Title] | |
emptyWordCounts :: WordCounts | |
emptyWordCounts = WordCounts Map.empty [] [] | |
feedItems :: Feed.Feed -> [Feed.Item] | |
feedItems (Feed.RSSFeed f) = map Feed.RSSItem $ RSS.rssItems $ RSS.rssChannel f | |
feedItems (Feed.AtomFeed f) = map Feed.AtomItem $ Atom.feedEntries f | |
feedItems (Feed.RSS1Feed _) = error "Feed.RSS1Feed" | |
feedItems (Feed.XMLFeed _) = error "Feed.XMLFeed" | |
itemTitle :: Feed.Item -> Title | |
itemTitle (Feed.RSSItem i) = fromMaybe "" $ RSS.rssItemTitle i | |
itemTitle (Feed.AtomItem i) = show $ Atom.entryTitle i | |
itemTitle _ = undefined | |
itemBody :: Feed.Item -> String | |
itemBody (Feed.RSSItem i) = fromMaybe "" $ RSS.rssItemDescription i | |
itemBody (Feed.AtomItem i) = show $ Atom.entryContent i | |
itemBody _ = undefined | |
feed2WordCounts :: WordCounts -> Feed.Feed -> WordCounts | |
feed2WordCounts wc f = foldl' updateWordCounts wc $ map processItem $ feedItems f | |
makeMatrix :: CountMap -> [CountMap] -> (CountMatrix, [String]) | |
makeMatrix allw articlew = (l1, wordvec) | |
where l1 = [[lookupCount w f | w <- wordvec] | f <- articlew] | |
wordvec = [w | (w, c) <- Map.toList allw, filterTest c] | |
filterTest c = c > 3 && (toD c)< (toD $ length articlew) * 0.4 | |
toD :: (Real a) => a -> Double | |
toD = realToFrac | |
processItem :: Feed.Item -> (Title, CountMap) | |
processItem e = (title, counts) | |
where title = itemTitle e | |
desc = TagSoup.innerText $ TagSoup.parseTags $ itemBody e | |
counts = countWords $ title ++ " " ++ desc | |
countWords :: String -> CountMap | |
countWords = count . normalize . words | |
where normalize = map (map toLower . filter isAlphaNum) | |
count = foldr incCount Map.empty | |
incCount w m = Map.insert w (lookupCount w m + 1) m | |
updateWordCounts :: WordCounts -> (Title, CountMap) -> WordCounts | |
updateWordCounts wd@(WordCounts countMap articles titles) (title, article) | |
| (title `elem` titles) = wd | |
| otherwise = WordCounts ws (articles ++ [article]) (titles ++ [title]) | |
where ws = foldl step countMap (Map.keys article) | |
step r k = Map.insert k (lookupCount k r + lookupCount k article) r | |
lookupCount :: String -> CountMap -> Int | |
lookupCount k = Map.findWithDefault 0 k | |
difcost :: [[Float]] -> [[Float]] -> Float | |
difcost a b = sum $ map sum $ zipMatricesWith squareDif a b | |
where squareDif x y = (x-y)^2 | |
(^^*) :: (Num a) => [[a]] -> [[a]] -> [[a]] | |
a ^^* b = [[sum $ zipWith (*) x y | y <- transpose b] | x <- a] | |
zipMatricesWith :: (a -> b -> c) -> [[a]] -> [[b]] -> [[c]] | |
zipMatricesWith = zipWith . zipWith | |
(@*) :: (Num a) => [[a]] -> [[a]] -> [[a]] | |
(@*) = zipMatricesWith (*) | |
(@/) :: (Fractional a) => [[a]] -> [[a]] -> [[a]] | |
(@/) = zipMatricesWith (/) | |
groupsOf :: Int -> [t] -> [[t]] | |
groupsOf _ [] = [] | |
groupsOf n l = (take n l) : groupsOf n (drop n l) | |
factorize :: (Real a) => [[a]] -> Int -> Int -> ([[Float]], [[Float]]) | |
factorize vs pc iter = factorize' w h v iter | |
where | |
v = map (map realToFrac) vs | |
w = groupsOf pc (take (ic*pc) $ rnd 2) | |
h = groupsOf fc (drop (fc*pc) $ rnd 3) | |
(ic,fc) = shape v | |
rnd = rands pc ic fc | |
rands :: Int -> Int -> Int -> Int -> [Float] | |
rands pc ic fc seed = take (ic*pc + fc*pc) $ Random.randoms $ Random.mkStdGen seed | |
shape :: [[a]] -> (Int, Int) | |
shape m = (length m, length $ head m) | |
factorize' :: [[Float]] -> [[Float]] -> [[Float]] -> Int -> ([[Float]], [[Float]]) | |
factorize' w h _ 0 = (w,h) | |
factorize' w h v iter | |
| difcost v wh == 0 = (w, h) | |
| otherwise = factorize' w' h' v (iter-1) | |
where | |
wh = w ^^* h | |
wt = transpose w | |
hn = wt ^^* v | |
hd = (wt ^^* w) ^^* h | |
h' = (h @* hn) @/ hd | |
ht = transpose h | |
wn = v ^^* ht | |
wd = (w ^^* h) ^^* ht | |
w' = (w @* wn) @/ wd | |
readFeed :: String -> IO Feed.Feed | |
readFeed url = do src <- callRemote url | |
case Feed.parseFeedString src of | |
Nothing -> fail $ "Unable to parse feed: " ++ url | |
Just feed -> return feed | |
callRemote :: String -> IO String | |
callRemote url = do response <- getResponse | |
case (respCurlCode response) of | |
CurlOK -> return $ respBody response | |
_ -> fail $ msg response | |
where getResponse :: IO (CurlResponse_ [(String, String)] String) | |
getResponse = curlGetResponse_ url [] | |
msg r = url ++ "\n" ++ (show $ respStatus r) ++ respStatusLine r | |
buildWordCounts :: IO ([Title], [String], [[Int]]) | |
buildWordCounts = do | |
(WordCounts allw articlew titles) <- foldM procFeed emptyWordCounts feedlist | |
return $ result titles allw articlew | |
where | |
result t a w = (t, ws, matrix) | |
where (matrix, ws) = makeMatrix a w | |
procFeed wc f = | |
putStrLn f >> readFeed f >>= return . feed2WordCounts wc | |
groupFeatures :: [[Float]] -> [[Float]] -> [String] -> [String] -> | |
[([(Float, String)], [(Float, String)])] | |
groupFeatures weights features titles wordvec = | |
zipWith (\f ws -> (topWords f, topTitles ws)) features weights | |
where | |
topWords feature = take 6 $ reverse $ sort $ zip feature wordvec | |
topTitles ws = reverse $ sort $ zip ws titles | |
showResults :: ([(Float, String)], [(Float, String)]) -> IO () | |
showResults (wds,titles) = putStrLn "" >> | |
putStrLn (show wds) >> | |
mapM_ (putStrLn . show) titles >> | |
putStrLn "" | |
main :: IO () | |
main = | |
getArgs >>= \args -> | |
case args of | |
[pc, iter] -> | |
buildWordCounts >>= \(titles, wordvec, matrix) -> | |
let (weights,features) = factorize matrix (read pc) (read iter) in | |
mapM_ showResults (groupFeatures weights features titles wordvec) | |
_ -> | |
putStrLn "Usage: NewsFeatures FEATURE_COUNT #OF_ITERATIONS" | |
m1 = [[1,2,3],[4,5,6]] | |
m2 = [[1,2],[3,4],[5,6]] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment