Skip to content

Instantly share code, notes, and snippets.

@wilkes
Created May 24, 2009 21:21
Show Gist options
  • Save wilkes/117261 to your computer and use it in GitHub Desktop.
Save wilkes/117261 to your computer and use it in GitHub Desktop.
module Main where
import Network.Curl
--import Network.URI
import Data.Char(isAlphaNum, toLower)
import Data.List(transpose,sort,foldl')
import Data.Maybe(fromMaybe)
import Control.Monad --(foldM)
import System.Environment(getArgs)
import qualified System.Random as Random
import qualified Data.Map as Map
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.Feed.Import as Feed
import qualified Text.Feed.Types as Feed
import qualified Text.RSS.Syntax as RSS
import qualified Text.Atom.Feed as Atom
import qualified Text.Atom.Feed as Atom
feedlist :: [String]
feedlist = [ "http://news.google.com/?output=rss"
, "http://feeds.nytimes.com/nyt/rss/HomePage"
, "http://feeds.salon.com/salon/news"
, "http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss"
, "http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss"
, "http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss"
, "http://rss.cnn.com/rss/edition.rss"
, "http://rss.cnn.com/rss/edition_world.rss"
, "http://rss.cnn.com/rss/edition_us.rss"
, "http://rss.msnbc.msn.com/id/3032091/device/rss/rss.xml"
, "http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml"
, "http://rss.msnbc.msn.com/id/3032506/device/rss/rss.xml"
]
type Title = String
type CountMap = Map.Map String Int
type CountMatrix = [[Int]]
data WordCounts = WordCounts CountMap [CountMap] [Title]
emptyWordCounts :: WordCounts
emptyWordCounts = WordCounts Map.empty [] []
feedItems :: Feed.Feed -> [Feed.Item]
feedItems (Feed.RSSFeed f) = map Feed.RSSItem $ RSS.rssItems $ RSS.rssChannel f
feedItems (Feed.AtomFeed f) = map Feed.AtomItem $ Atom.feedEntries f
feedItems (Feed.RSS1Feed _) = error "Feed.RSS1Feed"
feedItems (Feed.XMLFeed _) = error "Feed.XMLFeed"
itemTitle :: Feed.Item -> Title
itemTitle (Feed.RSSItem i) = fromMaybe "" $ RSS.rssItemTitle i
itemTitle (Feed.AtomItem i) = show $ Atom.entryTitle i
itemTitle _ = undefined
itemBody :: Feed.Item -> String
itemBody (Feed.RSSItem i) = fromMaybe "" $ RSS.rssItemDescription i
itemBody (Feed.AtomItem i) = show $ Atom.entryContent i
itemBody _ = undefined
feed2WordCounts :: WordCounts -> Feed.Feed -> WordCounts
feed2WordCounts wc f = foldl' updateWordCounts wc $ map processItem $ feedItems f
makeMatrix :: CountMap -> [CountMap] -> (CountMatrix, [String])
makeMatrix allw articlew = (l1, wordvec)
where l1 = [[lookupCount w f | w <- wordvec] | f <- articlew]
wordvec = [w | (w, c) <- Map.toList allw, filterTest c]
filterTest c = c > 3 && (toD c)< (toD $ length articlew) * 0.4
toD :: (Real a) => a -> Double
toD = realToFrac
processItem :: Feed.Item -> (Title, CountMap)
processItem e = (title, counts)
where title = itemTitle e
desc = TagSoup.innerText $ TagSoup.parseTags $ itemBody e
counts = countWords $ title ++ " " ++ desc
countWords :: String -> CountMap
countWords = count . normalize . words
where normalize = map (map toLower . filter isAlphaNum)
count = foldr incCount Map.empty
incCount w m = Map.insert w (lookupCount w m + 1) m
updateWordCounts :: WordCounts -> (Title, CountMap) -> WordCounts
updateWordCounts wd@(WordCounts countMap articles titles) (title, article)
| (title `elem` titles) = wd
| otherwise = WordCounts ws (articles ++ [article]) (titles ++ [title])
where ws = foldl step countMap (Map.keys article)
step r k = Map.insert k (lookupCount k r + lookupCount k article) r
lookupCount :: String -> CountMap -> Int
lookupCount k = Map.findWithDefault 0 k
difcost :: [[Float]] -> [[Float]] -> Float
difcost a b = sum $ map sum $ zipMatricesWith squareDif a b
where squareDif x y = (x-y)^2
(^^*) :: (Num a) => [[a]] -> [[a]] -> [[a]]
a ^^* b = [[sum $ zipWith (*) x y | y <- transpose b] | x <- a]
zipMatricesWith :: (a -> b -> c) -> [[a]] -> [[b]] -> [[c]]
zipMatricesWith = zipWith . zipWith
(@*) :: (Num a) => [[a]] -> [[a]] -> [[a]]
(@*) = zipMatricesWith (*)
(@/) :: (Fractional a) => [[a]] -> [[a]] -> [[a]]
(@/) = zipMatricesWith (/)
groupsOf :: Int -> [t] -> [[t]]
groupsOf _ [] = []
groupsOf n l = (take n l) : groupsOf n (drop n l)
factorize :: (Real a) => [[a]] -> Int -> Int -> ([[Float]], [[Float]])
factorize vs pc iter = factorize' w h v iter
where
v = map (map realToFrac) vs
w = groupsOf pc (take (ic*pc) $ rnd 2)
h = groupsOf fc (drop (fc*pc) $ rnd 3)
(ic,fc) = shape v
rnd = rands pc ic fc
rands :: Int -> Int -> Int -> Int -> [Float]
rands pc ic fc seed = take (ic*pc + fc*pc) $ Random.randoms $ Random.mkStdGen seed
shape :: [[a]] -> (Int, Int)
shape m = (length m, length $ head m)
factorize' :: [[Float]] -> [[Float]] -> [[Float]] -> Int -> ([[Float]], [[Float]])
factorize' w h _ 0 = (w,h)
factorize' w h v iter
| difcost v wh == 0 = (w, h)
| otherwise = factorize' w' h' v (iter-1)
where
wh = w ^^* h
wt = transpose w
hn = wt ^^* v
hd = (wt ^^* w) ^^* h
h' = (h @* hn) @/ hd
ht = transpose h
wn = v ^^* ht
wd = (w ^^* h) ^^* ht
w' = (w @* wn) @/ wd
readFeed :: String -> IO Feed.Feed
readFeed url = do src <- callRemote url
case Feed.parseFeedString src of
Nothing -> fail $ "Unable to parse feed: " ++ url
Just feed -> return feed
callRemote :: String -> IO String
callRemote url = do response <- getResponse
case (respCurlCode response) of
CurlOK -> return $ respBody response
_ -> fail $ msg response
where getResponse :: IO (CurlResponse_ [(String, String)] String)
getResponse = curlGetResponse_ url []
msg r = url ++ "\n" ++ (show $ respStatus r) ++ respStatusLine r
buildWordCounts :: IO ([Title], [String], [[Int]])
buildWordCounts = do
(WordCounts allw articlew titles) <- foldM procFeed emptyWordCounts feedlist
return $ result titles allw articlew
where
result t a w = (t, ws, matrix)
where (matrix, ws) = makeMatrix a w
procFeed wc f =
putStrLn f >> readFeed f >>= return . feed2WordCounts wc
groupFeatures :: [[Float]] -> [[Float]] -> [String] -> [String] ->
[([(Float, String)], [(Float, String)])]
groupFeatures weights features titles wordvec =
zipWith (\f ws -> (topWords f, topTitles ws)) features weights
where
topWords feature = take 6 $ reverse $ sort $ zip feature wordvec
topTitles ws = reverse $ sort $ zip ws titles
showResults :: ([(Float, String)], [(Float, String)]) -> IO ()
showResults (wds,titles) = putStrLn "" >>
putStrLn (show wds) >>
mapM_ (putStrLn . show) titles >>
putStrLn ""
main :: IO ()
main =
getArgs >>= \args ->
case args of
[pc, iter] ->
buildWordCounts >>= \(titles, wordvec, matrix) ->
let (weights,features) = factorize matrix (read pc) (read iter) in
mapM_ showResults (groupFeatures weights features titles wordvec)
_ ->
putStrLn "Usage: NewsFeatures FEATURE_COUNT #OF_ITERATIONS"
m1 = [[1,2,3],[4,5,6]]
m2 = [[1,2],[3,4],[5,6]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment