Created
November 19, 2019 16:00
-
-
Save fakedrake/3c3fa5f8b8f7881ac76836b6523fd41a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
import Control.Applicative | |
import Text.HTML.Scalpel | |
import Data.Char | |
import Data.Hashable | |
import qualified Data.HashMap.Strict as HM | |
import Data.List | |
import Data.Ord | |
wikiUrl = "https://en.wikipedia.org/wiki/Haskell_(programming_language)?action=render" | |
paragraphs :: IO (Maybe [String]) | |
paragraphs = scrapeURL wikiUrl $ chroots anySelector $ text "p" <|> text "h3" | |
-- Remember that haskell is a lazy language and infinite structures | |
-- are common. `take (length x - 1) x` will die for `x = [1..]` as it | |
-- will get stuck calculating `length [1..]`. | |
shortenBy1 :: String -> String | |
shortenBy1 [] = [] | |
shortenBy1 [_] = [] | |
shortenBy1 (x:xs) = x:shortenBy1 xs | |
removePunc :: String -> String | |
removePunc = filter $ not . isPunctuation | |
-- A style that isn't better or worse (called point free) is to | |
-- compose functions rather than appy functions to the arguments. The | |
-- two are equivalent (and often only the latter is possible, eg if | |
-- the arguments are used in more than one place) but point-free | |
-- trains you in a certain way of thinking that haskell is designed | |
-- for. Namely that functions are not special in any way to the | |
-- language. Also at least sometimes you avoid having to things. | |
-- | |
-- Finally you can make your intent clearer by making your types more | |
-- general. | |
countWords :: Ord a => [a] -> [(a,Int)] | |
countWords = map (\w -> (head w, length w)) . group . sort | |
countWordsHM :: (Eq a, Hashable a) => [a] -> [(a,Int)] | |
countWordsHM = | |
HM.toList | |
. foldl (\hist v -> HM.alter (Just . maybe 1 (+1)) v hist) mempty | |
histogram :: (Eq a, Hashable a, Ord a) => [a] -> [(a,Int)] | |
histogram = sortBy (comparing snd) . countWordsHM | |
tokenize :: String -> [String] | |
tokenize = filter (not . null) . words . map toLower . removePunc | |
main = do | |
paragsM <- paragraphs | |
case paragsM of | |
Just parags -> do | |
let wordHisto = histogram $ tokenize =<< parags | |
print $ head wordHisto | |
Nothing -> fail "Couldn't parse site.." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment