Created
November 19, 2019 15:50
-
-
Save fakedrake/c97008c15cd110281e94388a3dca0b61 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
import Control.Applicative | |
import Text.HTML.Scalpel | |
import Data.Char | |
import Data.List | |
import Data.Ord | |
wikiUrl = "https://en.wikipedia.org/wiki/Haskell_(programming_language)?action=render" | |
paragraphs :: IO (Maybe [String]) | |
paragraphs = scrapeURL wikiUrl $ chroots anySelector $ text "p" <|> text "h3" | |
-- Remember that haskell is a lazy language and infinite structures | |
-- are common. `take (length x - 1) x` will die for `x = [1..]` as it | |
-- will get stuck calculating `length [1..]`. | |
shortenBy1 :: String -> String | |
shortenBy1 [] = [] | |
shortenBy1 [_] = [] | |
shortenBy1 (x:xs) = x:shortenBy1 xs | |
removePunc :: String -> String | |
removePunc = filter $ not . isPunctuation | |
-- A style that isn't better or worse (called point free) is to | |
-- compose functions rather than appy functions to the arguments. The | |
-- two are equivalent (and often only the latter is possible, eg if | |
-- the arguments are used in more than one place) but point-free | |
-- trains you in a certain way of thinking that haskell is designed | |
-- for. Namely that functions are not special in any way to the | |
-- language. Also at least sometimes you avoid having to things. | |
-- | |
-- Finally you can make your intent clearer by making your types more | |
-- general. | |
countWords :: Ord a => [a] -> [(a,Int)] | |
countWords = map (\w -> (head w, length w)) . group . sort | |
histogram :: Ord a => [a] -> [(a,Int)] | |
histogram = sortBy (comparing snd) . countWords | |
tokenize :: String -> [String] | |
tokenize = filter (not . null) . words . map toLower . removePunc | |
main = do | |
paragsM <- paragraphs | |
case paragsM of | |
Just parags -> do | |
let wordHisto = histogram $ tokenize =<< parags | |
print $ head wordHisto | |
Nothing -> fail "Couldn't parse site.." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment