Skip to content

Instantly share code, notes, and snippets.

@mihassan
Last active October 18, 2024 12:57
Show Gist options
  • Save mihassan/1b8b450b79b66c43782c59f5f8ff07fd to your computer and use it in GitHub Desktop.
Save mihassan/1b8b450b79b66c43782c59f5f8ff07fd to your computer and use it in GitHub Desktop.
A Haskell scraper that collects all Scrabble words from scrabblewordfinder.org.
#!/usr/bin/env cabal
{- cabal:
build-depends: base, scalpel
-}
{-# LANGUAGE OverloadedStrings #-}
import Control.Monad
import Data.Char
import Data.List
import Data.Maybe
import System.IO
import Text.HTML.Scalpel
-- | Collect the URLs of the word-list pages linked from the site's index page.
--
-- Every @href@ attribute of an @a@ element on the index page is scraped, only
-- links longer than 15 characters are kept, and each remaining link is passed
-- through 'fixUrl'.
--
-- Fails with an 'IOError' naming the index URL when the scraper yields no
-- result, instead of crashing inside 'fromJust' with an uninformative message.
loadUrls :: IO [String]
loadUrls = do
  let url = "https://scrabblewordfinder.org/word-list"
  scraped <- scrapeURL url (attrs "href" "a")
  hrefs <- maybe (ioError (userError ("Failed to scrape links from " ++ url))) pure scraped
  -- NOTE(review): the length threshold presumably drops short navigation
  -- links that are not word-list pages -- confirm against the live page.
  pure $ fixUrl <$> filter ((> 15) . length) hrefs
-- | Turn a site-relative link into an absolute one.
--
-- A link that starts with @/@ gets the site origin prepended; any other link
-- (including the empty string) is returned unchanged.
fixUrl :: String -> String
fixUrl url@('/' : _) = "https://scrabblewordfinder.org" ++ url
fixUrl url = url
-- | Scrape the words listed on a single word-list page.
--
-- The text of every @a@ element carrying the @wordWrapper@ class is extracted
-- and normalised with 'fixWord'. Entries that normalise to the empty string
-- (anchor text containing no letters) are dropped so they cannot show up as
-- blank lines in the final word list.
--
-- Fails with an 'IOError' naming the page URL when the scraper yields no
-- result, instead of crashing inside 'fromJust' with an uninformative message.
loadWords :: String -> IO [String]
loadWords url = do
  scraped <- scrapeURL url (texts ("a" @: [hasClass "wordWrapper"]))
  entries <- maybe (ioError (userError ("Failed to scrape words from " ++ url))) pure scraped
  pure $ filter (not . null) (fixWord <$> entries)
-- | Extract the first run of letters from a piece of text, upper-cased.
--
-- Leading non-letter characters are skipped, the following consecutive
-- letters are kept, and everything from the next non-letter on is discarded.
-- Text without any letter yields the empty string.
fixWord :: String -> String
fixWord raw = map toUpper letters
  where
    (_, fromFirstLetter) = break isAlpha raw
    (letters, _) = span isAlpha fromFirstLetter
-- | Merge several word lists into one sorted list without duplicates.
--
-- All lists are concatenated and sorted, then each run of equal words is
-- collapsed to a single entry. The representative of each run is taken by
-- pattern matching rather than with the partial 'head', so the function is
-- total by construction.
mergeWorLists :: [[String]] -> [String]
mergeWorLists wordLists = [w | (w : _) <- group (sort (concat wordLists))]
-- | Scrape every word-list page and print the merged word list.
--
-- Progress and diagnostics (the page being scraped, the number of words found
-- and a sample of up to ten of them) go to standard error; the merged,
-- de-duplicated word list is written to standard output, one word per line.
main :: IO ()
main = do
  urls <- loadUrls
  wordLists <- forM urls $ \url -> do
    hPutStrLn stderr $ "Scraping " ++ url ++ " ..."
    -- Named pageWords so the binding does not shadow Prelude's 'words'.
    pageWords <- loadWords url
    hPutStrLn stderr $ "Found " ++ show (length pageWords) ++ " words."
    hPutStrLn stderr $ unwords $ take 10 pageWords
    hPutStrLn stderr "\n\n\n"
    return pageWords
  hPutStrLn stderr "Saving words ..."
  putStr $ unlines $ mergeWorLists wordLists
  hPutStrLn stderr "Done."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment