Last active
October 18, 2024 12:57
-
-
Save mihassan/1b8b450b79b66c43782c59f5f8ff07fd to your computer and use it in GitHub Desktop.
A Haskell scraper that collects every Scrabble word listed on scrabblewordfinder.org.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env cabal | |
{- cabal: | |
build-depends: base, scalpel | |
-} | |
{-# LANGUAGE OverloadedStrings #-} | |
import Control.Monad | |
import Data.Char | |
import Data.List | |
import Data.Maybe | |
import System.IO | |
import Text.HTML.Scalpel | |
-- | Collect the links found on the site's word-list index page.
--
-- Every @href@ attribute of every @a@ element on the index page is scraped.
-- Hrefs of 15 characters or fewer are dropped and the remaining ones are
-- passed through 'fixUrl'.
--
-- Throws an 'IOError' naming the index URL when 'scrapeURL' yields 'Nothing',
-- instead of the context-free crash that 'fromJust' would produce.
loadUrls :: IO [String]
loadUrls = do
  let url = "https://scrabblewordfinder.org/word-list"
  scraped <- scrapeURL url (attrs "href" "a")
  hrefs <-
    maybe
      (ioError (userError ("Failed to scrape links from " ++ url)))
      pure
      scraped
  -- Length threshold kept as-is; presumably it weeds out short navigation
  -- links such as "/" -- TODO confirm against the live page.
  pure $ fixUrl <$> filter ((> 15) . length) hrefs
-- | Turn a site-relative link into an absolute one.
--
-- A link starting with @/@ gets the site origin prepended; any other link
-- (including the empty string) is returned unchanged.
fixUrl :: String -> String
fixUrl url = case url of
  '/' : _ -> "https://scrabblewordfinder.org" ++ url
  _ -> url
-- | Scrape the words listed on a single page.
--
-- Reads the text of every @a@ element carrying the @wordWrapper@ class on the
-- page at @url@ and normalises each entry with 'fixWord'.
--
-- Throws an 'IOError' naming the page when 'scrapeURL' yields 'Nothing',
-- instead of the context-free crash that 'fromJust' would produce.
loadWords :: String -> IO [String]
loadWords url = do
  scraped <- scrapeURL url (texts ("a" @: [hasClass "wordWrapper"]))
  -- Named 'entries' rather than 'words' so Prelude's 'words' is not shadowed.
  entries <-
    maybe
      (ioError (userError ("Failed to scrape words from " ++ url)))
      pure
      scraped
  pure $ fixWord <$> entries
-- | Extract the first run of letters from a scraped entry, upper-cased.
--
-- Leading non-letter characters are skipped, then letters are taken up to
-- (not including) the next non-letter. An entry with no letters yields @""@.
fixWord :: String -> String
fixWord raw = [toUpper c | c <- firstRun]
  where
    -- Everything from the first letter onwards.
    fromFirstLetter = snd (break isAlpha raw)
    -- The unbroken run of letters at the front of that remainder.
    firstRun = fst (span isAlpha fromFirstLetter)
-- | Flatten several word lists into one sorted list without duplicates.
--
-- The lists are concatenated and sorted; equal neighbours are then grouped
-- and one representative of each group is kept.
mergeWorLists :: [[String]] -> [String]
mergeWorLists wordLists =
  [representative | (representative : _) <- group (sort (concat wordLists))]
-- | Entry point: scrape every word-list page and print the merged word list.
--
-- Progress and diagnostics go to stderr. The final list goes to stdout, one
-- word per line, so the output can be redirected to a file cleanly.
main :: IO ()
main = do
  urls <- loadUrls
  wordLists <- forM urls $ \url -> do
    hPutStrLn stderr $ "Scraping " ++ url ++ " ..."
    -- Named 'found' rather than 'words' so Prelude's 'words' is not shadowed.
    found <- loadWords url
    hPutStrLn stderr $ "Found " ++ show (length found) ++ " words."
    -- Show a small sample so a broken selector is easy to spot.
    hPutStrLn stderr $ unwords $ take 10 found
    hPutStrLn stderr "\n\n\n"
    return found
  hPutStrLn stderr "Saving words ..."
  putStr $ unlines $ mergeWorLists wordLists
  hPutStrLn stderr "Done."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment