Last active
April 11, 2018 10:15
-
-
Save silverweed/a1da3e88823a64316083 to your computer and use it in GitHub Desktop.
maudmap-hs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
-- Emit sitemap for maud (requires access to maud db) | |
import Database.MongoDB | |
import System.Environment | |
import Data.Time.Clock.POSIX | |
import qualified Data.Time.Format as DTF | |
import Data.List (intercalate) | |
import Data.List.Split | |
domain :: String | |
domain = "https://crunchy.rocks" | |
dbName :: Database | |
dbName = "maud" | |
parseArgs :: [String] -> IO String | |
parseArgs [] = return "127.0.0.1" | |
parseArgs (x:_) = return x | |
emitUrl :: Document -> [String] | |
emitUrl [] = [] | |
emitUrl (f:fs) | l == "shorturl" = (wrap "loc" $ domain ++ "/thread/" ++ v) : emitUrl fs | |
| l == "lrdate" || l == "lastupdate" = (wrap "lastmod" (todate $ read $ show $ value f)) : emitUrl fs | |
| l == "name" = (wrap "loc" $ domain ++ "/tag/" ++ (escape v)) : emitUrl fs | |
| otherwise = []:emitUrl fs | |
where | |
l = label f | |
v = init $ tail $ show $ value f | |
todate :: Int -> String | |
todate = DTF.formatTime DTF.defaultTimeLocale "%Y-%m-%dT%H:%M:%S" . posixSecondsToUTCTime . realToFrac | |
escape :: String -> String | |
escape [] = [] | |
escape (c:cs) | c == '/' = "/" ++ escape cs | |
| otherwise = c : escape cs | |
emitXML :: Pipe -> Collection -> IO () | |
emitXML pipe coll = do | |
let run = access pipe master dbName | |
docs <- run $ find (select [] coll) { | |
project = [ "shorturl" =: True | |
, "lrdate" =: True | |
, "lastupdate" =: True | |
, "name" =: True | |
, "_id" =: False | |
]} >>= rest | |
putStrLn $ intercalate "\n" $ map (wrapUrl coll) $ map emitUrl docs | |
wrapUrl :: Collection -> [String] -> String | |
wrapUrl _ [] = [] | |
wrapUrl coll lst = i 2 $ wrap "url" $ "\n" ++ intercalate "\n" | |
(map (i 1) (lst | |
++ [wrap "changefreq" (cf coll)] | |
++ [wrap "priority" (prio coll)] | |
)) ++ "\n" | |
where | |
prio "home" = "1.0" | |
prio "stiki" = "0.8" | |
prio "new" = "0.75" | |
prio "threads" = "0.6" | |
prio "tags" = "0.5" | |
prio _ = "0" | |
cf "threads" = "daily" | |
cf "tags" = "weekly" | |
cf "stiki" = "monthly" | |
cf "new" = "yearly" | |
cf "home" = "yearly" | |
cf _ = "never" | |
emitHeader :: IO () | |
emitHeader = do putStrLn "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" | |
putStrLn "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">" | |
emitFooter :: IO () | |
emitFooter = putStrLn "</urlset>" | |
emitStatic :: String -> [String] -> IO () | |
emitStatic "home" _ = putStrLn $ wrapUrl "home" [ wrap "loc" (domain ++ "/") | |
, wrap "lastmod" "2015-01-01T00:00:00" ] | |
emitStatic "new" _ = putStrLn $ wrapUrl "new" [ wrap "loc" (domain ++ "/new") | |
, wrap "lastmod" "2015-01-01T00:00:00" ] | |
emitStatic "stiki" [] = return () | |
emitStatic "stiki" (page:pages) = do putStrLn $ wrapUrl "stiki" [ wrap "loc" $ domain ++ "/stiki/" ++ page | |
, wrap "lastmod" "2016-12-05T18:14:00" | |
] | |
emitStatic "stiki" pages | |
emitStatic _ _ = return () | |
i :: Int -> String -> String | |
i n s = intercalate "\n" [[' ' | _ <- [1..n*4]] ++ line | line <- splitOn "\n" s] | |
wrap :: String -> String -> String | |
wrap tag string = "<" ++ tag ++ ">" ++ string ++ "</" ++ tag ++ ">" | |
main :: IO () | |
main = do | |
pipe <- getArgs >>= parseArgs | |
>>= \dbaddr -> connect $ host dbaddr | |
emitHeader | |
emitStatic "home" [] | |
emitStatic "new" [] | |
emitStatic "stiki" [ "cookie-policy", "formatting", "nsfw-policy", "dmca", "video-tag" ] | |
emitXML pipe "threads" | |
emitXML pipe "tags" | |
emitFooter |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment