Last active
November 24, 2019 11:06
-
-
Save thalesmg/1973ab1f2f88898093fe94dea4f3582e to your computer and use it in GitHub Desktop.
khinsider downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env stack | |
{- | |
stack --resolver lts-14.15 script | |
--package bytestring | |
--package tagsoup | |
--package wreq | |
--package lens | |
--package async | |
--package optparse-applicative | |
--package directory | |
--package transformers | |
--package mtl | |
--package split | |
-} | |
{-# LANGUAGE NamedFieldPuns #-} | |
{-# LANGUAGE OverloadedStrings #-} | |
import Control.Applicative (optional) | |
import Control.Concurrent.Async (Concurrently (..)) | |
import Control.Concurrent.QSem (QSem, newQSem, signalQSem, | |
waitQSem) | |
import Control.Exception (bracket_) | |
import Control.Lens ((&), (^.)) | |
import Control.Monad (unless) | |
import Control.Monad.Trans (liftIO) | |
import Control.Monad.Trans.Except (ExceptT (..), runExceptT) | |
import Data.ByteString.Lazy (ByteString) | |
import qualified Data.ByteString.Lazy as BSL | |
import Data.ByteString.Lazy.Char8 (unpack) | |
import qualified Data.ByteString.Lazy.Char8 as C8 | |
import Data.Char (isAlphaNum, isSpace) | |
import Data.List (isPrefixOf) | |
import Data.List.Split (splitOn) | |
import Network.Wreq (Response, defaults, get, | |
responseBody, responseStatus, | |
statusCode) | |
import Options.Applicative (Parser, argument, auto, execParser, | |
fullDesc, help, helper, info, long, | |
metavar, option, progDesc, short, | |
str, strOption, value, (<**>)) | |
import System.Directory (createDirectoryIfMissing, | |
doesDirectoryExist, getPermissions, | |
writable) | |
import System.Exit (exitFailure) | |
import Text.HTML.TagSoup (Tag (..), parseTags, (~/=), (~==)) | |
newtype URL = URL { getURL :: String } deriving (Eq, Show) | |
data Config = Config | |
{ outputDir :: Maybe FilePath | |
, songlistURL :: URL | |
, asyncJobs :: Int | |
} deriving Show | |
baseURL :: String | |
baseURL = "https://downloads.khinsider.com" | |
every :: Int -> [a] -> [a] | |
every _ [] = [] | |
every n (x:xs) = x : every n (drop (n - 1) xs) | |
getOK :: URL -> IO (Either String ByteString) | |
getOK (URL url) = do | |
response <- get url | |
if response ^. responseStatus . statusCode == 200 | |
then pure . Right $ response ^. responseBody | |
else pure . Left $ "failed to fetch " <> url | |
collectLinks :: [Tag ByteString] -> [(ByteString, URL)] | |
collectLinks ts' = go ts' [] | |
where | |
go [] acc = reverse acc | |
go (TagOpen "a" attrs : TagText txt : TagClose "a" : ts) acc | |
| Just href <- lookup "href" attrs = go ts ((txt, URL $ baseURL <> unpack href):acc) | |
| otherwise = go ts acc | |
go (t:ts) acc = go ts acc | |
getSongURLs :: URL -> IO (Either String [(ByteString, URL)]) | |
getSongURLs url = do | |
eresponse <- getOK url | |
pure $ eresponse >>= \body -> do | |
let soup = parseTags body | |
-- name, length in min, size in mb | |
songlist = every 3 | |
. collectLinks | |
. takeWhile (~/= (TagClose "table" :: Tag ByteString)) | |
. dropWhile (~/= (TagOpen "table" [("id", "songlist")] :: Tag ByteString)) | |
$ soup | |
Right songlist | |
getDownloadSongURL :: URL -> IO (Either String URL) | |
getDownloadSongURL url = do | |
eresponse <- getOK url | |
pure $ eresponse >>= \body -> do | |
let soup = parseTags body | |
case filter (~== (TagOpen "audio" [] :: Tag ByteString)) soup of | |
(TagOpen "audio" attrs : _) | |
| Just src <- lookup "src" attrs -> Right (URL . unpack $ src) | |
_ -> Left "could not find audio tag with src" | |
sanitizeFilename :: ByteString -> ByteString | |
sanitizeFilename = C8.map clean | |
where | |
clean c = if isAlphaNum c || isSpace c | |
then c | |
else '_' | |
downloadMP3 :: FilePath -> (ByteString, URL) -> IO (Either String ()) | |
downloadMP3 dest (name, url) = do | |
let filename = dest <> "/" <> unpack (sanitizeFilename name) <> ".mp3" | |
eresponse <- getOK url | |
traverse (BSL.writeFile filename) eresponse | |
withConc :: QSem -> (a -> IO b) -> a -> Concurrently b | |
withConc sem f a = | |
Concurrently $ bracket_ (waitQSem sem) (signalQSem sem) (f a) | |
configParser :: Parser Config | |
configParser | |
= Config | |
<$> optional (strOption | |
( long "output" | |
<> short 'o' | |
<> metavar "DIR" | |
<> help "output directory path" | |
)) | |
<*> ( URL <$> | |
( argument str | |
( metavar "URL" | |
<> help "the url with songlist" | |
) | |
) | |
) | |
<*> option auto | |
( long "jobs" | |
<> short 'j' | |
<> help "number of parallel jobs" | |
<> value 4 | |
) | |
die :: String -> IO a | |
die msg = putStrLn msg >> exitFailure | |
andM :: [IO Bool] -> IO Bool | |
andM [] = pure True | |
andM (t:ts) = do | |
b <- t | |
if b then andM ts else pure False | |
postValidateConfig :: Config -> IO () | |
postValidateConfig Config{songlistURL, outputDir, asyncJobs} = do | |
let URL url = songlistURL | |
unless (baseURL `isPrefixOf` url) $ die "url should be from khinsider" | |
dirOk <- | |
case outputDir of | |
Nothing -> pure True | |
Just dir -> | |
andM [ doesDirectoryExist dir | |
, pure . writable =<< getPermissions dir | |
] | |
unless dirOk $ die "output dir should be writable" | |
unless (asyncJobs > 0) $ die "async jobs must be positive" | |
prepareDownloadDir :: Config -> IO FilePath | |
prepareDownloadDir Config{songlistURL = URL url, outputDir = Nothing} = do | |
let albumSlug = last . splitOn "/" $ url | |
newDir = "./" <> albumSlug | |
createDirectoryIfMissing False newDir | |
pure newDir | |
prepareDownloadDir Config{outputDir = Just dir} = pure dir | |
main :: IO () | |
main = do | |
cfg@Config{songlistURL, asyncJobs} <- execParser opts | |
postValidateConfig cfg | |
outputDir <- prepareDownloadDir cfg | |
putStrLn $ "[*] output dir: " <> outputDir | |
sem <- newQSem asyncJobs | |
efilesToDownload <- runExceptT $ do | |
liftIO $ putStrLn "[*] getting songlist" | |
songURLs <- ExceptT $ getSongURLs songlistURL | |
liftIO $ putStrLn "[*] getting mp3 URLs" | |
mp3URLs <- ExceptT $ fmap sequence (runConcurrently $ traverse (withConc sem getDownloadSongURL) (map snd songURLs)) | |
pure $ zipWith (\(name, _) mp3URL -> (name, mp3URL)) songURLs mp3URLs | |
case efilesToDownload of | |
Left msg -> die msg | |
Right urls -> do | |
putStrLn $ "[*] downloading " <> show (length urls) <> " files..." | |
runConcurrently $ traverse (withConc sem (downloadMP3 outputDir)) urls | |
putStrLn "[*] done!" | |
where | |
opts = info (configParser <**> helper) | |
( fullDesc | |
<> progDesc "downloads an entire album from KHInsider" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment