Skip to content

Instantly share code, notes, and snippets.

@SaitoAtsushi
Created May 17, 2013 13:28
Show Gist options
  • Save SaitoAtsushi/5599033 to your computer and use it in GitHub Desktop.
Save SaitoAtsushi/5599033 to your computer and use it in GitHub Desktop.
-- -*- mode: haskell; coding: utf-8 -*-
-- Author: SAITO Atsushi
import Network.HTTP
import Data.Maybe
import Text.Regex
import Control.Monad
import Data.Sequence (unfoldr)
import Data.Foldable (toList)
import Data.Word
import Text.Regex.Posix.ByteString
import Codec.Binary.UTF8.String
import System.Locale
import Data.Time.Clock
import Data.Time.Format
import System.Environment
import Data.ByteString.Lazy.Char8 as BL (pack, hPutStr, fromChunks, unpack)
import Data.ByteString.Lazy as BS (pack, unpack)
import Data.ByteString (breakSubstring)
import Codec.Archive.Zip
import System.IO
import Control.Monad.State
data Item = Page { pageNum, subTitle, containts :: String }
| Headline String
data Novel = Novel { ncode,
title,
explanation,
author :: String,
items :: [Item] }
deriving (Show)
formattedCurrentTime =
do t <- getCurrentTime
return $ formatTime defaultTimeLocale "%FT%XZ" t
ncx novel =
"<?xml version='1.0' encoding='utf-8' ?>\n\
\<!DOCTYPE ncx PUBLIC '-//NISO//DTD ncx 2005-1//EN'\
\ 'http://www.daisy.org/z3986/2005/ncx-2005-1.dtd'>\n\
\<ncx xmlns='http://www.daisy.org/z3986/2005/ncx/' xml:lang='en'\
\ version='2005-1'>\n\
\<head>\n\
\<meta name='dtb:uid' content='" ++ ncode novel ++ "' />\n\
\<meta name='dtb:depth' content='1' />\n\
\<meta name='dtb:totalPageCount' content='0' />\n\
\<meta name='dtb:maxPageNumber' content='0' />\n\
\</head>\n\
\<docTitle>\n\
\<text>"++ title novel ++"</text>\n\
\</docTitle>\n\
\<navMap>\n"++novelToNavPoint novel++"</navMap>\n</ncx>"
opf novel =
do fct <- formattedCurrentTime
return $
"<?xml version='1.0' encoding='utf-8' ?>\n\
\<package xmlns='http://www.idpf.org/2007/opf'\
\ unique-identifier='BookId' version='3.0'>\
\<metadata>\
\<dc:title xmlns:dc='http://purl.org/dc/elements/1.1/'>"
++ title novel ++
"</dc:title>\
\<dc:creator xmlns:dc='http://purl.org/dc/elements/1.1/'>"
++ author novel ++
"</dc:creator>\
\<dc:language xmlns:dc='http://purl.org/dc/elements/1.1/'>\
\ja</dc:language>\
\<dc:identifier\
\ xmlns:dc='http://purl.org/dc/elements/1.1/'\
\ id='BookId'>\
\ urn:"
++ ncode novel ++
"</dc:identifier>\
\<dc:subject xmlns:dc='http://purl.org/dc/elements/1.1/'>\
\General Fiction\
\</dc:subject>\
\<dc:description\
\ xmlns:dc='http://purl.org/dc/elements/1.1/'>"
++ explanation novel ++
"</dc:description>\
\<meta property='dcterms:modified'>"
++ fct ++
"</meta></metadata>"
++ manifest novel
++ spine novel ++
"<guide>\
\<reference type='title-page' title='title'\
\ href='title.xhtml' />\
\<reference type='toc' title='Table of Contents'\
\ href='nav.xhtml' />\
\</guide>\
\</package>"
manifest novel =
"<manifest>\
\<item id='toc' href='toc.ncx' media-type='application/x-dtbncx+xml'/>\
\<item id='nav' href='nav.xhtml'\
\ media-type='application/xhtml+xml' properties='nav'/>\
\<item id='title' href='title.xhtml' media-type='application/xhtml+xml'/>\
\<item id='style' href='style.css' media-type='text/css'/>"
++ (manifestItem $ items novel) ++
"</manifest>"
styleSheet =
"html {\
\ -epub-writing-mode: vertical-rl;\
\ direction: ltr;\
\ unicode-bidi:bidi-override;\
\}\
\ol {\
\ list-style-type: none;\
\ padding: 1em 0 0 1em;\
\ margin: 0;\
\}\
\p {\
\ margin: 0;\
\ line-height: 150%\
\}\
\body {\
\ margin: 0;\
\ padding: 0;\
\}\
\dl {\
\ border: solid thick black;\
\}"
spine novel=
"<spine toc='toc' page-progression-direction='rtl'>\
\<itemref idref='title'/>\
\<itemref idref='nav'/>"
++ (manifestItemref $ items novel) ++
"</spine>"
manifestItem :: [Item]->String
manifestItem (x@(Page num _ _): rest) =
"<item id='id_"
++num++
"' href='"
++num++
".xhtml' media-type='application/xhtml+xml'/>"
++manifestItem rest
manifestItem [] = ""
manifestItem ((Headline _): rest) = manifestItem rest
manifestItemref :: [Item]->String
manifestItemref ((Page num _ _): rest) =
"<itemref idref='id_"++num++"' />"++manifestItemref rest
manifestItemref [] = ""
manifestItemref ((Headline _): rest) = manifestItemref rest
novelToNavPoint novel = itemsToNavPoint 1 $ items novel
itemsToNavPoint :: Int -> [Item] ->String
itemsToNavPoint countNum (x@(Page _ _ _): rest) =
pageToNavPoint x++itemsToNavPoint countNum rest
itemsToNavPoint _ [] = ""
itemsToNavPoint countNum ((Headline st): rest@((Page p _ _):_)) =
"<navPoint id='id_chapter"++show countNum++"'>"
++ "<navLabel><text>"++st++"</text></navLabel>"
++ "<content src='"++p++".xhtml' />"
++ case span isPage rest of
(pages, nextChapter)->(concat $ map pageToNavPoint pages)
++ "</navPoint>\n"
++ itemsToNavPoint (countNum+1)nextChapter
novelToToc :: Novel -> [Word8]
novelToToc novel =
encode "<?xml version='1.0' encoding='utf-8' ?>\n\
\<!DOCTYPE html>\n\
\<html xmlns='http://www.w3.org/1999/xhtml'\
\ xmlns:epub='http://www.idpf.org/2007/ops' xml:lang='ja'>\n\
\<head>\n\
\<title>目次</title>\n\
\<link rel='stylesheet' type='text/css' href='style.css' />\n\
\</head>\n\
\<body>\n\
\<section epub:type='frontmatter toc'>\
\<h1>目次</h1>\
\<nav epub:type='toc' id='toc'>\
\<ol>"
++ (BS.unpack $ BL.pack $ itemsToToc $ items novel) ++
encode "</ol>\
\</nav>\
\</section>\
\</body>\
\</html>"
itemsToToc :: [Item]->String
itemsToToc (x@(Page _ _ _): rest) = pageToToc x++itemsToToc rest
itemsToToc [] = ""
itemsToToc ((Headline st): rest@((Page p _ _):_)) =
"<li><span>"++st++"</span><ol>"
++ case span isPage rest of
(pages, nextChapter)->(concat $ map pageToToc pages)
++ "</ol></li>\n"
++ itemsToToc nextChapter
pageToToc (Page p t _) =
"<li><a href='"++ p ++".xhtml'>"++t++"</a></li>\n"
container =
"<?xml version='1.0' ?>\n\
\<container version='1.0'\
\ xmlns='urn:oasis:names:tc:opendocument:xmlns:container'>\
\<rootfiles>\
\<rootfile full-path='OPS/content.opf'\
\ media-type='application/oebps-package+xml'/>\
\</rootfiles>\
\</container>"
pageToNavPoint (Page p t _) =
"<navPoint id='id_"++p++"' playOrder='"++p++"'>"
++ "<navLabel><text>"++t++"</text></navLabel>"
++ "<content src='"++p++".xhtml' /></navPoint>"
toHtml5Style str = subRegex (mkRegex "<rb>([^<]+)</rb>") str "\\1"
splitBr = splitRegex (mkRegexWithOpts "<br />" False True)
instance Show Item where
show (Page p t c) = "<?xml version='1.0' encoding='utf-8' ?>\n"
++ "<!DOCTYPE html>\n"
++ "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='ja'>\n"
++ "<head>\n<title>"++t++"</title>\n"
++ "<link rel='stylesheet' type='text/css' href='style.css' />\n"
++ "</head>\n<body>\n<h2>"++t++"</h2>\n"
++ (brToParagraph $ toHtml5Style $ c)
++ "</body></html>"
show (Headline t) = t
brToParagraph = concat . map (\x -> "<p>"++x++"</p>") . splitBr
download path = do
url <- return ("http://ncode.syosetu.com" ++ path)
response <- Network.HTTP.simpleHTTP (getRequest url)
getResponseBody response
getNovel ncode = do
indexPage <- download ("/" ++ ncode)
title <- return $ queryTitle indexPage
explanation <- return $ queryExplanation indexPage
author <- return $ queryAuthor indexPage
novelBody <- getNovelBody $ toList $ querySubtitles indexPage
return $ Novel ncode title explanation author novelBody
titlePage novel =
encode "<?xml version='1.0' encoding='utf-8'?>\
\<!DOCTYPE html>\
\<html xmlns='http://www.w3.org/1999/xhtml'\
\ xmlns:epub='http://www.idpf.org/2007/ops' xml:lang='ja'>\
\<head>\
\<link rel='stylesheet' type='text/css' href='style.css'/>\
\<title>" ++ (BS.unpack $ BL.pack $ title novel) ++
encode "</title>\
\</head>\
\<body>\
\<h1>" ++ (BS.unpack $ BL.pack $ title novel) ++ encode "</h1>\
\<div style='text-align:right'>"
++ (BS.unpack $ BL.pack $ author novel)++
encode "</div>\
\<h2>あらすじ</h2>\
\<p>"
++(BS.unpack $ BL.pack $ explanation novel)++
encode "</p></body>\
\</html>"
getNovelBody urls = sequence $ map f urls
where f ["", url, title] =
do body <- (download url)
return $ Page (pathToNum url) title (queryNovelBody body)
f [title, "", ""] = return $ Headline title
matchRegexOne reg str = matchRegex reg str >>= (return . head)
matchRegexAll' reg = unfoldr $ scan
where scan target =
case matchRegexAll reg target of
Just (_, _, a, xs) -> Just (xs, a)
Nothing -> Nothing
queryTitle x = fromJust $ matchRegexOne (mkRegex "<title>([^<]+)</title>") x
queryAuthor x = re ((fromJust $ (matchRegex (mkRegexWithOpts "<div class=\"novel_writername\">([^>]+>)?([^<]+)</" False True) x)) !! 1)
where
re x = fromJust $ matchRegexOne (mkRegexWithOpts "^([^\r\n]+)" False True) x
queryNovelBody str =
fromJust (matchRegexOne (mkRegexWithOpts "<div class=\"novel_view\" id=\"novel_view\">(.+)" False True) str
>>= (matchRegexAll $ mkRegex "</div>")
>>= (\x -> case x of (before,_,_,_) -> Just before))
querySubtitles = matchRegexAll' $ mkRegex "<tr><td class=\"chapter\" colspan=\"4\">([^<]+)</td></tr>|<td class=\"[^_]+_subtitle\"><a href=\"([^\"]+)\">([^<]+)</a></td>"
pathToNum str = fromJust $ matchRegexOne (mkRegex "^/.+/(.+)/$") str
queryExplanation x =
fromJust $
matchRegexOne
(mkRegexWithOpts "<div class=\"novel_ex\">([^>]+)</div>" False True)
x
isPage :: Item -> Bool
isPage (Page _ _ _)= True
isPage _ = False
onlyPage :: [Item] -> [Item]
onlyPage x = filter isPage x
code = decode . BS.unpack . BL.pack
epubize ncode =
do novel <- getNovel ncode
h <- openFile ("["++(code $author novel)++"]"++(code $title novel)++".epub") WriteMode
archive <- return $ foldr (\x arc->(addEntryToArchive (page2entry x) arc)) emptyArchive (onlyPage $ items novel)
archive <- return $ addEntryToArchive (toEntry "OPS/toc.ncx" 0 $ BL.pack $ ncx novel) archive
archive <- return $ addEntryToArchive (toEntry "OPS/style.css" 0 $ BL.pack styleSheet) archive
nopf <- opf novel
archive <- return $ addEntryToArchive (toEntry "OPS/content.opf" 0 $ BL.pack nopf) archive
archive <- return $ addEntryToArchive (toEntry "OPS/nav.xhtml" 0 $ BS.pack $ novelToToc novel) archive
archive <- return $ addEntryToArchive (toEntry "META-INF/container.xml" 0 $ BL.pack container) archive
archive <- return $ addEntryToArchive (toEntry "OPS/title.xhtml" 0 $ BS.pack $ titlePage novel) archive
archive <- return $ addEntryToArchive (toEntry "mimetype" 0 $ BL.pack "application/epub+zip" ) archive
BL.hPutStr h $ fromArchive archive
hClose h
where
page2entry x=(toEntry ("OPS/"++pageNum x++".xhtml") 0 (BL.pack (show x)))
usage = putStr "Usage: ypub ncode ...\n"
main =
do args <- getArgs
if args==[]
then usage
else mapM_ epubize args
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment