Created
May 17, 2013 13:28
-
-
Save SaitoAtsushi/5599033 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- -*- mode: haskell; coding: utf-8 -*- | |
-- Author: SAITO Atsushi | |
import Network.HTTP | |
import Data.Maybe | |
import Text.Regex | |
import Control.Monad | |
import Data.Sequence (unfoldr) | |
import Data.Foldable (toList) | |
import Data.Word | |
import Text.Regex.Posix.ByteString | |
import Codec.Binary.UTF8.String | |
import System.Locale | |
import Data.Time.Clock | |
import Data.Time.Format | |
import System.Environment | |
import Data.ByteString.Lazy.Char8 as BL (pack, hPutStr, fromChunks, unpack) | |
import Data.ByteString.Lazy as BS (pack, unpack) | |
import Data.ByteString (breakSubstring) | |
import Codec.Archive.Zip | |
import System.IO | |
import Control.Monad.State | |
data Item = Page { pageNum, subTitle, containts :: String } | |
| Headline String | |
data Novel = Novel { ncode, | |
title, | |
explanation, | |
author :: String, | |
items :: [Item] } | |
deriving (Show) | |
formattedCurrentTime = | |
do t <- getCurrentTime | |
return $ formatTime defaultTimeLocale "%FT%XZ" t | |
ncx novel = | |
"<?xml version='1.0' encoding='utf-8' ?>\n\ | |
\<!DOCTYPE ncx PUBLIC '-//NISO//DTD ncx 2005-1//EN'\ | |
\ 'http://www.daisy.org/z3986/2005/ncx-2005-1.dtd'>\n\ | |
\<ncx xmlns='http://www.daisy.org/z3986/2005/ncx/' xml:lang='en'\ | |
\ version='2005-1'>\n\ | |
\<head>\n\ | |
\<meta name='dtb:uid' content='" ++ ncode novel ++ "' />\n\ | |
\<meta name='dtb:depth' content='1' />\n\ | |
\<meta name='dtb:totalPageCount' content='0' />\n\ | |
\<meta name='dtb:maxPageNumber' content='0' />\n\ | |
\</head>\n\ | |
\<docTitle>\n\ | |
\<text>"++ title novel ++"</text>\n\ | |
\</docTitle>\n\ | |
\<navMap>\n"++novelToNavPoint novel++"</navMap>\n</ncx>" | |
opf novel = | |
do fct <- formattedCurrentTime | |
return $ | |
"<?xml version='1.0' encoding='utf-8' ?>\n\ | |
\<package xmlns='http://www.idpf.org/2007/opf'\ | |
\ unique-identifier='BookId' version='3.0'>\ | |
\<metadata>\ | |
\<dc:title xmlns:dc='http://purl.org/dc/elements/1.1/'>" | |
++ title novel ++ | |
"</dc:title>\ | |
\<dc:creator xmlns:dc='http://purl.org/dc/elements/1.1/'>" | |
++ author novel ++ | |
"</dc:creator>\ | |
\<dc:language xmlns:dc='http://purl.org/dc/elements/1.1/'>\ | |
\ja</dc:language>\ | |
\<dc:identifier\ | |
\ xmlns:dc='http://purl.org/dc/elements/1.1/'\ | |
\ id='BookId'>\ | |
\ urn:" | |
++ ncode novel ++ | |
"</dc:identifier>\ | |
\<dc:subject xmlns:dc='http://purl.org/dc/elements/1.1/'>\ | |
\General Fiction\ | |
\</dc:subject>\ | |
\<dc:description\ | |
\ xmlns:dc='http://purl.org/dc/elements/1.1/'>" | |
++ explanation novel ++ | |
"</dc:description>\ | |
\<meta property='dcterms:modified'>" | |
++ fct ++ | |
"</meta></metadata>" | |
++ manifest novel | |
++ spine novel ++ | |
"<guide>\ | |
\<reference type='title-page' title='title'\ | |
\ href='title.xhtml' />\ | |
\<reference type='toc' title='Table of Contents'\ | |
\ href='nav.xhtml' />\ | |
\</guide>\ | |
\</package>" | |
manifest novel = | |
"<manifest>\ | |
\<item id='toc' href='toc.ncx' media-type='application/x-dtbncx+xml'/>\ | |
\<item id='nav' href='nav.xhtml'\ | |
\ media-type='application/xhtml+xml' properties='nav'/>\ | |
\<item id='title' href='title.xhtml' media-type='application/xhtml+xml'/>\ | |
\<item id='style' href='style.css' media-type='text/css'/>" | |
++ (manifestItem $ items novel) ++ | |
"</manifest>" | |
styleSheet = | |
"html {\ | |
\ -epub-writing-mode: vertical-rl;\ | |
\ direction: ltr;\ | |
\ unicode-bidi:bidi-override;\ | |
\}\ | |
\ol {\ | |
\ list-style-type: none;\ | |
\ padding: 1em 0 0 1em;\ | |
\ margin: 0;\ | |
\}\ | |
\p {\ | |
\ margin: 0;\ | |
\ line-height: 150%\ | |
\}\ | |
\body {\ | |
\ margin: 0;\ | |
\ padding: 0;\ | |
\}\ | |
\dl {\ | |
\ border: solid thick black;\ | |
\}" | |
spine novel= | |
"<spine toc='toc' page-progression-direction='rtl'>\ | |
\<itemref idref='title'/>\ | |
\<itemref idref='nav'/>" | |
++ (manifestItemref $ items novel) ++ | |
"</spine>" | |
manifestItem :: [Item]->String | |
manifestItem (x@(Page num _ _): rest) = | |
"<item id='id_" | |
++num++ | |
"' href='" | |
++num++ | |
".xhtml' media-type='application/xhtml+xml'/>" | |
++manifestItem rest | |
manifestItem [] = "" | |
manifestItem ((Headline _): rest) = manifestItem rest | |
manifestItemref :: [Item]->String | |
manifestItemref ((Page num _ _): rest) = | |
"<itemref idref='id_"++num++"' />"++manifestItemref rest | |
manifestItemref [] = "" | |
manifestItemref ((Headline _): rest) = manifestItemref rest | |
novelToNavPoint novel = itemsToNavPoint 1 $ items novel | |
itemsToNavPoint :: Int -> [Item] ->String | |
itemsToNavPoint countNum (x@(Page _ _ _): rest) = | |
pageToNavPoint x++itemsToNavPoint countNum rest | |
itemsToNavPoint _ [] = "" | |
itemsToNavPoint countNum ((Headline st): rest@((Page p _ _):_)) = | |
"<navPoint id='id_chapter"++show countNum++"'>" | |
++ "<navLabel><text>"++st++"</text></navLabel>" | |
++ "<content src='"++p++".xhtml' />" | |
++ case span isPage rest of | |
(pages, nextChapter)->(concat $ map pageToNavPoint pages) | |
++ "</navPoint>\n" | |
++ itemsToNavPoint (countNum+1)nextChapter | |
novelToToc :: Novel -> [Word8] | |
novelToToc novel = | |
encode "<?xml version='1.0' encoding='utf-8' ?>\n\ | |
\<!DOCTYPE html>\n\ | |
\<html xmlns='http://www.w3.org/1999/xhtml'\ | |
\ xmlns:epub='http://www.idpf.org/2007/ops' xml:lang='ja'>\n\ | |
\<head>\n\ | |
\<title>目次</title>\n\ | |
\<link rel='stylesheet' type='text/css' href='style.css' />\n\ | |
\</head>\n\ | |
\<body>\n\ | |
\<section epub:type='frontmatter toc'>\ | |
\<h1>目次</h1>\ | |
\<nav epub:type='toc' id='toc'>\ | |
\<ol>" | |
++ (BS.unpack $ BL.pack $ itemsToToc $ items novel) ++ | |
encode "</ol>\ | |
\</nav>\ | |
\</section>\ | |
\</body>\ | |
\</html>" | |
itemsToToc :: [Item]->String | |
itemsToToc (x@(Page _ _ _): rest) = pageToToc x++itemsToToc rest | |
itemsToToc [] = "" | |
itemsToToc ((Headline st): rest@((Page p _ _):_)) = | |
"<li><span>"++st++"</span><ol>" | |
++ case span isPage rest of | |
(pages, nextChapter)->(concat $ map pageToToc pages) | |
++ "</ol></li>\n" | |
++ itemsToToc nextChapter | |
pageToToc (Page p t _) = | |
"<li><a href='"++ p ++".xhtml'>"++t++"</a></li>\n" | |
container = | |
"<?xml version='1.0' ?>\n\ | |
\<container version='1.0'\ | |
\ xmlns='urn:oasis:names:tc:opendocument:xmlns:container'>\ | |
\<rootfiles>\ | |
\<rootfile full-path='OPS/content.opf'\ | |
\ media-type='application/oebps-package+xml'/>\ | |
\</rootfiles>\ | |
\</container>" | |
pageToNavPoint (Page p t _) = | |
"<navPoint id='id_"++p++"' playOrder='"++p++"'>" | |
++ "<navLabel><text>"++t++"</text></navLabel>" | |
++ "<content src='"++p++".xhtml' /></navPoint>" | |
toHtml5Style str = subRegex (mkRegex "<rb>([^<]+)</rb>") str "\\1" | |
splitBr = splitRegex (mkRegexWithOpts "<br />" False True) | |
instance Show Item where | |
show (Page p t c) = "<?xml version='1.0' encoding='utf-8' ?>\n" | |
++ "<!DOCTYPE html>\n" | |
++ "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='ja'>\n" | |
++ "<head>\n<title>"++t++"</title>\n" | |
++ "<link rel='stylesheet' type='text/css' href='style.css' />\n" | |
++ "</head>\n<body>\n<h2>"++t++"</h2>\n" | |
++ (brToParagraph $ toHtml5Style $ c) | |
++ "</body></html>" | |
show (Headline t) = t | |
brToParagraph = concat . map (\x -> "<p>"++x++"</p>") . splitBr | |
download path = do | |
url <- return ("http://ncode.syosetu.com" ++ path) | |
response <- Network.HTTP.simpleHTTP (getRequest url) | |
getResponseBody response | |
getNovel ncode = do | |
indexPage <- download ("/" ++ ncode) | |
title <- return $ queryTitle indexPage | |
explanation <- return $ queryExplanation indexPage | |
author <- return $ queryAuthor indexPage | |
novelBody <- getNovelBody $ toList $ querySubtitles indexPage | |
return $ Novel ncode title explanation author novelBody | |
titlePage novel = | |
encode "<?xml version='1.0' encoding='utf-8'?>\ | |
\<!DOCTYPE html>\ | |
\<html xmlns='http://www.w3.org/1999/xhtml'\ | |
\ xmlns:epub='http://www.idpf.org/2007/ops' xml:lang='ja'>\ | |
\<head>\ | |
\<link rel='stylesheet' type='text/css' href='style.css'/>\ | |
\<title>" ++ (BS.unpack $ BL.pack $ title novel) ++ | |
encode "</title>\ | |
\</head>\ | |
\<body>\ | |
\<h1>" ++ (BS.unpack $ BL.pack $ title novel) ++ encode "</h1>\ | |
\<div style='text-align:right'>" | |
++ (BS.unpack $ BL.pack $ author novel)++ | |
encode "</div>\ | |
\<h2>あらすじ</h2>\ | |
\<p>" | |
++(BS.unpack $ BL.pack $ explanation novel)++ | |
encode "</p></body>\ | |
\</html>" | |
getNovelBody urls = sequence $ map f urls | |
where f ["", url, title] = | |
do body <- (download url) | |
return $ Page (pathToNum url) title (queryNovelBody body) | |
f [title, "", ""] = return $ Headline title | |
matchRegexOne reg str = matchRegex reg str >>= (return . head) | |
matchRegexAll' reg = unfoldr $ scan | |
where scan target = | |
case matchRegexAll reg target of | |
Just (_, _, a, xs) -> Just (xs, a) | |
Nothing -> Nothing | |
queryTitle x = fromJust $ matchRegexOne (mkRegex "<title>([^<]+)</title>") x | |
queryAuthor x = re ((fromJust $ (matchRegex (mkRegexWithOpts "<div class=\"novel_writername\">([^>]+>)?([^<]+)</" False True) x)) !! 1) | |
where | |
re x = fromJust $ matchRegexOne (mkRegexWithOpts "^([^\r\n]+)" False True) x | |
queryNovelBody str = | |
fromJust (matchRegexOne (mkRegexWithOpts "<div class=\"novel_view\" id=\"novel_view\">(.+)" False True) str | |
>>= (matchRegexAll $ mkRegex "</div>") | |
>>= (\x -> case x of (before,_,_,_) -> Just before)) | |
querySubtitles = matchRegexAll' $ mkRegex "<tr><td class=\"chapter\" colspan=\"4\">([^<]+)</td></tr>|<td class=\"[^_]+_subtitle\"><a href=\"([^\"]+)\">([^<]+)</a></td>" | |
pathToNum str = fromJust $ matchRegexOne (mkRegex "^/.+/(.+)/$") str | |
queryExplanation x = | |
fromJust $ | |
matchRegexOne | |
(mkRegexWithOpts "<div class=\"novel_ex\">([^>]+)</div>" False True) | |
x | |
isPage :: Item -> Bool | |
isPage (Page _ _ _)= True | |
isPage _ = False | |
onlyPage :: [Item] -> [Item] | |
onlyPage x = filter isPage x | |
code = decode . BS.unpack . BL.pack | |
epubize ncode = | |
do novel <- getNovel ncode | |
h <- openFile ("["++(code $author novel)++"]"++(code $title novel)++".epub") WriteMode | |
archive <- return $ foldr (\x arc->(addEntryToArchive (page2entry x) arc)) emptyArchive (onlyPage $ items novel) | |
archive <- return $ addEntryToArchive (toEntry "OPS/toc.ncx" 0 $ BL.pack $ ncx novel) archive | |
archive <- return $ addEntryToArchive (toEntry "OPS/style.css" 0 $ BL.pack styleSheet) archive | |
nopf <- opf novel | |
archive <- return $ addEntryToArchive (toEntry "OPS/content.opf" 0 $ BL.pack nopf) archive | |
archive <- return $ addEntryToArchive (toEntry "OPS/nav.xhtml" 0 $ BS.pack $ novelToToc novel) archive | |
archive <- return $ addEntryToArchive (toEntry "META-INF/container.xml" 0 $ BL.pack container) archive | |
archive <- return $ addEntryToArchive (toEntry "OPS/title.xhtml" 0 $ BS.pack $ titlePage novel) archive | |
archive <- return $ addEntryToArchive (toEntry "mimetype" 0 $ BL.pack "application/epub+zip" ) archive | |
BL.hPutStr h $ fromArchive archive | |
hClose h | |
where | |
page2entry x=(toEntry ("OPS/"++pageNum x++".xhtml") 0 (BL.pack (show x))) | |
usage = putStr "Usage: ypub ncode ...\n" | |
main = | |
do args <- getArgs | |
if args==[] | |
then usage | |
else mapM_ epubize args |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment