Skip to content

Instantly share code, notes, and snippets.

@taskie
Last active February 14, 2017 11:56
Show Gist options
  • Save taskie/8414c0d077257340745ca2d7d9a35542 to your computer and use it in GitHub Desktop.
Save taskie/8414c0d077257340745ca2d7d9a35542 to your computer and use it in GitHub Desktop.
PukiWiki Reader for Pandoc(雑)
{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-}
-- RelaxedPolyRec needed for inlinesBetween on GHC < 7
{-
Copyright (C) 2012-2015 John MacFarlane <[email protected]>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.PukiWiki
Copyright : Copyright (C) 2012-2015 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <[email protected]>
Stability : alpha
Portability : portable
Conversion of PukiWiki text to 'Pandoc' document.
-}
{-
TODO:
_ correctly handle tables within tables
_ parse templates?
-}
module Text.Pandoc.Readers.PukiWiki ( readPukiWiki ) where
import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Inlines, Blocks, trimInlines)
import Data.Monoid ((<>))
import Text.Pandoc.Options
import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag, isCommentTag )
import Text.Pandoc.XML ( fromEntities )
import Text.Pandoc.Parsing hiding ( nested )
import Text.Pandoc.Walk ( walk )
import Text.Pandoc.Shared ( stripTrailingNewlines, safeRead, stringify, trim )
import Control.Monad
import Data.List (intersperse, intercalate, isPrefixOf, elemIndex )
import Text.HTML.TagSoup
import Data.Sequence (viewl, ViewL(..), (<|))
import qualified Data.Foldable as F
import qualified Data.Map as M
import qualified Data.Set as Set
import Data.Char (isDigit, isSpace)
import Data.Maybe (fromMaybe)
import Text.Printf (printf)
import Control.Monad.Except (throwError)
import Text.Pandoc.Class (PandocMonad, report)
import Debug.Trace
-- | Read PukiWiki from an input string and return a Pandoc document.
readPukiWiki :: PandocMonad m
=> ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assuming @'\n'@ line endings)
-> m Pandoc
readPukiWiki opts s = do
parsed <- readWithM parsePukiWiki PWState{ mwOptions = opts
, mwMaxNestingLevel = 4
, mwNextLinkNumber = 1
, mwCategoryLinks = []
, mwHeaderMap = M.empty
, mwIdentifierList = Set.empty
}
(s ++ "\n")
case parsed of
Right result -> return result
Left e -> throwError e
data PWState = PWState { mwOptions :: ReaderOptions
, mwMaxNestingLevel :: Int
, mwNextLinkNumber :: Int
, mwCategoryLinks :: [Inlines]
, mwHeaderMap :: M.Map Inlines String
, mwIdentifierList :: Set.Set String
}
type PWParser m = ParserT [Char] PWState m
instance HasReaderOptions PWState where
extractReaderOptions = mwOptions
instance HasHeaderMap PWState where
extractHeaderMap = mwHeaderMap
updateHeaderMap f st = st{ mwHeaderMap = f $ mwHeaderMap st }
instance HasIdentifierList PWState where
extractIdentifierList = mwIdentifierList
updateIdentifierList f st = st{ mwIdentifierList = f $ mwIdentifierList st }
--
-- auxiliary functions
--
-- This is used to prevent exponential blowups for things like:
-- ''a'''a''a'''a''a'''a''a'''a
nested :: PandocMonad m => PWParser m a -> PWParser m a
nested p = do
nestlevel <- mwMaxNestingLevel `fmap` getState
guard $ nestlevel > 0
updateState $ \st -> st{ mwMaxNestingLevel = mwMaxNestingLevel st - 1 }
res <- p
updateState $ \st -> st{ mwMaxNestingLevel = nestlevel }
return res
specialChars :: [Char]
specialChars = "#'[]<>=&*{}|\":\\%~/"
spaceChars :: [Char]
spaceChars = " \n\t"
sym :: PandocMonad m => String -> PWParser m ()
sym s = () <$ try (string s)
--
-- main parser
--
parsePukiWiki :: PandocMonad m => PWParser m Pandoc
parsePukiWiki = do
bs <- mconcat <$> many block
spaces
eof
categoryLinks <- reverse . mwCategoryLinks <$> getState
let categories = if null categoryLinks
then mempty
else B.para $ mconcat $ intersperse B.space categoryLinks
return $ B.doc $ bs <> categories
--
-- block parsers
--
block :: PandocMonad m => PWParser m Blocks
block = do
pos <- getPosition
res <- mempty <$ skipMany1 blankline
<|> comment
<|> table
<|> header
<|> hrule
<|> orderedList
<|> bulletList
<|> definitionList
<|> preformatted
<|> blockquote
<|> hrule2
<|> refBlock
<|> para2
<|> para
report DEBUG $ printf "line %d: %s" (sourceLine pos)
(take 60 $ show $ B.toList res)
return res
comment :: PandocMonad m => PWParser m Blocks
comment = try $ do
guardColumnOne
sym "//"
inlines <- manyTill inline (char '\n')
return $ B.divWith ("", ["comment"], [("style", "display:none;")]) (B.lineBlock inlines)
hrule2 :: PandocMonad m => PWParser m Blocks
hrule2 = try $ do
guardColumnOne
sym "#hr"
return B.horizontalRule
refBlock :: PandocMonad m => PWParser m Blocks
refBlock = try $ do
guardColumnOne
sym "#ref("
url <- manyTill anyChar (char ')')
return $ B.para $ B.link url "" (B.str url)
blockquote :: PandocMonad m => PWParser m Blocks
blockquote = try $ do
guardColumnOne
char '>'
contents <- trimInlines . mconcat <$> many1 inline
if F.all (==Space) contents
then return mempty
else return $ B.blockQuote $ B.plain contents
para :: PandocMonad m => PWParser m Blocks
para = do
contents <- trimInlines . mconcat <$> many1 inline
if F.all (==Space) contents
then return mempty
else return $ B.para contents
para2 :: PandocMonad m => PWParser m Blocks
para2 = do
guardColumnOne
char '~'
contents <- trimInlines . mconcat <$> many1 inline
if F.all (==Space) contents
then return mempty
else return $ B.para contents
table :: PandocMonad m => PWParser m Blocks
table = try $ do
tbl <- (try $ (table' '|') <|> (table' ','))
return tbl
table' :: PandocMonad m => Char -> PWParser m Blocks
table' c = try $ do
rows <- many1 (tableRow' c)
let cols = maximum (map length rows)
return $ B.table mempty (replicate cols (AlignDefault, 0.0)) (replicate cols mempty) rows
tableRow' :: PandocMonad m => Char -> PWParser m [Blocks]
tableRow' c = try $ do
guardColumnOne
char c
columns <- many1Till (tableCell' c) newline
return columns
tableCell' :: PandocMonad m => Char -> PWParser m Blocks
tableCell' c = try $ do
inls <- mconcat <$> manyTill inline (try $ lookAhead $ oneOf (c:"\n\r"))
optional $ char c
let blk = B.plain $ inls
return $ blk
parseAttrs :: PandocMonad m => PWParser m [(String,String)]
parseAttrs = many1 parseAttr
parseAttr :: PandocMonad m => PWParser m (String, String)
parseAttr = try $ do
skipMany spaceChar
k <- many1 letter
char '='
v <- (char '"' >> many1Till (satisfy (/='\n')) (char '"'))
<|> many1 (satisfy $ \c -> not (isSpace c) && c /= '|')
return (k,v)
tableStart :: PandocMonad m => PWParser m ()
tableStart = try $ guardColumnOne *> skipSpaces *> sym "|"
tableEnd :: PandocMonad m => PWParser m ()
tableEnd = try $ guardColumnOne *> skipSpaces *> sym "|"
rowsep :: PandocMonad m => PWParser m ()
rowsep = try $ guardColumnOne *> skipSpaces *> sym "|" <*
optional parseAttr <* blanklines
cellsep :: PandocMonad m => PWParser m ()
cellsep = try $
(guardColumnOne *> skipSpaces <*
( (char '|' <* notFollowedBy (oneOf "-}+"))
<|> (char '!')
)
)
<|> (() <$ try (string "||"))
<|> (() <$ try (string "!!"))
tableCaption :: PandocMonad m => PWParser m Inlines
tableCaption = try $ do
guardColumnOne
skipSpaces
sym "|+"
optional (try $ parseAttr *> skipSpaces *> char '|' *> skipSpaces)
(trimInlines . mconcat) <$> many (notFollowedBy (cellsep <|> rowsep) *> inline)
tableRow :: PandocMonad m => PWParser m [((Alignment, Double), Blocks)]
tableRow = try $ many tableCell
tableCell :: PandocMonad m => PWParser m ((Alignment, Double), Blocks)
tableCell = try $ do
cellsep
skipMany spaceChar
attrs <- option [] $ try $ parseAttrs <* skipSpaces <* char '|' <*
notFollowedBy (char '|')
skipMany spaceChar
ls <- concat <$> many (notFollowedBy (cellsep <|> rowsep <|> tableEnd) *>
((snd <$> withRaw table) <|> count 1 anyChar))
bs <- parseFromString (mconcat <$> many block) ls
let align = case lookup "align" attrs of
Just "left" -> AlignLeft
Just "right" -> AlignRight
Just "center" -> AlignCenter
_ -> AlignDefault
let width = case lookup "width" attrs of
Just xs -> fromMaybe 0.0 $ parseWidth xs
Nothing -> 0.0
return ((align, width), bs)
parseWidth :: String -> Maybe Double
parseWidth s =
case reverse s of
('%':ds) | all isDigit ds -> safeRead ('0':'.':reverse ds)
_ -> Nothing
template :: PandocMonad m => PWParser m String
template = try $ do
string "{{"
notFollowedBy (char '{')
lookAhead $ letter <|> digit <|> char ':'
let chunk = template <|> variable <|> many1 (noneOf "{}") <|> count 1 anyChar
contents <- manyTill chunk (try $ string "}}")
return $ "{{" ++ concat contents ++ "}}"
trimCode :: String -> String
trimCode ('\n':xs) = stripTrailingNewlines xs
trimCode xs = stripTrailingNewlines xs
hrule :: PandocMonad m => PWParser m Blocks
hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline)
guardColumnOne :: PandocMonad m => PWParser m ()
guardColumnOne = getPosition >>= \pos -> guard (sourceColumn pos == 1)
preformatted :: PandocMonad m => PWParser m Blocks
preformatted = try $ do
guardColumnOne
char ' '
let endline' = B.linebreak <$ (try $ newline <* char ' ')
let whitespace' = B.str <$> many1 ('\160' <$ spaceChar)
let spToNbsp ' ' = '\160'
spToNbsp x = x
let inline' = whitespace' <|> endline'
<|> (try $ notFollowedBy newline *> inline)
contents <- mconcat <$> many1 inline'
let spacesStr (Str xs) = all isSpace xs
spacesStr _ = False
if F.all spacesStr contents
then return mempty
else return $ B.para $ encode contents
encode :: Inlines -> Inlines
encode = B.fromList . normalizeCode . B.toList . walk strToCode
where strToCode (Str s) = Code ("",[],[]) s
strToCode Space = Code ("",[],[]) " "
strToCode x = x
normalizeCode [] = []
normalizeCode (Code a1 x : Code a2 y : zs) | a1 == a2 =
normalizeCode $ (Code a1 (x ++ y)) : zs
normalizeCode (x:xs) = x : normalizeCode xs
header :: PandocMonad m => PWParser m Blocks
header = try $ do
guardColumnOne
eqs <- many1 (char '*')
let lev = length eqs
guard $ lev <= 6
contents <- trimInlines . mconcat <$> manyTill inline (char '\n')
attr <- registerHeader nullAttr contents
return $ B.headerWith attr lev contents
bulletList :: PandocMonad m => PWParser m Blocks
bulletList = B.bulletList <$> many1 (listItem '-')
orderedList :: PandocMonad m => PWParser m Blocks
orderedList = B.orderedList <$> many1 (listItem '+')
definitionList :: PandocMonad m => PWParser m Blocks
definitionList = B.definitionList <$> many1 defListItem
defListItem :: PandocMonad m => PWParser m (Inlines, [Blocks])
defListItem = try $ do
terms <- mconcat . intersperse B.linebreak <$> many defListTerm
-- we allow dd with no dt, or dt with no dd
defs <- if B.isNull terms
then notFollowedBy
(try $ skipMany1 (char ':') >> string "<math>") *>
many1 (listItem ':')
else many (listItem ':')
return (terms, defs)
defListTerm :: PandocMonad m => PWParser m Inlines
defListTerm = char ';' >> skipMany spaceChar >> anyLine >>=
parseFromString (trimInlines . mconcat <$> many inline)
listStart :: PandocMonad m => Char -> PWParser m ()
listStart c = char c *> notFollowedBy listStartChar
listStartChar :: PandocMonad m => PWParser m Char
listStartChar = oneOf "-+;:"
anyListStart :: PandocMonad m => PWParser m Char
anyListStart = char '-'
<|> char '+'
<|> char ':'
<|> char ';'
listItem :: PandocMonad m => Char -> PWParser m Blocks
listItem c = try $ do
extras <- many (try $ char c <* lookAhead listStartChar)
if null extras
then listItem' c
else do
skipMany spaceChar
first <- concat <$> manyTill listChunk newline
rest <- many
(try $ string extras *> lookAhead listStartChar *>
(concat <$> manyTill listChunk newline))
contents <- parseFromString (many1 $ listItem' c)
(unlines (first : rest))
case c of
'-' -> return $ B.bulletList contents
'+' -> return $ B.orderedList contents
':' -> return $ B.definitionList [(mempty, contents)]
_ -> mzero
-- The point of this is to handle stuff like
-- * {{cite book
-- | blah
-- | blah
-- }}
-- * next list item
-- which seems to be valid pukiwiki.
listChunk :: PandocMonad m => PWParser m String
listChunk = template <|> count 1 anyChar
listItem' :: PandocMonad m => Char -> PWParser m Blocks
listItem' c = try $ do
listStart c
skipMany spaceChar
first <- concat <$> manyTill listChunk newline
rest <- many (try $ char c *> lookAhead listStartChar *>
(concat <$> manyTill listChunk newline))
parseFromString (firstParaToPlain . mconcat <$> many1 block)
$ unlines $ first : rest
firstParaToPlain :: Blocks -> Blocks
firstParaToPlain contents =
case viewl (B.unMany contents) of
(Para xs) :< ys -> B.Many $ (Plain xs) <| ys
_ -> contents
--
-- inline parsers
--
inline :: PandocMonad m => PWParser m Inlines
inline = whitespace
<|> url
<|> str
<|> doubleQuotes
<|> strong
<|> emph
<|> strikeout
<|> image
<|> color
<|> size
<|> ref
<|> linebreak2
<|> tab
<|> externalLink
<|> internalLink
-- <|> math
-- <|> inlineTag
<|> B.singleton <$> charRef
-- <|> inlineHtml
<|> autolinkname
<|> linebreak
-- <|> (B.rawInline "pukiwiki" <$> variable)
-- <|> (B.rawInline "pukiwiki" <$> template)
<|> special
str :: PandocMonad m => PWParser m Inlines
str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars)
variable :: PandocMonad m => PWParser m String
variable = try $ do
string "{{{"
contents <- manyTill anyChar (try $ string "}}}")
return $ "{{{" ++ contents ++ "}}}"
special :: PandocMonad m => PWParser m Inlines
special = B.str <$> count 1 (oneOf specialChars)
whitespace :: PandocMonad m => PWParser m Inlines
whitespace = B.space <$ (skipMany1 spaceChar)
<|> B.softbreak <$ endline
endline :: PandocMonad m => PWParser m ()
endline = () <$ try (newline <*
notFollowedBy spaceChar <*
notFollowedBy newline <*
notFollowedBy' hrule <*
notFollowedBy tableStart <*
notFollowedBy' header <*
notFollowedBy anyListStart)
imageIdentifiers :: PandocMonad m => [PWParser m ()]
imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers]
where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier",
"Bild"]
image :: PandocMonad m => PWParser m Inlines
image = try $ do
sym "[["
choice imageIdentifiers
fname <- addUnderscores <$> many1 (noneOf "|]")
_ <- many imageOption
dims <- try (char '|' *> (sepBy (many digit) (char 'x')) <* string "px")
<|> return []
_ <- many imageOption
let kvs = case dims of
w:[] -> [("width", w)]
w:(h:[]) -> [("width", w), ("height", h)]
_ -> []
let attr = ("", [], kvs)
caption <- (B.str fname <$ sym "]]")
<|> try (char '|' *> (mconcat <$> manyTill inline (sym "]]")))
return $ B.imageWith attr fname ("fig:" ++ stringify caption) caption
imageOption :: PandocMonad m => PWParser m String
imageOption = try $ char '|' *> opt
where
opt = try (oneOfStrings [ "border", "thumbnail", "frameless"
, "thumb", "upright", "left", "right"
, "center", "none", "baseline", "sub"
, "super", "top", "text-top", "middle"
, "bottom", "text-bottom" ])
<|> try (string "frame")
<|> try (oneOfStrings ["link=","alt=","page=","class="] <* many (noneOf "|]"))
collapseUnderscores :: String -> String
collapseUnderscores [] = []
collapseUnderscores ('_':'_':xs) = collapseUnderscores ('_':xs)
collapseUnderscores (x:xs) = x : collapseUnderscores xs
addUnderscores :: String -> String
addUnderscores = collapseUnderscores . intercalate "_" . words
elemIndexAny :: Eq a => [a] -> [a] -> Maybe Int
elemIndexAny (needle:needles) hay =
case (elemIndex needle hay) of
Nothing -> elemIndexAny needles hay
x -> x
elemIndexAny [] _ = Nothing
externalLink :: PandocMonad m => PWParser m Inlines
externalLink = try $ do
sym "[["
src <- manyTill anyChar (char ']')
char ']'
let link = case (elemIndexAny ">:" src) of
Just idx -> B.link (drop (idx + 1) src) "" (B.str $ take idx src)
Nothing -> B.link src "" (B.str $ src)
return link
internalLink :: PandocMonad m => PWParser m Inlines
internalLink = try $ do
sym "[["
pagename <- unwords . words <$> many (noneOf "|]")
label <- option (B.text pagename) $ char '|' *>
( (mconcat <$> many1 (notFollowedBy (char ']') *> inline))
-- the "pipe trick"
-- [[Help:Contents|] -> "Contents"
<|> (return $ B.text $ drop 1 $ dropWhile (/=':') pagename) )
sym "]]"
linktrail <- B.text <$> many letter
let link = B.link (addUnderscores pagename) "wikilink" (label <> linktrail)
if "Category:" `isPrefixOf` pagename
then do
updateState $ \st -> st{ mwCategoryLinks = link : mwCategoryLinks st }
return mempty
else return link
url :: PandocMonad m => PWParser m Inlines
url = do
(orig, src) <- uri
return $ B.link src "" (B.str orig)
-- | Parses a list of inlines between start and end delimiters.
inlinesBetween :: (PandocMonad m, Show b) => PWParser m a -> PWParser m b -> PWParser m Inlines
inlinesBetween start end =
(trimInlines . mconcat) <$> try (start >> many1Till inner end)
where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
innerSpace = try $ whitespace <* notFollowedBy' end
emph :: PandocMonad m => PWParser m Inlines
emph = B.emph <$> nested (inlinesBetween start end)
where start = sym "'''" >> lookAhead nonspaceChar
end = try $ notFollowedBy' (() <$ strong) >> sym "'''"
strong :: PandocMonad m => PWParser m Inlines
strong = B.strong <$> nested (inlinesBetween start end)
where start = sym "''" >> lookAhead nonspaceChar
end = try $ sym "''"
strikeout :: PandocMonad m => PWParser m Inlines
strikeout = B.strikeout <$> nested (inlinesBetween start end)
where start = sym "%%" >> lookAhead nonspaceChar
end = try $ sym "%%"
doubleQuotes :: PandocMonad m => PWParser m Inlines
doubleQuotes = B.doubleQuoted <$> nested (inlinesBetween openDoubleQuote closeDoubleQuote)
where openDoubleQuote = sym "\"" >> lookAhead nonspaceChar
closeDoubleQuote = try $ sym "\""
autolinkname :: PandocMonad m => PWParser m Inlines
autolinkname = try $ do
let start = sym "[#" >> lookAhead nonspaceChar
let end = try $ char ']'
inlinesBetween start end
return mempty
-- ... ~
linebreak :: PandocMonad m => PWParser m Inlines
linebreak = try $ do
char '~'
return B.linebreak
-- &br;
linebreak2 :: PandocMonad m => PWParser m Inlines
linebreak2 = try $ do
(sym "&br;")
return B.linebreak
-- &t;
tab :: PandocMonad m => PWParser m Inlines
tab = try $ do
(sym "&t;")
return $ B.str "\t"
-- &ref(...)
ref :: PandocMonad m => PWParser m Inlines
ref = try $ do
href <- sym "&ref(" *> manyTill anyChar (sym ");")
let href2 = case (elemIndex ',' href) of
Just idx -> take idx href
Nothing -> href
return $ B.link href "" (B.str href2)
-- &color(...){...}
color :: PandocMonad m => PWParser m Inlines
color = try $ do
col <- sym "&color(" *> manyTill anyChar (char ')')
let start = sym "{" >> lookAhead nonspaceChar
let end = try $ sym "};"
inl <- nested (inlinesBetween start end)
let style = case (elemIndex ',' col) of
Just idx -> "color:" ++ (take idx col) ++ "; background-color: " ++ (drop (idx + 1) col) ++ ";"
Nothing -> "color:" ++ col ++ ";"
return $ B.spanWith ("", [], [("style", style)]) inl
-- &size(...){...}
size :: PandocMonad m => PWParser m Inlines
size = try $ do
col <- sym "&size(" *> manyTill anyChar (char ')')
let start = sym "{" >> lookAhead nonspaceChar
let end = try $ sym "};"
inl <- nested (inlinesBetween start end)
return $ B.spanWith ("", [], [("style", "font-size:" ++ col ++ "px;")]) inl
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment