Pandoc script to transform NeTEx tarifs docx to a suitable markdown file
#!/usr/bin/env nix-shell
#!nix-shell --pure -i runghc -p "haskellPackages.ghcWithPackages (p: [ p.pandoc p.pretty-show ])"
{-# OPTIONS_GHC -Wall #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
import Control.Monad ((<=<))
import Control.Monad.IO.Class (liftIO)
import qualified Data.ByteString.Char8 as C8
import Data.Foldable (fold, traverse_)
import Data.List (delete)
import Data.Maybe (fromMaybe, mapMaybe)
import Data.String (IsString, fromString)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8, encodeUtf8)
import System.FilePath (replaceExtension, (</>))
import Text.Pandoc
import Text.Pandoc.App (Opt (..), defaultOpts)
import Text.Pandoc.Builder (Blocks, Inlines, cell, emph, fromList,
header, headerWith, para, plain,
singleton, spanWith, str, strong, text,
import Text.Pandoc.MediaBag (mediaDirectory)
import Text.Pandoc.MIME (MimeType)
import Text.Pandoc.Walk as Walk
-- Requires package pretty-show
import qualified Debug.Trace as DT
import qualified Text.Show.Pretty as P
-- `walk` and `query` can be helpful to debug the AST and what's wrong in the source.
-- | Debugging aid: emit a pretty-printed rendering of the first argument on
-- the trace channel, then return the second argument unchanged.
ppDebug :: Show a => a -> b -> b
ppDebug value = DT.traceShow (P.ppDoc value)
-- | Entry point: read the Word document, clean it up, render it and write the
-- result (front matter, then rendered body, then inline stylesheet).
main :: IO ()
main = runIOorExplode $ do
  -- Parse the Word document, then let pandoc extract its media
  parsed <- parseSourceDocument sourcePath
  withMedia <- extractMedia outputDir parsed
  -- Main clean-up pipeline
  cleaned <- cleanDocument outputDir withMedia
  -- Final touch and output
  body <- renderMarkdown cleaned
  front <- readFrontMatter
  css <- readStylesheet
  write outputFile (front <> body <> css)
  where
    sourcePath = "originaux/NF_Profil.NeTEx.pour.les.Tarifs.F.-.v1.3-nettoye.patched.docx"
    outputDir = "NeTEx/tarifs"
    -- NOTE(review): the file-name component is empty, so this presumably
    -- resolves to the output directory itself rather than a file in it.
    -- Confirm the intended output file name.
    outputFile = outputDir </> ""
-- Lists the medias needing a conversion (not automated)
-- | Read the front matter from @./tarifs.yml@ and terminate it with a newline
-- so that the content appended after it starts on its own line.
readFrontMatter :: PandocIO C8.ByteString
readFrontMatter = fmap (<> "\n") (readFileStrict "./tarifs.yml")
-- | Read @./tarifs.css@ and wrap its content in a @\<style\>@ element, ready
-- to be appended to the rendered document.
readStylesheet :: PandocMonad m => m C8.ByteString
readStylesheet = do
  css <- readFileStrict "./tarifs.css"
  pure (mconcat ["\n<style>\n", css, "</style>\n"])
-- Document processing
-- | Full clean-up pipeline. The pure passes (document structure, then
-- styling) run first; the table captions are then reformatted, the XML
-- examples injected, and finally the annexes are wrapped.
cleanDocument :: PandocMonad m => FilePath -> Pandoc -> m Pandoc
cleanDocument destination doc = do
  let restructured = cleanupDocumentStructure destination doc
      styled = cleanupStyling restructured
  captioned <- reformatTablesCaptions styled
  withExamples <- injectXMLExamples captioned
  pure (wrapAnnexes withExamples)
-- Table captions
-- | Replace every paragraph shaped like @Table N – title@ by a
-- @table-title@ div holding only the title, with emphasis removed.
-- After an en dash (U+2013) a single following space is dropped; after an em
-- dash (U+2014) the remainder is kept as is.
reformatTablesCaptions :: PandocMonad m => Pandoc -> m Pandoc
reformatTablesCaptions = walkM reformat
  where
    reformat = \case
      Para (Str "Table" : Space : Str _ : Space : Str dash : rest)
        | dash == "\8211" -> titled (skipLeadingSpace rest)
        | dash == "\8212" -> titled rest
      other -> pure other

    titled inlines = wrapDiv "table-title" (concatMap dropEmphasis inlines)

    skipLeadingSpace (Space : inlines) = inlines
    skipLeadingSpace inlines = inlines
-- | Render the inlines as markdown and embed the result in a raw HTML
-- @\<div\>@ carrying the given class name.
wrapDiv :: PandocMonad m => T.Text -> [Inline] -> m Block
wrapDiv className inner = do
  -- This is dirty but required to avoid wrapping the content in a <p> after
  -- rendering by hugo.
  md <- writeMarkdown def (Pandoc nullMeta [Plain inner])
  let html = mconcat ["<div class='", className, "'>", T.strip md, "</div>"]
  pure (RawBlock (Format "html") html)
-- Styling
-- | Styling pass over the whole document, written as a right-to-left
-- composition of `adjustDefinitionsStyling` and `enforceTypographyRules`.
-- NOTE(review): the composition ends on a dangling `.`; its last stage seems
-- to be missing from this view. Confirm against the original file.
cleanupStyling :: Pandoc -> Pandoc
cleanupStyling =
adjustDefinitionsStyling .
enforceTypographyRules .
-- | At the top level, a paragraph directly followed by a @Definition@ div
-- becomes a level-2 header, and the div is replaced by its own content.
adjustDefinitionsStyling :: Pandoc -> Pandoc
adjustDefinitionsStyling = walk (topLevel (adjoin promote))
  where
    promote (Para title) (Div ("", ["Definition"], []) body : rest) =
      Header 2 nullAttr title : body ++ rest
    promote block rest = block : rest
-- Typographic rules
-- | Apply the typographic fixes in order: chevron entities first, then
-- guillemets, then spacing.
enforceTypographyRules :: Pandoc -> Pandoc
enforceTypographyRules doc = fixSpaces (fixQuotes (cleanChevrons doc))
-- | Separate en dashes from the words they stick to, then remove regular
-- spaces that sit next to a non-breaking one.
fixSpaces :: Pandoc -> Pandoc
fixSpaces doc = fixNonBreakingSpaces (walk spaceOutDashes doc)
  where
    spaceOutDashes = concatMap fixSpacingAroundDashes
-- | Drop a regular 'Space' when the neighbouring 'Str' already provides a
-- non-breaking space (U+00A0) on that side.
fixNonBreakingSpaces :: Pandoc -> Pandoc
fixNonBreakingSpaces = walk (adjoin glue)
  where
    -- "word<nbsp>" followed by a space: the space is redundant
    glue (Str left) (Space : rest)
      | "\160" `T.isSuffixOf` left = Str left : rest
    -- a space followed by "<nbsp>word": the space is redundant
    glue Space (Str right : rest)
      | "\160" `T.isPrefixOf` right = Str right : rest
    glue inline rest = inline : rest
-- | Detach an en dash (U+2013) glued to the start or the end of a word so
-- that it becomes its own token, separated by a 'Space'.
--
-- Only the leading (respectively trailing) dash is removed from the word;
-- en dashes elsewhere in the token are preserved. A lone dash and any other
-- inline are returned unchanged. A leading dash takes precedence when the
-- token both starts and ends with one.
fixSpacingAroundDashes :: Inline -> [ Inline ]
fixSpacingAroundDashes (Str "\8211") = [ Str "\8211" ]
fixSpacingAroundDashes (Str s)
  | Just rest <- T.stripPrefix "\8211" s = [ Str "\8211", Space, Str rest ]
  | Just rest <- T.stripSuffix "\8211" s = [ Str rest, Space, Str "\8211" ]
fixSpacingAroundDashes untouched = [ untouched ]
-- | Normalise the spacing inside French guillemets: every inline goes through
-- the opening-quote fix, then the closing-quote fix.
fixQuotes :: Pandoc -> Pandoc
fixQuotes = walk requote
  where
    requote = fixClosingQuotes . fixOpeningQuotes
-- | Ensure an opening guillemet (U+00AB) is followed by a non-breaking space.
-- A string that already contains a correctly spaced guillemet is left as is.
fixOpeningQuotes :: Inline -> Inline
fixOpeningQuotes = \case
  Str s -> Str (spaced s)
  other -> other
  where
    spaced s
      | "\171\160" `T.isInfixOf` s = s
      -- Ensure non breaking space:
      | "\171 " `T.isInfixOf` s = T.replace "\171 " "\171\160" s
      | otherwise = T.replace "\171" "\171\160" s
-- | Ensure a closing guillemet (U+00BB) is preceded by a non-breaking space.
-- A string that already contains a correctly spaced guillemet is left as is.
fixClosingQuotes :: Inline -> Inline
fixClosingQuotes = \case
  Str s -> Str (spaced s)
  other -> other
  where
    spaced s
      | "\160\187" `T.isInfixOf` s = s
      -- Ensure non breaking space:
      | " \187" `T.isInfixOf` s = T.replace " \187" "\160\187" s
      | otherwise = T.replace "\187" "\160\187" s
-- | Turn the HTML entities @&gt;@ and @&lt;@ found in text back into the
-- literal characters.
cleanChevrons :: Pandoc -> Pandoc
cleanChevrons = walk unescape
  where
    unescape (Str t) = Str (decodeChevrons t)
    unescape inline = inline

    decodeChevrons = T.replace "&lt;" "<" . T.replace "&gt;" ">"
-- | Colour-related fixes: red text first, then the marking colours.
colors :: Pandoc -> Pandoc
colors doc = adjustMarkingColors (redText doc)
-- | Swap the @mark@ class for @mark-blue@ on spans whose content is one of
-- the known blue markings.
adjustMarkingColors :: Pandoc -> Pandoc
adjustMarkingColors = walk recolor
  where
    recolor (Span (ident, classes, extras) inlines)
      | "mark" `elem` classes
      , isKnownBlueMarkings inlines =
          Span (ident, "mark-blue" : delete "mark" classes, extras) inlines
    recolor inline = inline
-- | Tell whether the given inlines are exactly one of the hard-coded contents
-- listed below. The comparison is an exact match on the whole inline
-- sequence, so the literals must stay character-for-character as they are.
-- NOTE(review): the list literal is not closed in this view; entries and the
-- closing bracket may be missing. Confirm against the original file.
isKnownBlueMarkings :: [Inline] -> Bool
isKnownBlueMarkings inlines =
fromList inlines `elem`
[ text "bleu"
, text "Nombe d'unités désignée."
, str "GeneralGroupOfEntitiesRef"
, str "Units" -- FIXME not precise enough, might hit wrong markings
, str "xsd:decimal" -- FIXME not precise enough, might hit wrong markings
, str "0:1" -- FIXME not precise enough, might hit wrong markings
, text "RRefrence to GENERAL GROUP OF ENTITIES associé au Third PARTY product."
-- | Wrap some paragraphs made of a single 'Strong' run in a @red@ span; all
-- other blocks are returned unchanged.
-- NOTE(review): the `if <condition>` line that should precede `then` is
-- missing from this view, so which paragraphs qualify cannot be told from
-- here. Confirm against the original file.
redText :: Pandoc -> Pandoc
redText = walk $ \case
untouched@(Para [ Strong inlines ]) ->
then Para [ Span ("", ["red"], []) [ Strong inlines ] ]
else untouched
untouched -> untouched
-- Document structure
-- | Structural clean-up of the document, written as a right-to-left
-- composition: the stages listed last run first. `destination` is only
-- passed on to `cleanImages`.
-- NOTE(review): the composition ends on a dangling `.`; its first-executed
-- stage seems to be missing from this view. Confirm against the original
-- file.
cleanupDocumentStructure :: FilePath -> Pandoc -> Pandoc
cleanupDocumentStructure destination =
fixTermsAndDefinitions .
cleanImages destination .
cleanAnnexBNumbering .
adjoinFigureCaptions .
walk (concatMap (toList . cleanUpSummaryTitles)) .
dropHiddenBlocks .
patchTables .
dropOriginalToC .
-- | Table repairs, written as a right-to-left composition: orphan tables are
-- joined before missing heads are fixed, then empty rows and finally empty
-- tables are dropped.
-- NOTE(review): the composition ends on a dangling `.`; a first-executed
-- stage seems to be missing from this view. Confirm against the original
-- file.
patchTables :: Pandoc -> Pandoc
patchTables =
dropEmptyTables .
dropEmptyRows .
fixMissingTableHead .
fixOrphanTable .
-- | Remove tables in which every head row and every body row is empty
-- (as decided by 'isEmptyRow'). Intermediate head rows of the bodies are not
-- inspected.
dropEmptyTables :: Pandoc -> Pandoc
dropEmptyTables = walk (filter (not . isBlankTable))
  where
    isBlankTable = \case
      Table _ _ _ (TableHead _ headRows) bodies _ ->
        all isEmptyRow headRows && all bodyIsBlank bodies
      _ -> False

    bodyIsBlank (TableBody _ _ _ rows) = all isEmptyRow rows
-- | Remove the empty rows (as decided by 'isEmptyRow') from every table body,
-- both from its intermediate head rows and from its regular rows.
dropEmptyRows :: Pandoc -> Pandoc
dropEmptyRows = walk pruneTable
  where
    pruneTable (Table attr caption colSpecs tableHead tableBodies tableFoot) =
      Table attr caption colSpecs tableHead (map pruneBody tableBodies) tableFoot
    pruneTable block = block

    pruneBody (TableBody attr rhc headRows bodyRows) =
      TableBody attr rhc (keepFilled headRows) (keepFilled bodyRows)

    keepFilled = filter (not . isEmptyRow)
-- | A row counts as empty when all of its cells hold nothing but empty
-- 'Plain' blocks, or when its cells match one of 'hardcodedHiddenRows'.
isEmptyRow :: Row -> Bool
isEmptyRow (Row _ cells) = all cellIsBlank cells || cells `elem` hardcodedHiddenRows
  where
    cellIsBlank (Cell _ _ _ _ blocks) = all (== Plain []) blocks
-- | Rows to treat as hidden, each built by `mkNameOnlyRow` from a name.
-- NOTE(review): only the explanatory comments of the list entries are visible
-- here; the name literals and the closing bracket are missing from this view.
-- Confirm against the original file.
hardcodedHiddenRows :: [ [ Cell ] ]
hardcodedHiddenRows =
mkNameOnlyRow <$>
-- Table numbers in this list are according to the source Word document
[ -- Last row of "Table 19 – TimeDemandType – Element"
, -- 3rd row of "Table 32 – TimeStructureFactor – Element"
, -- last row of "Table 66 – SalesOfferPackage – Element"
, -- 5th row of "Table 68 – SalesOfferPackageElement – Element"
, -- last row of "Table 88 – UsageParameter – Element (abstrait)"
-- | Build the cells of a row whose second cell holds the given name
-- (emphasised, strong and wrapped by `marking`) and whose other cells are
-- empty. Every cell uses default alignment and spans one row and one column.
-- NOTE(review): the list literal is not closed in this view; further cells
-- and the closing bracket may be missing. Confirm against the original file.
mkNameOnlyRow :: T.Text -> [ Cell ]
mkNameOnlyRow name =
let mkCell = cell AlignDefault (RowSpan 1) (ColSpan 1)
in mkCell <$>
[ mempty
, plain $ emph $ strong $ marking (str name)
, mempty
, mempty
, mempty
-- | Reset every column specification of every table to the default alignment
-- and default width, keeping the number of columns.
dropTableColWidths :: Pandoc -> Pandoc
dropTableColWidths = walk resetWidths
  where
    resetWidths (Table attr caption colSpecs tableHead tableBodies tableFoot) =
      Table attr caption (map (const defaultSpec) colSpecs) tableHead tableBodies tableFoot
    resetWidths block = block

    defaultSpec = (AlignDefault, ColWidthDefault)
-- | Give a head to tables that have none: when the table head holds no row,
-- the first row of the first body is promoted to be the head (carrying that
-- body's attributes) and removed from the body.
--
-- Tables that already have head rows are left untouched, and so are
-- headless tables without any body: there is nothing to promote, and
-- matching on the list shape avoids the partial `head` / `tail` calls that
-- would otherwise fail on them.
fixMissingTableHead :: Pandoc -> Pandoc
fixMissingTableHead = walk $ \case
  Table attr caption colSpecs (TableHead _ []) (firstBody : otherBodies) tableFoot ->
    let TableBody bodyAttr rhc intermediateHead rows = firstBody
        promotedHead = TableHead bodyAttr (take 1 rows)
        remainingBody = TableBody bodyAttr rhc intermediateHead (drop 1 rows)
    in Table attr caption colSpecs promotedHead (remainingBody : otherBodies) tableFoot
  untouched -> untouched
-- | Merge two tables that directly follow each other at the top level: the
-- result keeps the attributes, caption, column specs and head of the first
-- table, the bodies of both, and the foot of the second.
fixOrphanTable :: Pandoc -> Pandoc
fixOrphanTable = walk (topLevel (adjoin merge))
  where
    merge (Table attr caption colSpecs tableHead firstBodies _)
          (Table _ _ _ _ nextBodies tableFoot : rest) =
      Table attr caption colSpecs tableHead (firstBodies <> nextBodies) tableFoot : rest
    merge block rest = block : rest
-- | Remove from the top level every block listed in 'hardcodedHiddenBlocks'.
dropHiddenBlocks :: Pandoc -> Pandoc
dropHiddenBlocks = walk (topLevel (filter isVisible))
  where
    isVisible block = block `notElem` hardcodedHiddenBlocks
-- | Wrap inlines in a span carrying the @mark@ class.
marking :: Inlines -> Inlines
marking inlines = spanWith ("", ["mark"], []) inlines
-- | Marked paragraphs (table titles) that `dropHiddenBlocks` removes from the
-- document. Blocks are compared for equality, so the literals must match the
-- parsed document exactly.
-- NOTE(review): the list literal is not closed in this view, and a plain list
-- is written where `Blocks` is expected; a combinator and the closing bracket
-- seem to be missing. Confirm against the original file.
hardcodedHiddenBlocks :: Blocks
hardcodedHiddenBlocks =
[ para $ marking $
text "Table 45 \8211 " <> emph (str "AccessRightInProduct") <> text " \8211 Element"
, para $ marking $
text "Table 109 \8211 " <> emph (str "LimitingRule") <> text " \8211 Element"
-- | Unwrap the "Exemples" level-1 header from the single-item ordered list
-- (start 2, upper-alpha numbering) that encloses it in the source document.
cleanAnnexBNumbering :: Pandoc -> Pandoc
cleanAnnexBNumbering = walk (concatMap unwrap)
  where
    unwrap = \case
      OrderedList (2, UpperAlpha, DefaultDelim)
                  [ item@[ Header 1 ("", [], []) [ Str "Exemples" ] ] ] -> item
      block -> [ block ]
-- | Replace the anchored title blocks of the source document by clean
-- equivalents (bold paragraphs or level-1 headers). The first matching title
-- wins; any other block is returned as is.
cleanUpSummaryTitles :: Block -> Blocks
cleanUpSummaryTitles b =
  case [ replacement | (title, replacement) <- rewrites, isAnchored title b ] of
    replacement : _ -> replacement
    [] -> singleton b
  where
    boldPara = para . strong . text
    rewrites =
      [ ("Avant-propos", boldPara "Avant-propos")
      , ("Introduction", boldPara "Introduction")
      , ("Usage Parameters", header 1 (text "Usage Parameters"))
      , ("Exemples", header 1 (text "Exemples"))
      , ("Bibliographie", headerWith ("bibliographie", [], []) 1 (text "Bibliographie"))
      ]
-- | Drop every top-level block that comes before the anchored
-- "Avant-propos" title.
dropCoverPage :: Pandoc -> Pandoc
dropCoverPage = walk (topLevel (dropUntil startsForeword))
  where
    startsForeword = isAnchored "Avant-propos"
-- | Tell whether a block is an anchored title with the given text: either a
-- paragraph starting with an empty @anchor@ span, or a single-item ordered
-- list holding such a paragraph after a line break.
isAnchored :: T.Text -> Block -> Bool
isAnchored textContent block =
  case block of
    Para (Span (_, ["anchor"], []) [] : content) ->
      content == expected
    OrderedList _ [ [ Para (LineBreak : Span (_, ["anchor"], []) [] : content) ] ] ->
      content == expected
    _ -> False
  where
    expected = toList (text textContent)
-- | Remove the table of contents of the source document: everything from the
-- "Sommaire Page" paragraph up to (but excluding) the anchored
-- "Introduction" title.
dropOriginalToC :: Pandoc -> Pandoc
dropOriginalToC = walk (dropWindow startsToC endsToC)
  where
    startsToC = (== Para [ Str "Sommaire" , Space , Str "Page" ])
    endsToC = isAnchored "Introduction"
-- Wrapping annexes in a div lets us properly number the annexes in the document via CSS
-- | Split the top-level blocks at the "Usage Parameters" level-1 header and
-- wrap everything from that header onwards in an @annexes@ div. The div is
-- appended even when the header is absent, in which case it is empty.
wrapAnnexes :: Pandoc -> Pandoc
wrapAnnexes = walk (topLevel splitAndWrap)
  where
    splitAndWrap blocks =
      let (body, annexes) = break startsAnnexes blocks
      in body ++ [ Div ("", ["annexes"], []) annexes ]

    startsAnnexes (Header 1 _ content) = fromList content == text "Usage Parameters"
    startsAnnexes _ = False
-- | Run 'replaceSnippets' over the top-level blocks of the document.
injectXMLExamples :: PandocMonad m => Pandoc -> m Pandoc
injectXMLExamples doc = walkM (topLevelM replaceSnippets) doc
-- | Replace each listed window of blocks by an external example. Every entry
-- is (example file, id of the header opening the window, id of the header
-- closing it, excluded) and is handed to `replaceExample`.
-- The replacements are chained with `<=<` through a left fold starting from
-- `pure`, so the entry listed last is applied to the blocks first.
-- NOTE(review): every example file path is the empty string in this view, and
-- the list literal is not closed; values seem to be missing. Confirm against
-- the original file.
replaceSnippets :: PandocMonad m => [Block] -> m [Block]
replaceSnippets =
let replaceExamples examples = foldl (<=<) pure (replaceExample' <$> examples)
replaceExample' (a,b,c) = replaceExample a b c
in replaceExamples
[ ( "", "exemples" , "régle-dapplication-des-caractéristiques-qualitystructurefactor" )
, ( "", "exemple" , "tarif-version-de-loffre-tarifaire-tariff" )
, ( "", "exemple-1" , "les-éléments-de-structure-de-tarification-temporelle" )
, ( "", "exemple-2" , "les-éléments-de-structure-de-tarification-géographique" )
, ( "", "exemple-3" , "élément-de-matrice-de-distances-distancematrixelement" )
, ( "", "exemple-4" , "les-élément-validables-validableelement" )
, ( "" , "exemple-5" , "les-élément-contrôlable-controllableelement" )
, ( "" , "exemple-6" , "les-offre-à-la-vente-salespackageoffer" )
, ( "" , "exemple-7" , "document-de-voyage" )
, ( "", "exemples-1" , "distributionchannel" )
, ( "", "exemples-2" , "les-droits-daccès-et-conditions-de-validité-validity-parameters" )
, ( "" , "exemples-3" , "conditions-dutilisation-usage-parameter" )
, ( "" , "exemples-4" , "les-grilles-tarifaires-faretable" )
, ( "" , "exemples-5" , "les-prix" )
, ( "" , "exemple-minimal" , "entêtes-netex" )
, ( "" , "tarif-simple" , "tarif-minimal-ultra-simplifié" )
, ( "" , "tarif-minimal-ultra-simplifié", "bus-ile-de-france" )
, ( "" , "bus-ile-de-france" , "tgv-paris-lille" )
, ( "" , "tgv-paris-lille" , "leman-express" )
, ( "" , "leman-express" , "tarif-kilométrique-ferré-ter" )
, ( "" , "tarif-kilométrique-ferré-ter" , "bibliographie" )
-- | Read the example stored at @source@ and splice it into @content@ in
-- place of the window that starts at the block matching @startTarget@ and
-- ends just before the block matching @endTarget@.
replaceExample :: PandocMonad m => FilePath -> BlockTarget -> BlockTarget -> [Block] -> m [Block]
replaceExample source startTarget endTarget content =
  splice <$> readExample source
  where
    splice replacement =
      replaceWindow replacement (matchTarget startTarget) (matchTarget endTarget) content
-- | Read the file at `path`, relative to "NeTEx/tarifs/examples", decode it
-- as UTF-8, parse it as markdown with the GitHub extensions enabled, and
-- return the blocks of the parsed document (its metadata is discarded).
-- NOTE(review): the record update on `def` is not closed in this view;
-- further options and the closing brace may be missing. Confirm against the
-- original file.
readExample :: PandocMonad m => FilePath -> m [Block]
readExample path =
let getContent (Pandoc _ bs) = bs
opts = def
{ readerExtensions = githubMarkdownExtensions
in getContent <$> (readMarkdown opts . decodeUtf8 =<< readFileStrict ("NeTEx/tarifs/examples" </> path))
-- | Reset the attributes of every image and rewrite its target through
-- 'cleanImageTarget'.
cleanImages :: FilePath -> Pandoc -> Pandoc
cleanImages mediaLocation = walk relocate
  where
    relocate (Image _ alt target) =
      Image nullAttr alt (cleanImageTarget mediaLocation target)
    relocate inline = inline
-- | Make an image target relative to the media location and point EMF images
-- at their PNG conversion.
--
-- The @prefix/@ part is removed when the URL starts with it. Only a trailing
-- @.emf@ extension is swapped for @.png@; an @.emf@ occurring elsewhere in
-- the path is left alone. The title is returned unchanged.
cleanImageTarget :: FilePath -> Target -> Target
cleanImageTarget prefix (url, title) = (toPng relativeUrl, title)
  where
    relativeUrl = fromMaybe url (T.stripPrefix (fromString prefix <> "/") url)
    toPng path = maybe path (<> ".png") (T.stripSuffix ".emf" path)
-- | At the top level, a paragraph holding only @3.@ followed by another
-- paragraph is merged into a level-2 header made of the second paragraph's
-- content, with strong emphasis removed.
fixTermsAndDefinitions :: Pandoc -> Pandoc
fixTermsAndDefinitions = walk (topLevel (adjoin joinNumbering))
  where
    joinNumbering (Para [ Str "3." ]) (Para title : rest) =
      Header 2 nullAttr (walk (concatMap dropStrong) title) : rest
    joinNumbering block rest = block : rest
-- | Merge a paragraph holding a single image with the paragraph that follows
-- it: the second paragraph becomes the caption, emphasised as a whole, with
-- its inner emphasis and any leading @Figure N –@ prefix removed, and joined
-- to the image by a soft break.
adjoinFigureCaptions :: Pandoc -> Pandoc
adjoinFigureCaptions = walk (adjoin attachCaption)
  where
    attachCaption (Para figure@[Image {}]) (Para caption : rest) =
      Para (figure ++ SoftBreak : asCaption caption) : rest
    attachCaption block rest = block : rest

    asCaption inlines =
      [ Emph (withoutNumber (walk (concatMap dropEmphasis) inlines)) ]

    withoutNumber (Str "Figure" : Space : Str _ : Space : Str "\8211" : Space : rest) = rest
    withoutNumber inlines = inlines
-- Pandoc utilities
-- | Read the file at the given path and parse it as a docx document with the
-- default reader options.
parseSourceDocument :: FilePath -> PandocIO Pandoc
parseSourceDocument path = readFileLazy path >>= readDocx def
-- | Render the document with the CommonMark writer and encode the result as
-- UTF-8. The writer options copy the matching fields of pandoc's
-- `defaultOpts`, except for the extensions, which are set to the GitHub
-- markdown ones.
-- NOTE(review): the record update on `def` is not closed in this view;
-- further fields and the closing brace may be missing. Confirm against the
-- original file.
renderMarkdown :: Pandoc -> PandocIO C8.ByteString
renderMarkdown doc =
let opts = defaultOpts
writerOptions = def
{ writerTabStop = optTabStop opts
, writerTableOfContents = optTableOfContents opts
, writerHTMLMathMethod = optHTMLMathMethod opts
, writerIncremental = optIncremental opts
, writerCiteMethod = optCiteMethod opts
, writerNumberSections = optNumberSections opts
, writerNumberOffset = optNumberOffset opts
, writerSectionDivs = optSectionDivs opts
, writerExtensions = githubMarkdownExtensions
, writerReferenceLinks = optReferenceLinks opts
, writerReferenceLocation = optReferenceLocation opts
, writerDpi = optDpi opts
, writerWrapText = optWrap opts
, writerColumns = optColumns opts
, writerEmailObfuscation = optEmailObfuscation opts
, writerIdentifierPrefix = optIdentifierPrefix opts
, writerHtmlQTags = optHtmlQTags opts
, writerTopLevelDivision = optTopLevelDivision opts
, writerListings = optListings opts
, writerSlideLevel = optSlideLevel opts
, writerSetextHeaders = optSetextHeaders opts
, writerListTables = optListTables opts
, writerEpubSubdirectory = T.pack $ optEpubSubdirectory opts
, writerEpubFonts = optEpubFonts opts
, writerEpubTitlePage = optEpubTitlePage opts
, writerSplitLevel = optSplitLevel opts
, writerTOCDepth = optTOCDepth opts
, writerReferenceDoc = optReferenceDoc opts
, writerPreferAscii = optAscii opts
in encodeUtf8 <$> writeCommonMark writerOptions doc
-- | Write the given bytes to the file at @path@.
write :: FilePath -> C8.ByteString -> PandocIO ()
write path contents = liftIO (C8.writeFile path contents)
-- | Print one line for every entry of the media bag that needs a conversion.
advertiseMediaConversions :: PandocIO ()
advertiseMediaConversions = do
  bag <- getMediaBag
  let pending = filter isToBeConverted (mediaDirectory bag)
  liftIO (traverse_ advertiseMediaConversion pending)
-- | A media entry needs a conversion when its MIME type is @image/x-emf@.
isToBeConverted :: (FilePath, MimeType, Int) -> Bool
isToBeConverted (_, mimeType, _) = "image/x-emf" == mimeType
-- | For an @image/x-emf@ entry, print the file name next to the @.png@ name
-- expected for its conversion; do nothing for any other MIME type.
advertiseMediaConversion :: (FilePath, MimeType, Int) -> IO ()
advertiseMediaConversion (file, mt, _)
  | mt == "image/x-emf" = print (file <> " -> " <> replaceExtension file ".png")
  | otherwise = pure ()
-- Pandoc AST utilities
-- Utility to potentially merge 2 following items in a list
-- | Rebuild a list from the right: the step function receives each item
-- together with the already rebuilt tail and returns the new tail, which lets
-- it merge the item with the one(s) following it.
adjoin :: (item -> [item] -> [item]) -> [item] -> [item]
adjoin f = go
  where
    go [] = []
    go (x : xs) = f x (go xs)
-- Restrict walking to the top level (to be used with `walk`).
-- | Apply a transformation to the top-level block list of a document only,
-- keeping the metadata (to be used with `walk`).
topLevel :: ([Block] -> [Block]) -> Pandoc -> Pandoc
topLevel f doc =
  case doc of
    Pandoc meta blocks -> Pandoc meta (f blocks)
-- Monadic variant of `topLevel` (to be used with `walkM`).
-- | Effectful variant of `topLevel` (to be used with `walkM`).
topLevelM :: Functor m => ([Block] -> m [Block]) -> Pandoc -> m Pandoc
topLevelM f doc =
  case doc of
    Pandoc meta blocks -> fmap (Pandoc meta) (f blocks)
-- | How to designate a block: by the identifier of a header, or by the level
-- of a header.
data BlockTarget
  = HasId T.Text
  | IsHeader Int

-- | A string literal designates a header by its identifier.
instance IsString BlockTarget where
  fromString identifier = HasId (fromString identifier)
-- | Turn a target into the predicate recognising the blocks it designates.
matchTarget :: BlockTarget -> Block -> Bool
matchTarget (HasId i) = hasId i
matchTarget (IsHeader lvl) = isHeaderOfLevel lvl
-- | True for a header whose identifier equals the expected one; False for
-- any other block.
hasId :: T.Text -> Block -> Bool
hasId expected block
  | Header _ (observed, _, _) _ <- block = observed == expected
  | otherwise = False
-- | True for a header of the expected level; False for any other block.
isHeaderOfLevel :: Int -> Block -> Bool
isHeaderOfLevel expected block
  | Header observed _ _ <- block = observed == expected
  | otherwise = False
-- | Remove the items from the first one matching the start predicate up to,
-- but excluding, the next one matching the end predicate.
dropWindow :: (a -> Bool) -> (a -> Bool) -> [a] -> [a]
dropWindow isStart isEndExcluded = replaceWindow [] isStart isEndExcluded
-- | Rebuild the list as: everything before the first item matching
-- @isStart@, then the replacement, then everything from the first item
-- matching @isEndExcluded@ found at or after that start. The replacement is
-- inserted even when no item matches @isStart@.
replaceWindow :: [a] -> (a -> Bool) -> (a -> Bool) -> [a] -> [a]
replaceWindow replacement isStart isEndExcluded items =
  before ++ replacement ++ after
  where
    before = takeUntil isStart items
    after = dropUntil isEndExcluded (dropUntil isStart items)
-- | Replace an emphasis by its content; any other inline is kept as is.
dropEmphasis :: Inline -> [Inline]
dropEmphasis = \case
  Emph inlines -> inlines
  inline -> [ inline ]
-- | Replace a strong emphasis by its content; any other inline is kept as is.
dropStrong :: Inline -> [ Inline ]
dropStrong (Strong inlines) = inlines
dropStrong inline = pure inline
-- | Drop the leading items until one satisfies the predicate; that item and
-- everything after it are kept.
dropUntil :: (a -> Bool) -> [a] -> [a]
dropUntil p = snd . break p
-- | Keep the leading items up to, but excluding, the first one satisfying
-- the predicate.
takeUntil :: (a -> Bool) -> [a] -> [a]
takeUntil p = fst . break p
