Created
October 25, 2013 16:29
-
-
Save danchoi/7157539 to your computer and use it in GitHub Desktop.
Draft HXT Haskell code to turn Microsoft exported HTML into nested HTML lists
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} | |
module Main where | |
import Text.XML.HXT.Core | |
import System.Environment | |
import Data.List | |
import Control.Arrow.ArrowNavigatableTree | |
import Text.XML.HXT.XPath.Arrows | |
import Data.Tree.NTree.TypeDefs | |
import Data.Tree.NavigatableTree.Class | |
isListPara = isElem >>> hasName "p" >>> hasAttrValue "class" (== "MsoListParagraph") | |
isListParaLevel n = isListPara >>> hasAttrValue "style" (("level"++(show n)) `isInfixOf`) | |
-- Nests a flat list of <p> tags into parent <p> tags by "leveln" style infix | |
fixLevel n = | |
withoutNav (isListParaLevel n <+> (neg isListPara)) | |
>>> | |
replaceChildren ( | |
getChildren | |
<+> | |
(listA | |
(followingSiblingAxis >>> filterAxis isListPara) | |
>>> | |
arr (takeFollowingSibsAtLevel (n + 1)) | |
>>> | |
unlistA | |
>>> fixLevel (n+1) | |
) | |
) | |
where takeFollowingSibsAtLevel n ys@(x:_) = takeWhile (isLevel n . toTree) ys | |
takeFollowingSibsAtLevel _ _ = [] | |
isLevel n t@(NTree (XTag _ xs) _ ) | |
| (not.null) | |
[ x | | |
x@(NTree (XAttr q) [NTree (XText s)_]) <- xs, | |
localPart q == "style", | |
("mso-list:l0 level" ++ (show n ++) " lfo1") `isInfixOf` s | |
] | |
= True | |
isLevel _ _ = False | |
-- this converts nested <p> tags into <ul><li> | |
pToUL = isListPara | |
>>> | |
changeElemName (const $ mkName "ul") | |
>>> | |
removeAttr "style" -- remove MSO styling | |
>>> | |
replaceChildren ( | |
(selem "li" [ | |
(getChildren >>> (neg isListPara)) | |
<+> | |
(getChildren >>> isListPara >>> pToUL) | |
]) | |
) | |
main = do | |
args <- getArgs | |
input <- case args of | |
[infile] -> readFile infile | |
otherwise -> getContents | |
res <- runX (readString [ | |
withValidate no,withWarnings no ,withParseHTML yes | |
,withInputEncoding utf8 | |
] input | |
>>> | |
{- | |
processTopDown (deep none `when` (hasText ("![endif]" `isInfixOf`))) | |
>>> | |
processTopDown (deep none `when` (hasText (" `isInfixOf`))) | |
>>> | |
-} | |
processTopDown | |
( | |
( | |
deep isElem | |
>>> addNav >>> getChildren | |
>>> fixLevel 1 >>> remNav | |
>>> pToUL | |
) | |
`when` | |
(hasAttrValue "class" ("WordSection" `isPrefixOf`)) | |
) | |
>>> | |
processTopDown (deep none `when` (isElem >>> hasName "style")) | |
>>> | |
processTopDown (deep none `when` (isAttr >>> hasQName (mkName "style"))) | |
>>> | |
processTopDown (deep none `when` (isElem >>> hasAttrValue "style" (== "mso-list:Ignore"))) | |
>>> | |
processTopDown (deep none `when` (isElem >>> hasAttrValue "style" (== "font-family:Symbol"))) | |
>>> | |
writeDocument [ | |
-- withShowTree yes, | |
withIndent yes | |
,withOutputEncoding usAscii | |
] "-" | |
) | |
return () | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment