Created
March 21, 2013 02:50
-
-
Save heath/5210339 to your computer and use it in GitHub Desktop.
HAML Parsers in Haskell
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import Text.ParserCombinators.Parsec | |
import Text.ParserCombinators.Parsec.Language | |
import Text.ParserCombinators.Parsec.Pos | |
import qualified Text.ParserCombinators.Parsec.Token as T | |
import Data.Char | |
import Data.List | |
import Data.Maybe | |
import System.IO.Unsafe | |
main = do s <- getContents | |
case (parse mainParser "stdin" s) of | |
Left err -> putStrLn "Error: " >> print err | |
Right hs -> putStrLn hs | |
-- Try to parse HAML, otherwise re-output raw lines | |
mainParser = do whiteSpace | |
ls <- many1 (hamlCode <|> tilEOL) | |
return $ unlines ls | |
-- | |
-- * HAML lexer | |
-- | |
hamlLexer = T.makeTokenParser emptyDef | |
whiteSpace= T.whiteSpace hamlLexer | |
lexeme = T.lexeme hamlLexer | |
symbol = T.symbol hamlLexer | |
natural = T.natural hamlLexer | |
parens = T.parens hamlLexer | |
semi = T.semi hamlLexer | |
squares = T.squares hamlLexer | |
stringLiteral= T.stringLiteral hamlLexer | |
identifier= T.identifier hamlLexer | |
reserved = T.reserved hamlLexer | |
reservedOp= T.reservedOp hamlLexer | |
commaSep1 = T.commaSep1 hamlLexer | |
-- | |
-- * Main HAML parsers | |
-- | |
-- hamlCode is just many identifiers followed by = followed by a hamlBlock | |
-- f a b c = %somehaml | |
hamlCode = try ( do is <- many1 identifier | |
symbol "=" | |
currentPos <- getPosition | |
x <- manyTill1 | |
(lexeme $ hamlBlock) | |
(notSameIndent currentPos) | |
return $ (concat $ intersperse " " is) ++ | |
" = \n" ++ | |
(concat $ (intersperse (indent currentPos ++ "+++\n") $ filter (not . null) $ x)) | |
) | |
-- A Block may start with some whitespace, then has a valid bit of data | |
hamlBlock = do currentPos <- getPosition | |
bs <- manyTill1 | |
(pTag <|> pText) | |
(notSameIndent currentPos) | |
return $ intercalate (indent currentPos ++ "+++\n") bs | |
pTag = do currentPos <- getPosition | |
try | |
(do t <- lexeme tagParser | |
ts <- (isInline currentPos >> char '/' >> return []) <|> | |
(hamlBlock) | |
return $ intercalate "\n" $ filter (not . null) $ | |
[ (indent currentPos) ++ "((" ++ (if (null ts) then "i" else "") ++ t ++ ")" | |
, if null ts then [] else ts | |
, (indent currentPos) ++ ")\n"] | |
) | |
pText = lexeme stringParser | |
notSameIndent p = (eof >> return []) <|> | |
(do innerPos <- getPosition | |
case (sourceColumn p) == (sourceColumn innerPos) of | |
True -> pzero | |
False -> return [] | |
) | |
-- | |
-- * Various little parsers | |
-- | |
tagParser :: CharParser () String | |
tagParser = do t <- optionMaybe tagParser' | |
i <- optionMaybe idParser | |
c <- optionMaybe (many1 classParser) | |
a <- optionMaybe attributesParser | |
if (isJust t || isJust i || isJust c || isJust a) | |
then | |
do return $ "tag \"" ++ (fromMaybe "div" t) ++ "\"" ++ | |
(if not (isJust i || isJust c || isJust a) then "" else | |
concat $ | |
[ "![" | |
, intercalate ", " $ filter (not . null) | |
[ (maybe "" (\i' -> "strAttr \"id\" \"" ++ i' ++ "\"") i) | |
, (maybe "" (\c' -> "strAttr \"class\" \"" ++ (intercalate " " c') ++ "\"") c) | |
, (maybe "" (\kv -> intercalate ", " $ map (\(k,v) -> "strAttr \"" ++ k ++ "\" (" ++ v ++ ")") kv) a) | |
] | |
, "]"] | |
) | |
else pzero | |
tagParser' :: CharParser () String | |
tagParser' = do char '%' | |
many1 termChar | |
idParser :: CharParser () String | |
idParser = do char '#' | |
many1 termChar | |
classParser :: CharParser () String | |
classParser = do char '.' | |
many1 termChar | |
attributesParser :: CharParser () [(String, String)] | |
attributesParser = squares (commaSep1 attributeParser) | |
attributeParser :: CharParser () (String, String) | |
attributeParser = do k <- identifier | |
symbol "=" | |
cs <- many1 identifier | |
return (k, intercalate " " cs) | |
stringParser :: CharParser () String | |
stringParser = do currentPos <- getPosition | |
modifier <- optionMaybe (char '=' <|> char '-') | |
whiteSpace | |
c <- alphaNum | |
cs<- tilEOL | |
case modifier of | |
Just '-' -> return $ (indent currentPos) ++ "-" ++ c:cs | |
Just '=' -> return $ (indent currentPos) ++ "(stringToHtml " ++ c:cs ++ ")" | |
Nothing -> return $ (indent currentPos) ++ "(stringToHtml \"" ++ c:cs ++ "\")" | |
-- | |
-- * Utility functions | |
-- | |
isInline p = do p2 <- getPosition | |
case (sourceLine p ) == (sourceLine p2) of | |
True -> return [] | |
False -> pzero | |
isSameIndent p1 p2 = (sourceColumn p1) == (sourceColumn p2) | |
tilEOL = manyTill1 (noneOf "\n") eol | |
eol = newline <|> (eof >> return '\n') | |
termChar = satisfy (\c -> (isAlphaNum c) || (c `elem` termPunctuation) ) | |
termPunctuation = "-_" | |
indent p = take (sourceColumn (p) - 1) (repeat ' ') | |
manyTill1 p e = do ms <- manyTill p e | |
case (null ms) of | |
True -> pzero | |
False -> return ms |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import Text.ParserCombinators.Parsec | |
import Text.ParserCombinators.Parsec.Expr | |
import Text.ParserCombinators.Parsec.Language | |
import Text.ParserCombinators.Parsec.Perm | |
import qualified Text.ParserCombinators.Parsec.Token as T | |
import Data.Monoid | |
import Control.Applicative hiding (many, (<|>)) | |
type StatefulCharParser a = CharParser Int a | |
type TagContent = String | |
data TagIdentifier = Class String | Id String | TagName String deriving (Show) | |
data Tag = MultiLineTag [TagIdentifier] | SingleLineTag [TagIdentifier] TagContent deriving (Show) | |
data Tree = BranchingNode String [Tree] | |
| Node String | |
deriving (Show) | |
lexer = T.makeTokenParser haskellDef | |
ident = T.identifier lexer | |
identifier = T.identifier lexer | |
dot = T.dot lexer | |
whiteSpace = T.whiteSpace lexer | |
lexeme = T.lexeme lexer | |
symbol = T.symbol lexer | |
natural = T.natural lexer | |
parens = T.parens lexer | |
semi = T.semi lexer | |
reservedOp = T.reservedOp lexer | |
whitespace = T.whiteSpace lexer | |
eol = lexeme (string "\n") | |
word :: CharParser Int String | |
word = do | |
many1 letter | |
manyTill1 p e = do ms <- manyTill p e | |
case (null ms) of | |
True -> pzero | |
False -> return ms | |
parseFile p file = | |
do | |
content <- readFile file | |
run p content | |
run p input | |
= case runParser p 0 "" input of | |
Right n -> putStrLn (concat (map (spitTree "") n)) | |
Left err -> do{ putStr "parse error at " | |
; print err } | |
total :: StatefulCharParser [Tree] | |
total = do many1 consumeTopLevelChunk | |
consumeTopLevelChunk = do { setState 0 | |
; topLevelChunk } | |
topLevelChunk :: StatefulCharParser Tree | |
topLevelChunk = try (do { spaces <- consumeSpaces | |
; line <- (manyTill anyChar (string "\n")) | |
; branches <- many1 doSubChunk | |
; return (BranchingNode line branches) | |
} ) | |
<|> | |
do {consumeSpaces | |
; line <- (manyTill anyChar (string "\n")) | |
; return (Node line)} | |
doSubChunk :: StatefulCharParser Tree | |
doSubChunk = do | |
spaces <- getState | |
-- Check the next indentation level down | |
setState $ spaces + 2 | |
chunk <- topLevelChunk | |
-- Reset back to the old indentation level | |
setState $ spaces | |
return chunk | |
consumeSpaces = try (do { spaces <- getState | |
; count spaces (string " ") | |
; return spaces }) | |
spitTree :: String -> Tree -> String | |
spitTree prefix (BranchingNode content branches) = prefix ++ content ++ "\n" ++ (concat (map (spitTree (prefix ++ " ")) branches)) | |
spitTree prefix (Node content) = prefix ++ content ++ "\n" | |
-- line :: CharParser Int String | |
-- line = do{ indentCount <- (countParser (string " ")) | |
-- ; updateState (+1) | |
-- ; many letter | |
-- } | |
-- myIdentifier = identifier lexer | |
-- simple :: Parser Char | |
-- simple = letter | |
-- openClose :: Parser Char | |
-- openClose = do { char '(' | |
-- ; char ')' | |
-- } | |
-- parens :: Parser () | |
-- parens = do | |
-- char '(' | |
-- parens | |
-- char ')' | |
-- parens | |
-- <|> | |
-- return () | |
-- nesting :: Parser Int | |
-- nesting = do | |
-- char '(' | |
-- n <- nesting | |
-- char ')' | |
-- m <- nesting | |
-- return (max (n+1) m) | |
-- <|> | |
-- return 0 | |
-- word :: Parser String | |
-- word = many1 (letter <?> "") <?> "word" | |
-- separator :: Parser () | |
-- separator = skipMany1 (space <|> char ',' <?> "") | |
-- sentence :: Parser [String] | |
-- sentence = do{ words <- sepBy1 word separator | |
-- ; oneOf ".?!" <?> "end of sentence" | |
-- ; return words | |
-- } | |
-- -- sentences :: Parser [[String]] | |
-- -- sentences = do{ sentences <- sepBy1 sentence separator | |
-- -- ; return sentences | |
-- -- } | |
-- expr :: Parser Integer | |
-- expr = buildExpressionParser table factor <?> "expression" | |
-- table = [[op "*" (*) AssocLeft, op "/" div AssocLeft], | |
-- [op "+" (+) AssocLeft, op "-" (-) AssocLeft]] | |
-- where op s f assoc = Infix (do{ reservedOp s; return f}) assoc | |
-- factor = parens expr | |
-- <|> natural | |
-- <?> "simple expression" | |
-- number :: Parser Integer | |
-- number = do{ ds <- many1 digit | |
-- ; return (read ds) | |
-- } | |
-- <?> "number" | |
-- klasses :: Parser [String] | |
-- klasses = do { many1 klass } <?> "classes" | |
-- klass :: Parser String | |
-- klass = do { dot | |
-- ; identifier | |
-- } | |
-- <?> "class" | |
-- -- blocks consisting of blocks or lines | |
-- -- lines consist of tag names, and/or class names and/or ids |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment