Created
June 6, 2012 18:18
-
-
Save kirelagin/2883710 to your computer and use it in GitHub Desktop.
Parsgen
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!<! | |
module Parse where | |
!>! | |
s -> e { #val = !<! #1.val !>! } | |
e -> t e' { #val = !<! #2.val !>! | |
#t = !<! #1.val !>! | |
#tt = !<! #1.val !>! } | |
e' -> P t e' { #val = !<! #3.val !>! | |
#tt = !<! ##.t + #2.val !>! } | |
e' -> _ { #val = !<! ##.tt !>! } | |
t -> f t' { #val = !<! #2.val !>! | |
#f = !<! #1.val !>! | |
#ff = !<! #1.val !>! } | |
t' -> X f t' { #val = !<! #3.val !>! | |
#ff = !<! ##.f * #2.val !>! } | |
t' -> _ { #val = !<! ##.ff !>! } | |
f -> L e R { #val = !<! #2.val !>! } | |
f -> N { #val = !<! #1.val !>! } | |
!<! | |
data Token = P () | |
| X () | |
| L () | |
| R () | |
| N Int | |
| Tend () | |
deriving (Eq, Show) | |
type AttrType = Int | |
!>! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Data.List | |
import Data.Map (Map) | |
import qualified Data.Map as M | |
import Control.Monad.State | |
type AttrsType = Map String AttrType | |
curToken :: State [Token] Token | |
curToken = state $ f where | |
f s@[] = (Tend (), s) | |
f s@(t:_) = (t, s) | |
parse ts = let attrs = fst (runState (parse_s M.empty) (ts ++ repeat (Tend ()))) in attrs | |
-------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import Control.Arrow | |
import Control.Monad.State | |
import Data.List | |
import Data.Map (Map) | |
import qualified Data.Map as M | |
import Data.Set (Set) | |
import qualified Data.Set as S | |
import Language.Haskell.Exts (prettyPrint, parseModuleWithMode, ParseResult (..), preludeFixities, ParseMode (..), Extension (..)) | |
import System.IO | |
import Text.Regex | |
import Types | |
import Parsegram (parsegram, tokenise) | |
main = do | |
inp <- getContents | |
let (before, parsed, after) = parsegram . tokenise $ inp | |
rules = process $ parsed | |
text = before ++ "\n" ++ boilerplate ++ (translateGrammar rules) ++ "\n" ++ after | |
hPutStrLn stderr $ showFirsts rules | |
hPutStrLn stderr $ "---" | |
hPutStrLn stderr $ showFollows rules | |
--putStrLn $ text | |
--putStrLn $ "----------!!!-------" | |
putStrLn . codeFormat $ text | |
process :: [(Nonterminal, Rule)] -> NontermRules | |
process l = M.fromListWith (++) $ map (second (\x -> [x])) l | |
calcFirst :: NontermRules -> [RuleItem]-> Set Terminal | |
calcFirst _ [] = S.singleton "_eps" | |
calcFirst _ ((Term t):cs) = S.singleton t | |
calcFirst r ((Nonterm n):cs) = nontermFirst n | |
where nontermFirst n = foldr S.union S.empty (map (calcFirst r . getItems) $ r M.! n) | |
calcFollow :: NontermRules -> Nonterminal -> Set Terminal | |
calcFollow r n = M.foldrWithKey (\k v b -> followNonterm k v `S.union` b) (if n == "s" then S.singleton "Tend" else S.empty) r | |
where followNonterm :: Nonterminal -> [Rule] -> Set Terminal | |
followNonterm a rules = foldr (\v b -> followRule v `S.union` b) S.empty rules | |
where followRule :: Rule -> Set Terminal | |
followRule (rule,_) = followItems rule | |
followItems :: [RuleItem] -> Set Terminal | |
followItems [] = S.empty | |
followItems ((Term _):cs) = followItems cs | |
followItems ((Nonterm t):cs) = (if t == n then if "_eps" `S.member` firstG then (S.delete "_eps" firstG) `S.union` (if t == a then S.empty else calcFollow r a) else firstG else S.empty) `S.union` followItems cs | |
where firstG = calcFirst r cs | |
-- Translation | |
say = modify . flip (++) | |
sayLn s = say s >> say "\n" | |
saySt s = say s >> say ";" >> say "\n" | |
magic m = snd $ flip runState "" m | |
translateGrammar :: NontermRules -> String | |
translateGrammar r = intercalate "\n" $ M.foldrWithKey (\k v b -> translateNontermRules r k v : b) [] r | |
translateNontermRules :: NontermRules -> Nonterminal -> [Rule] -> String | |
translateNontermRules r n rules = magic $ do | |
sayLn $ "parse_" ++ n ++ " :: AttrsType -> State [Token] AttrsType" | |
sayLn $ "parse_" ++ n ++ " pAttrs = do {" | |
saySt $ "t <- curToken" | |
sayLn $ "case t of {" | |
saySt $ translateRulesList rules | |
saySt $ "}" | |
sayLn $ "}" | |
where translateRulesList :: [Rule] -> String | |
translateRulesList [] = "_ -> error $ \"Unexpected token \" ++ show t;" | |
translateRulesList (rule:rs) = translateRule r n rule ++ (translateRulesList rs) | |
translateRule :: NontermRules -> Nonterminal -> Rule -> String | |
translateRule r n rule = | |
let first = calcFirst r items | |
follow = calcFollow r n | |
chklist = if "_eps" `S.member` first then (S.delete "_eps" first) `S.union` follow else first | |
in | |
intercalate ";\n" . S.toList $ S.map translType chklist | |
where translType tp = magic $ do | |
sayLn $ tp ++ " _ -> do {" | |
saySt $ "s1 <- get" | |
sayLn $ "let {" | |
saySt $ "parentAttrs = (" ++ translateAttrs attrs ++ ") `M.union` pAttrs" | |
say $ intercalate "\n" $ map (uncurry translateItem) (zip [1..] items) | |
sayLn $ "};" | |
saySt $ "put s" ++ show ((length items)+1) | |
saySt $ "return parentAttrs" | |
sayLn $ "};" | |
(items, attrs) = rule | |
translateItem number (Nonterm n) = magic $ do | |
let mys = show number | |
nexts = show (number+1) | |
saySt $ "(attrs_" ++ mys ++ ", s" ++ nexts ++ ") = runState (parse_" ++ n ++ " parentAttrs) s" ++ mys | |
translateItem number (Term t) = magic $ do | |
let mys = show number | |
nexts = show (number+1) | |
sayLn $ "(attrs_" ++ mys ++ ", s" ++ nexts ++ ") = case (head s" ++ mys ++ ") of {" | |
saySt $ "(" ++ t ++ " val) -> (M.singleton \"val\" val, tail s" ++ mys ++ ");" | |
saySt $ "_ -> error $ \"Unexpected token \" ++ show t ++ \". " ++ t ++ " expected.\"" | |
sayLn $ "};" | |
translateAttrs :: [(String, Code)] -> String | |
translateAttrs attrs = magic $ do | |
say $ "M.fromList [" | |
say $ intercalate "," $ map trAttr attrs | |
say $ "]" | |
where trAttr :: (String, Code) -> String | |
trAttr (attr, code) = "(\"" ++ attr ++ "\"," ++ preprocessed code ++ ")" | |
preprocessed cd = let | |
cd1 = subRegex (mkRegex "#([0-9]+)\\.([a-zA-Z]+)") cd "(attrs_\\1 M.! \"\\2\")" | |
cd2 = subRegex (mkRegex "##\\.([a-zA-Z]+)") cd1 "(parentAttrs M.! \"\\1\")" | |
in cd2 | |
codeFormat = check . fmap reformat . (parseModuleWithMode mode) where | |
mode = ParseMode "" [FlexibleContexts, MultiParamTypeClasses] False True (Just preludeFixities) | |
reformat = prettyPrint | |
check r = case r of | |
ParseOk a -> a | |
ParseFailed loc err -> error $ show (loc,err) | |
---- JUNK | |
showFirsts:: NontermRules -> String | |
showFirsts r = intercalate "\n" $ M.foldrWithKey (\k v b -> (k ++ " : " ++ show v) : b) [] (firsts $ r) | |
showFollows r = intercalate "\n" $ M.foldrWithKey (\k v b -> (k ++ " : " ++ show v) : b) [] (follows $ r) | |
firsts :: NontermRules -> Map Nonterminal (Set Terminal) | |
firsts r = M.mapWithKey (\k _ -> calcFirst r [Nonterm k]) r | |
follows r = M.mapWithKey (\k _ -> calcFollow r k) r | |
----- BE CAREFUL! The following lines are generated by Makefile. | |
boilerplate = "import Data.List\nimport Data.Map (Map)\nimport qualified Data.Map as M\nimport Control.Monad.State\n\n\ntype AttrsType = Map String AttrType\n\ncurToken :: State [Token] Token\ncurToken = state $ f where\n f s@[] = (Tend (), s)\n f s@(t:_) = (t, s)\n\nparse ts = let attrs = fst (runState (parse_s M.empty) (ts ++ repeat (Tend ()))) in attrs\n\n--------\n" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Data.Char | |
import qualified Data.Map as M | |
import Parse | |
tokenise :: String -> [Token] | |
tokenise [] = [] | |
tokenise ('+':cs) = P () : tokenise cs | |
tokenise ('*':cs) = X () : tokenise cs | |
tokenise ('(':cs) = L () : tokenise cs | |
tokenise (')':cs) = R () : tokenise cs | |
tokenise l@(c:cs) | |
| isSpace c = tokenise cs | |
| isDigit c = readInt l | |
where | |
readInt l = N (read i :: Int) : tokenise rest | |
(i, rest) = span isDigit l | |
main = do | |
cont <- getContents | |
--putStrLn . show $ (tokenise cont) | |
print $ (parse $ tokenise cont) M.! "val" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Data.Char | |
import qualified Data.Map as M | |
import Parse | |
tokenise :: String -> [Token] | |
tokenise [] = [] | |
tokenise ('+':cs) = OP '+' : tokenise cs | |
tokenise ('-':cs) = OP '-' : tokenise cs | |
tokenise l@(c:cs) | |
| isSpace c = tokenise cs | |
| isDigit c = readInt l | |
where | |
readInt l = NUM (read i :: Int) : tokenise rest | |
(i, rest) = span isDigit l | |
main = do | |
cont <- getContents | |
--putStrLn . show $ (tokenise cont) | |
let res = (parse $ tokenise cont) M.! "val" | |
putStrLn $ case res of | |
Val i -> show i | |
_ -> "fail" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Data.Char | |
import qualified Data.Map as M | |
import Parse | |
tokenise :: String -> [Token] | |
tokenise [] = [] | |
tokenise ('(':cs) = LP () : tokenise cs | |
tokenise (')':cs) = RP () : tokenise cs | |
tokenise ('{':cs) = LN () : tokenise cs | |
tokenise ('}':cs) = RN () : tokenise cs | |
tokenise ('[':cs) = LS () : tokenise cs | |
tokenise (']':cs) = RS () : tokenise cs | |
tokenise ('.':cs) = DOT () : tokenise cs | |
tokenise l@(c:cs) | |
| isSpace c = tokenise cs | |
| isAlpha c = readStr l | |
where readStr l = STR s : tokenise rest | |
(s, rest) = span isAlpha l | |
main = do | |
cont <- getContents | |
putStrLn $ (parse $ tokenise cont) M.! "val" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
parsgen : Main.hs Parsegram.hs Types.hs | |
ghc $^ -o $@ | |
Parsegram.hs : Parsegram.y Types.hs | |
happy -gac $< -o $@ | |
Main.hs : boilerplate.hs | |
if tail -n 1 $@ | grep -q boilerplate; then \ | |
sed -i '$$d' $@;\ | |
fi | |
echo 'main = interact (("boilerplate = " ++) . show)' > tmp.hs | |
runhaskell tmp.hs < $< >> $@ | |
rm -f tmp.hs | |
.PHONY: clean | |
clean : | |
rm -rf *.o *.hi parsgen Parsegram.hs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!<! | |
module Parse where | |
import Data.Char | |
import Data.List | |
!>! | |
s -> NUM e' { #val = !<! Val $ case #2.func of { (Func f) -> f #1.val; } !>! } | |
e' -> s a e' { #func = !<! Func $ \x -> case #1.val of { (Val i) -> case #3.func of { (Func f) -> case #2.op of { (Op '+') -> f (x+i); (Op '-') -> f(x-i);};};} !>! } | |
e' -> _ { #func = !<! Func $ id !>! } | |
a -> OP { #op = !<! Op #1.val !>! } | |
!<! | |
data Token = NUM Int | |
| OP Char | |
| Tend () | |
deriving (Show) | |
data AttrType = Val Int | Func (Int -> Int) | Op Char | |
!>! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
module Parsegram ( | |
parsegram, tokenise | |
) where | |
import Data.Char | |
import Control.Monad | |
import Types | |
} | |
%name parsegram | |
%error { parseError } | |
%tokentype { Token } | |
%token | |
TERM { TTerm $$ } | |
NONTERM { TNonterm $$ } | |
ATTR { TAttr $$ } | |
'->' { TArrow } | |
'!<!' { TCodeOpen } | |
'!>!' { TCodeClose } | |
CODE { TCode $$ } | |
'{' { TLBrace } | |
'}' { TRBrace } | |
'=' { TEq } | |
'_' { TEpsilon } | |
%% | |
GrammarFile :: { (String, [(Nonterminal, Rule)], String) } | |
: '!<!' CODE '!>!' NontermRuleSeq '!<!' CODE '!>!' { ($2, $4, $6) } | |
NontermRuleSeq :: { [(Nonterminal, Rule)] } | |
: NontermRule NontermRuleSeq { $1 : $2 } | |
| NontermRule { [$1] } | |
NontermRule :: { (Nonterminal, Rule) } | |
: NONTERM '->' ItemSeq '{' Attrs '}' { ($1, ($3, $5)) } | |
| NONTERM '->' ItemSeq '{' '}' { ($1, ($3, [])) } | |
Attrs :: { [(String, Code) ]} | |
: Attr Attrs { $1 : $2 } | |
| Attr { [$1] } | |
Attr :: { (String, Code) } | |
: ATTR '=' '!<!' CODE '!>!' { ($1, $4) } | |
ItemSeq :: { [RuleItem] } | |
: Item ItemSeq { $1 : $2 } | |
| Item { [$1] } | |
| '_' { [] } | |
Item :: { RuleItem } | |
Item : TERM { Term $1 } | |
| NONTERM { Nonterm $1 } | |
{ | |
parseError = error . show | |
data Token = TTerm Terminal | |
| TNonterm Nonterminal | |
| TAttr String | |
| TArrow | |
| TCodeOpen | |
| TCodeClose | |
| TCode String | |
| TLBrace | |
| TRBrace | |
| TEq | |
| TEpsilon | |
deriving (Show) | |
tokenise [] = [] | |
tokenise ('-':'>':cs) = TArrow : tokenise cs | |
tokenise ('{':cs) = TLBrace : tokenise cs | |
tokenise ('}':cs) = TRBrace : tokenise cs | |
tokenise ('_':cs) = TEpsilon : tokenise cs | |
tokenise ('#':cs) = readHaskellId | |
where readHaskellId = TAttr i : tokenise rest | |
(i, rest) = span (liftM2 (||) isAlphaNum (=='_')) cs | |
tokenise ('=':cs) = TEq : tokenise cs | |
tokenise ('!':'<':'!':cs) = TCodeOpen : readCode cs "" | |
where readCode ('!':'>':'!':cs) l = (TCode l) : TCodeClose : tokenise cs | |
readCode (c:cs) l = readCode cs (l ++ [c]) | |
readCode [] l = [TCode l] | |
tokenise l@(c:cs) | |
| isSpace c = tokenise cs | |
| isAlpha c = readId l | |
where readId l@(c:cs) = ((if isAsciiUpper c then TTerm else TNonterm) i) : tokenise rest | |
(i, rest) = span (liftM2 (||) isAlpha (=='\'')) l | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!<! | |
module Parse where | |
import Data.Map (Map) | |
import qualified Data.Map as M | |
!>! | |
s -> LN c DOT s RN { #val = !<! #2.val !>! | |
#sv = !<! #4.val !>! } | |
s -> LP STR RP { #val = !<! #2.val !>! } | |
c -> LS STR RS { #val = !<! #2.val ++ ##.sv !>! } | |
!<! | |
data Token = LN () | |
| RN () | |
| LP () | |
| RP () | |
| LS () | |
| RS () | |
| STR String | |
| DOT () | |
| Tend () | |
deriving (Eq, Show) | |
type AttrType = String | |
!>! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 3 + 1 - 2 2 + - |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{[a].{[b].(c)}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Types where | |
import Data.Map (Map) | |
type NontermRules = Map Nonterminal [Rule] | |
type Rule = ([RuleItem], [(String, Code)]) | |
data RuleItem = Term String | Nonterm String deriving (Show) | |
type Nonterminal = String | |
type Terminal = String | |
type Code = String | |
getItems = fst |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment