Created: March 20, 2021 16:46
Session notes about the flatparse library
-- flatparse: hackage 0.1.1.1
-- example: smalltt (demo for a fast elaborator): switching from megaparsec to flatparse would speed it up more than 2x overall
-- 1 year ago:
--   parsec     : ridiculously slow, CPS implementation
--   attoparsec : (resumable, CPS), "high-performance" data parsing, binary parsing, not language parsing
-- fringe, modern:
--   parsnip    : (unpublished on Hackage) (by Kmett, on GitHub): really for high-performance raw data parsing (not language parsing)
--   bytesmith  : similar to parsnip (a bit slower than flatparse)
--   megaparsec : faster than parsec, roughly as fast as attoparsec, but richer in features
--   parsley    : staged selective parsing (TH interface + different backends)
--
-- General observations:
-- - I only want to cover 95% of programming language parsing use cases
-- - I focus on UTF-8
-- - No resumption
-- - Only strict ByteString input (we can still turn Text/whatever into ByteString)
-- - Implementation not generic over streams/monads etc.
--   (even if you don't use such features, they still slow you down)
--
--
-- - Error messages / indentation / source positions/spans
--   - Error messages: megaparsec has built-in automatic hint propagation: can't be disabled, has large overhead, and is by itself a "toy" implementation
--   - I want: - minimal overhead of fancy errors, if you don't want fancy errors
--             - possibility of fancy errors if you want them (you have to implement them)
--             - minimal overhead of source position tracking
--             - possibility of pervasive source position tracking/spanning in an efficient way
--             - indentation parsing
--               --||--
-- FlatParse.Basic    : no indentation parsing
-- FlatParse.Stateful : indentation parsing possible, slightly slower
-- type Parser e a = FlatParse.Basic.Parser () e a
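--
-- A minimal end-to-end sketch using this alias (hedged: combinator names
-- follow the flatparse 0.1.x docs, isDigit is Data.Char's, and the exact
-- runParser signature should be checked on hackage):
--
--   import qualified FlatParse.Basic as FP
--
--   pDigit :: Parser String Char
--   pDigit = FP.satisfy isDigit
--
--   -- runParser yields OK / Fail / Err results, mirroring the three
--   -- unboxed result cases discussed next.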
{-
-- distinction of Fail and Err:
pattern OK#   :: a -> Addr# -> Res# e a
pattern Err#  :: e -> Res# e a
pattern Fail# :: Res# e a
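-- At the top level the two failure modes surface as separate Result
-- constructors (a sketch; constructor names as in the flatparse 0.1.x
-- docs, worth double-checking on hackage):
--   case runParser p input of
--     OK a rest -> ...   -- success, with the unconsumed input
--     Fail      -> ...   -- recoverable control-flow failure
--     Err e     -> ...   -- unrecoverable grammar error carrying e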
-- This is how nom in Rust works
-- *parsec distinction: consumes input / does not consume input
--      can't backtrack from consumption
--      can backtrack from a non-consuming parser
--      token parsers: don't consume on failure
-- (keyword "foobar") <|> (keyword "foofoobar")                    -- input: foofoobar (OK)
-- (keyword "foo" >> keyword "bar") <|> (keyword "foofoobar")      -- (FAIL)
-- try (keyword "foo" >> keyword "bar") <|> (keyword "foofoobar")  -- (OK) (try permits more backtracking)
-- discourages unlimited lookahead: only cheap parsers should be non-consuming, everything else should be consuming
-- experience: the consuming/non-consuming distinction gets overridden by a bunch of try-s
--   (no close correspondence between consumption and possibility of backtracking)
-- Alternative (nom/flatparse):
--    failure: control-flow purpose, can be backtracked from (by default)
--    error  : grammar error, cannot be recovered from (by default)
-- (keyword1 >> p1) <|> (keyword2 >> p2)       (keywords can fail but not error)
-- keyword "let" >> ... >> keyword' "in" ...   (keyword' "in" can only throw an error, not fail)
-- Lots of parsers have to be available in an "error" version and also a "failure" version
-- converting failure to error: cut :: Parser e a -> e -> Parser e a
--    pa `cut` "error message"
--    keyword "in" `cut` "expected \"in\""
-- cutting --> cutWith
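-- A sketch of cut in context (hedged: pBody, ws and Tm are hypothetical
-- helpers, not part of flatparse):
--   pLet :: Parser String Tm
--   pLet = $(string "let") *> ws *> pBody <* ($(string "in") `cut` "expected \"in\"")
-- Here $(string "let") may fail, so <|> alternatives remain possible,
-- while a missing "in" after a "let" is definitely a grammar error.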
-- $(string "foobar") -- 6 bytes: can be read by first checking the input has 6 bytes, then doing a 32-bit read, then a 16-bit read
-- $(string "λ")      -- 2 bytes: UTF-8 encoding to bytes is performed by TH
$(switch [| case _ of
  "foo"    -> ...
  "bar"    -> ...
  "foobar" -> ...
  _        -> ... |])
- a trie of reads / case expressions, vectorized, with aggregated input-size checking
- longest-match semantics
- a mini lexer-generator embedded in the combinator library
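-- Longest-match sketch: on input "foobar", the "foobar" branch fires
-- rather than "foo" (branch bodies are hypothetical placeholders):
--   $(switch [| case _ of
--     "foo"    -> pure Foo
--     "foobar" -> pure Foobar
--     _        -> empty |])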
-- $(switch [| case _ of
--   "foo"  -> ...
--   [a-z]+ -> ... |])   (unicode + lexer generation)
-- Char in Haskell: 32-bit (code point)
-- $(string "foo")
-- Locations:
-- fusedSatisfy isASCIILetter (\c -> isGreekLetter c || isLetter c) g h
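-- fusedSatisfy appears to take one predicate per UTF-8 encoding length
-- (1-, 2-, 3- and 4-byte code points), so the hot ASCII path avoids the
-- general test. Identifier-character sketch (hedged: falling back to
-- Data.Char's isLetter for the 3- and 4-byte cases is an assumption):
--   pIdentChar :: Parser e Char
--   pIdentChar = fusedSatisfy isASCIILetter (\c -> isGreekLetter c || isLetter c) isLetter isLetter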
-- Large benchmark: smalltt parser (megaparsec) (35,000 lines/sec)
--                  setoidtt parser (flatparse, superset of smalltt syntax) (1.5 million lines/sec)
-- How to parse indentation:
--   leading whitespace is relevant: write a ws parser which counts code points after newlines
--   columns are relevant:
--     foo bar baz
--     foo' bar' baz'
--     case foo of
--       C1 -> ...
--       C2 -> ...
--   Reader of the current indentation level
--   State of the current number of leading whitespace characters
foo = f
  x y
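-- Sketch of combining the two pieces with FlatParse.Stateful (hedged:
-- ask/get as in the 0.1.x Stateful API; the Int state is assumed to hold
-- the column after the last ws run, the reader env the required level):
--   exactIndent :: FlatParse.Stateful.Parser Int e ()
--   exactIndent = do
--     lvl <- ask    -- required indentation (Reader)
--     col <- get    -- current leading-whitespace count (State)
--     if col == lvl then pure () else empty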
-}