Created
May 27, 2013 17:44
-
-
Save barrucadu/5658260 to your computer and use it in GitHub Desktop.
A gentle introduction to parsec (in particular, applicative parsec), by constructing a parser for CSV files
http://blog.barrucadu.co.uk/2013/05/27/a-gentle-introduction-to-parsec/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Example code for a CSV file parser using Parsec. Some parts are | |
-- repeated, with differing functionality. To avoid name conflicts, | |
-- comment out the versions you don't want to play with. | |
module CSVParser (parseCSV, parseCSV') where | |
import Control.Applicative ((<$), (<*), (*>), liftA) | |
import Data.Char (chr) | |
import Data.Either (either) | |
import Text.Parsec | |
---------- Simple cell parser | |
cell :: Parsec String () String | |
cell = many $ noneOf ",\n" | |
---------- | |
---------- More advanced cell parser | |
cell :: Parsec String () String | |
cell = cell' <|> many (noneOf ",\n") | |
where cell' = between (char '"') (char '"') $ many chr | |
chr = noneOf "\"" <|> try ('"' <$ string "\"\"") | |
---------- | |
---------- Final cell parser | |
cell :: Parsec String () String | |
cell = cell' <|> many (chr ",\n\\") | |
where cell' = between (char '"') (char '"') $ many (chr "\\\"") | |
chr bad = try literal <|> specialchar <|> noneOf bad | |
specialCharacters :: [(Char, Char)] | |
specialCharacters = [('0', '\0'), ('a', '\a'), ('b', '\b'), ('f', '\f'), | |
('n', '\n'), ('r', '\r'), ('t', '\t'), ('v', '\v'), | |
('"', '"'), ('\'', '\''), ('\\', '\\')] | |
specialchar :: Parsec String () Char | |
specialchar = char '\\' *> special' specialCharacters | |
where special' ((esc, c):cs) = char esc *> parserReturn c <|> special' cs | |
special' [] = parserZero | |
literalNumbers :: [(Char, Parsec String () Char)] | |
literalNumbers = [('x', hexDigit), ('o', octDigit)] | |
literal :: Parsec String () Char | |
literal = char '\\' *> literal' literalNumbers | |
where literal' ((c, f):cs) = char c *> tochar c (many1 f) <|> literal' cs | |
literal' [] = parserZero | |
tochar c = liftA $ \s -> chr . read $ '0' : c : s | |
---------- | |
---------- Line parser | |
line :: Parsec String () [String] | |
line = cell `sepBy1` char ',' | |
---------- | |
---------- File parser | |
csvp :: Parsec String () [[String]] | |
csvp = line `endBy` newline <* eof | |
---------- | |
---------- Parser interfaces | |
parseCSV :: String -> Either ParseError [[String]] | |
parseCSV = parse csvp "" | |
parseCSV' :: String -> Maybe [[String]] | |
parseCSV' = either (const Nothing) Just . parseCSV | |
---------- | |
---------- Exercise 1: allow for multi-character special character escapes. | |
specialCharacters :: [(String, Char)] | |
specialCharacters = [("0", '\0'), ("a", '\a'), ("b", '\b'), ("f", '\f'), | |
("n", '\n'), ("r", '\r'), ("t", '\t'), ("v", '\v'), | |
("\"", '"'), ("'", '\''), ("\\", '\\'), | |
("SOH", '\SOH'), ("STX", '\STX'), ("ETX", '\ETX'), | |
("EOT", '\EOT'), ("ENQ", '\ENQ'), ("ACK", '\ACK'), | |
("BEL", '\BEL'), ("BS", '\BS'), ("HT", '\HT'), | |
("LF", '\LF'), ("VT", '\VT'), ("FF", '\FF'), | |
("CR", '\CR'), ("SO", '\SO'), ("SI", '\SI'), | |
("DLE", '\DLE'), ("DC1", '\DC1'), ("DC2", '\DC2'), | |
("DC3", '\DC3'), ("DC4", '\DC4'), ("NAK", '\NAK'), | |
("SYN", '\SYN'), ("ETB", '\ETB'), ("CAN", '\CAN'), | |
("EM", '\EM'), ("SUB", '\SUB'), ("ESC", '\ESC'), | |
("FS", '\FS'), ("GS", '\GS'), ("RS", '\RS'), | |
("US", '\US'), ("SP", '\SP'), ("DEL", '\DEL')] | |
specialchar :: Parsec String () Char | |
specialchar = char '\\' *> foldr ((<|>) . special') parserZero specialCharacters | |
where special' (esc, c) = try (string esc) *> parserReturn c | |
---------- | |
---------- Exercise 2: allow for decimal character literals | |
literalNumbers :: [(Maybe Char, Parsec String () Char)] | |
literalNumbers = [(Just 'x', hexDigit), (Just 'o', octDigit), (Nothing, digit)] | |
literal :: Parsec String () Char | |
literal = char '\\' *> try (foldr ((<|>) . literal') parserZero literalNumbers) | |
where literal' (c'@(Just c), f) = char c *> tochar c' (many1 f) | |
literal' (Nothing, f) = tochar Nothing (many1 f) | |
tochar (Just c) = liftA $ \s -> chr . read $ '0' : c : s | |
tochar Nothing = liftA $ \s -> chr $ read s | |
---------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment