Last active
August 29, 2015 13:57
-
-
Save xbeta/9511420 to your computer and use it in GitHub Desktop.
Various simple regex parsers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(regexp). | |
-export([run/2]). | |
-include_lib("eunit/include/eunit.hrl"). | |
-type regexp() :: emptych | |
| {char, integer()} | |
| {seq, [regexp()]} | |
| {star, regexp()}. | |
run(S, Regexp) -> | |
case match(S, Regexp, 0) of | |
{match, _, Pos} -> {match, [{0, Pos}]}; | |
nomatch -> nomatch | |
end. | |
match(S, empty, Pos) -> | |
{match, S, Pos}; | |
match([C|Cs], {char, C}, Pos) -> | |
{match, Cs, Pos + 1}; | |
match(_, {char, _}, _) -> | |
nomatch; | |
match(S, {seq, []}, Pos) -> | |
{match, S, Pos}; | |
match(S, {seq, [Regexp|Regexps]}, Pos) -> | |
case match(S, Regexp, Pos) of | |
{match, S2, Pos2} -> match(S2, {seq, Regexps}, Pos2); | |
nomatch -> nomatch | |
end; | |
match(_, {seq, _}, _) -> | |
nomatch; | |
match(S, {star, Regexp}, Pos) -> | |
case match(S, Regexp, Pos) of | |
{match, S2, Pos2} -> match_star(S2, Regexp, Pos2); | |
nomatch -> nomatch | |
end. | |
match_star(S, {seq, Regexp}, Pos) -> | |
case match(S, {seq, Regexp}, Pos) of | |
{match, S, Pos} -> {match, S, Pos}; | |
nomatch -> {match, S, Pos}; | |
{match, S2, Pos2} -> match_star(S2, {seq, Regexp}, Pos2) | |
end; | |
match_star(S, {char, C}, Pos) -> | |
case match(S, {char, C}, Pos) of | |
{match, S, Pos} -> {match, S, Pos}; | |
nomatch -> {match, S, Pos}; | |
{match, S2, Pos2} -> match_star(S2, {char, C}, Pos2) | |
end. | |
char_test() -> | |
{match, [{0, 1}]} = run("a", {char, $a}), | |
nomatch = run("b", {char, $a}). | |
seq_test() -> | |
{match, [{0, 2}]} = run("ab", {seq, [{char, $a}, {char, $b}]}). | |
star_test() -> | |
{match, [{0, 2}]} = run("ab", {star, {seq, [{char, $a}, {char, $b}]}}). |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{- | |
http://jcreigh.blogspot.com/2006/12/simple-regex-engine-in-haskell.html | |
-} | |
import Text.ParserCombinators.Parsec | |
data Match = MkMatch String String deriving (Show, Eq) | |
type Matcher = Match -> [Match] | |
matchOne :: (Char -> Bool) -> Matcher | |
matchOne _ (MkMatch _ "") = [] | |
matchOne f (MkMatch xs (y:ys)) | |
| f y = [MkMatch (xs ++ [y]) ys] | |
| otherwise = [] | |
rechar :: Char -> Matcher | |
rechar c = matchOne (==c) | |
dot :: Matcher | |
dot = matchOne (const True) | |
badstar :: Matcher -> Matcher | |
badstar f x = (f x >>= badstar f) ++ [x] | |
star :: Matcher -> Matcher | |
star f x = (filter (/=x) (f x) >>= star f) ++ [x] | |
alt :: [Matcher] -> Matcher | |
alt fs x = concatMap ($ x) fs | |
ordered :: [Matcher] -> Matcher | |
ordered fs x = foldl (>>=) [x] fs | |
plus :: Matcher -> Matcher | |
plus f = ordered [ f, star f ] | |
reoptional :: Matcher -> Matcher | |
reoptional f = alt [ f, return ] | |
literalchar :: Parser Char | |
literalchar = letter <|> digit <|> oneOf " ,-_;:'\"" | |
parseregex :: Parser Matcher | |
parseregex = do | |
xs <- sepBy1 (many1 atomWithMod) (char '|') | |
return (alt (map ordered xs)) | |
parsechar :: Parser Matcher | |
parsechar = literalchar >>= (return . rechar) | |
atomWithMod :: Parser Matcher | |
atomWithMod = do | |
a <- atom | |
choice (map (\(c, f) -> char c >> return (f a)) mods) <|> return a | |
where | |
mods = [('+', plus), ('*', star), ('?', reoptional)] | |
atom :: Parser Matcher | |
atom = subexpression <|> charclass <|> wildcard <|> parsechar | |
subexpression :: Parser Matcher | |
subexpression = do | |
char '(' | |
expr <- parseregex | |
char ')' | |
return expr | |
charclass :: Parser Matcher | |
charclass = do | |
char '[' | |
xs <- many1 ((try range) <|> parsechar) | |
char ']' | |
return (alt xs) | |
range :: Parser Matcher | |
range = do | |
from <- literalchar | |
char '-' | |
to <- literalchar | |
return (matchOne (`elem` [from..to])) | |
wildcard :: Parser Matcher | |
wildcard = char '.' >> return dot | |
matches :: String -> String -> [Match] | |
matches re str = [MkMatch "" str] >>= (runparser parseregex re) | |
runparser :: GenParser tok () a -> [tok] -> a | |
runparser p input = case (parse p "" input) of | |
Right x -> x | |
Left err -> error (show err) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public class RegexBuilder { | |
public boolean isMatch(String s, String p) { | |
if(p.length() == 0) | |
return s.length() == 0; | |
//p's length 1 is special case | |
if(p.length() == 1 || p.charAt(1) != '*'){ | |
if(s.length() < 1 || (p.charAt(0) != '.' && s.charAt(0) != p.charAt(0))) | |
return false; | |
return isMatch(s.substring(1), p.substring(1)); | |
}else{ | |
int len = s.length(); | |
int i = -1; | |
while(i<len && (i < 0 || p.charAt(0) == '.' || p.charAt(0) == s.charAt(i))){ | |
if(isMatch(s.substring(i+1), p.substring(2))) | |
return true; | |
i++; | |
} | |
return false; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment