Created
April 7, 2011 14:43
-
-
Save takaokouji/907895 to your computer and use it in GitHub Desktop.
sanitize_regexp_string for "<pattern>{..}*" in Ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'test/unit' | |
# Rewrites every "<atom>{n}*", "<atom>{n,}*" and "<atom>{n,m}*" found in a
# regexp source string as "(?:<atom>{...})*", so that the "*" follows an
# explicit non-capturing group instead of the interval quantifier itself.
#
# An atom is a single character, a backslash escape ("\w", "\\"), a bracket
# expression ("[0-9]", "[[:word:]]") or a parenthesised group ("(abc)").
# An occurrence is left untouched when its "{" is escaped, when the interval
# is not one of the three forms above ("{}", "{,}", "{,m}", "{a}", ...), or
# when no atom can be located in front of it.
#
# @param chars [String] regexp source text; it is not modified
# @return [String] a new string with the rewrites applied
def sanitize_regexp_string(chars)
  rbrace_asterisk = "}*"
  res = chars.dup
  res_diff = 0 # how many characters longer res is than chars so far
  p_start = 0  # atoms are never searched for to the left of this index

  while (rac_pos = chars.index(rbrace_asterisk, p_start))
    # An interval body cannot contain "{", so the nearest "{" to the left of
    # the "}" is the only candidate for the start of the quantifier.
    l_pos = chars.rindex("{", rac_pos)
    if l_pos && l_pos >= p_start && chars[(l_pos + 1)...rac_pos] =~ /\A[0-9]+(?:,[0-9]*)?\z/
      in_pos = regexp_quantified_atom_pos(chars, l_pos, p_start)
      if in_pos
        res[(res_diff + in_pos)..(res_diff + rac_pos)] = "(?:" + chars[in_pos..rac_pos] + ")"
        res_diff += 4 # "(?:" plus ")"
      end
    end
    p_start = rac_pos + rbrace_asterisk.length
  end
  res
end

# Returns the index at which the atom quantified by the "{" at l_pos starts,
# or nil when the "{" is escaped or no atom is found at or after floor.
def regexp_quantified_atom_pos(chars, l_pos, floor)
  return nil if regexp_escaped_at?(chars, l_pos, floor) # "\{" is a literal brace

  last = l_pos - 1
  return nil if last < floor
  # "\w", "\]", "\\": the atom is the backslash together with the character.
  return last - 1 if regexp_escaped_at?(chars, last, floor)

  case chars[last]
  when "]" then regexp_bracket_open_pos(chars, last, floor)
  when ")" then regexp_group_open_pos(chars, last, floor)
  else last
  end
end

# True when the character at pos is preceded by an odd-length run of
# backslashes (looking no further left than floor), i.e. it is escaped.
def regexp_escaped_at?(chars, pos, floor)
  count = 0
  i = pos - 1
  while i >= floor && chars[i] == "\\"
    count += 1
    i -= 1
  end
  count.odd?
end

# Scans left from the unescaped "]" at close_pos and returns the index of the
# "[" that balances it, or nil if none exists at or after floor. Nested
# brackets such as "[[:word:]]" are counted; escaped brackets are ignored.
def regexp_bracket_open_pos(chars, close_pos, floor)
  depth = 0
  i = close_pos
  while i >= floor
    c = chars[i]
    if (c == "]" || c == "[") && !regexp_escaped_at?(chars, i, floor)
      depth += (c == "]" ? 1 : -1)
      return i if depth <= 0
    end
    i -= 1
  end
  nil
end

# Scans left from the unescaped ")" at close_pos and returns the index of the
# "(" that balances it, or nil if none exists at or after floor. Escaped
# parentheses are ignored, and bracket expressions are skipped as a whole
# because parentheses inside them are literals.
def regexp_group_open_pos(chars, close_pos, floor)
  depth = 0
  i = close_pos
  while i >= floor
    c = chars[i]
    if (c == ")" || c == "(" || c == "]") && !regexp_escaped_at?(chars, i, floor)
      case c
      when ")"
        depth += 1
      when "("
        depth -= 1
        return i if depth <= 0
      else
        class_open = regexp_bracket_open_pos(chars, i, floor)
        i = class_open if class_open # a "]" with no opener is a plain literal
      end
    end
    i -= 1
  end
  nil
end
# Table-driven tests for sanitize_regexp_string: one test method is generated
# per row of the table below.
class TestMethod < Test::Unit::TestCase
  # Each row is [label, expected output, input pattern, compile_check].
  # When compile_check is true, the sanitized output must additionally be
  # accepted by Regexp.new.
  #
  # NOTE(review): the expected strings of "char class 5" to "char class 7"
  # contain an extra ")" while also requesting a compile check; these rows
  # look inconsistent and should be confirmed against the intended behaviour.
  cases =
    [
      ["minimum", "(?:.{8})*", ".{8}*", true],
      ["escaped right brace 1", ".{8\\}*", ".{8\\}*", false],
      ["escaped right brace 2", ".{8\\\\}*", ".{8\\\\}*", false],
      ["escaped right brace 3", ".{8\\\\\\}*", ".{8\\\\\\}*", false],
      ["escaped right brace 4", ".{8\\\\\\\\}*", ".{8\\\\\\\\}*", false],
      ["escaped left brace 1", ".\\{8}*", ".\\{8}*", false],
      ["escaped left brace 2", ".(?:\\\\{8})*", ".\\\\{8}*", true],
      ["escaped left brace 3", ".\\\\\\{8}*", ".\\\\\\{8}*", false],
      ["escaped left brace 4", ".\\\\(?:\\\\{8})*", ".\\\\\\\\{8}*", true],
      ["multiple 1", "(?:.{8})*(?:.{8})*", ".{8}*.{8}*", true],
      ["multiple 2", "(?:.{8})*abc abc abc(?:.{8})*", ".{8}*abc abc abc.{8}*", true],
      ["multiple 3",
       ".{8\\}*(?:.{8})*.\\{8}*(?:.{8})*.\\{8\\}*",
       ".{8\\}*.{8}*.\\{8}*.{8}*.\\{8\\}*", false],
      ["brace 1", "(?:.{0,8})*", ".{0,8}*", true],
      ["brace 2", "(?:.{0,})*", ".{0,}*", true],
      ["brace 3", ".{,}*", ".{,}*", false],
      ["brace 4", ".{}*", ".{}*", false],
      ["brace 5", ".{a}*", ".{a}*", false],
      ["brace 6", ".{a,1}*", ".{a,1}*", false],
      ["brace 7", ".{1,2,3}*", ".{1,2,3}*", false],
      ["backslash 1", "(?:\\w{8})*", "\\w{8}*", true],
      ["backslash 2", "\\\\(?:w{8})*", "\\\\w{8}*", true],
      ["backslash 3", "\\\\(?:\\w{8})*", "\\\\\\w{8}*", true],
      ["backslash 4", "\\\\\\\\(?:w{8})*", "\\\\\\\\w{8}*", true],
      ["char class 1", "(?:[0-9]{8})*", "[0-9]{8}*", true],
      ["char class 2", "(?:[[:word:]]{8})*", "[[:word:]]{8}*", true],
      ["char class 3", "(?:[[:word:][:digit:]]{8})*", "[[:word:][:digit:]]{8}*", false],
      ["char class 4", "[abc](?:[[:word:][:digit:]]{8})*", "[abc][[:word:][:digit:]]{8}*", false],
      ["char class 5", "(?:[[]){8})*", "[[]{8}*", true],
      ["char class 6", "(?:[\\[]){8})*", "[\\[]{8}*", true],
      ["char class 7", "(?:[\\w]){8})*", "[\\w]{8}*", true],
      ["char class 8", "[\\\\[]{8}*", "[\\\\[]{8}*", false],
      ["char class 9", "(?:\\]{8})*", "\\]{8}*", false],
      ["char class 10", "\\\\]{8}*", "\\\\]{8}*", false],
      ["group 1", "(?:(abc){8})*", "(abc){8}*", true],
      ["group 2", "(?:((ab)(c)){8})*", "((ab)(c)){8}*", true],
      ["group 3", "(ABC)(?:((ab)(c)){8})*", "(ABC)((ab)(c)){8}*", true],
      ["char class and group 1", "(?:[)]{8})*", "[)]{8}*", true],
      ["char class and group 2", "((((?:[)))]{8})*)))", "((([)))]{8}*)))", true],
    ]

  # define_method closes over the row values directly, so nothing has to be
  # serialised into source text and evaluated.
  cases.each do |label, expected, input, compile_check|
    define_method("test_sanitize_regexp_string__#{label.gsub(/\s+/, "_")}") do
      assert_equal(expected, sanitize_regexp_string(input), label)
      if compile_check
        assert_nothing_raised do
          Regexp.new(sanitize_regexp_string(input))
        end
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment