-
-
Save leehambley/1242132 to your computer and use it in GitHub Desktop.
Solr query sanitizer using a state machine (like a baws!)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The only dependencies are the transitions gem (http://rubygems.org/gems/transitions)
# and, I think, ActiveSupport.
module Search
  # Mixin that prepares field values before they are placed into a Solr query.
  module QueryBuilderHelper
    # Sanitizes one field value, or every value of a Hash of field values.
    #
    # A non-Hash argument is handed straight to QueryField.sanitize. For a Hash,
    # a new Hash with the same keys is returned; each value is passed through
    # QueryField.sanitize, except the value stored under the String key '*',
    # which is copied over untouched.
    #
    # @param query [Hash, Object] a single field value or a Hash of field => value
    # @return [Hash, Object] a new Hash of sanitized values when +query+ is a
    #   Hash, otherwise whatever QueryField.sanitize returns for +query+
    def sanitize_field_for_solr_query(query)
      return QueryField.sanitize(query) unless query.is_a?(Hash)

      query.each_with_object({}) do |(field, value), sanitized|
        # Only the exact String '*' is exempt; any other key (including
        # symbols) has its value sanitized.
        sanitized[field] = field == '*' ? value : QueryField.sanitize(value)
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'transitions'

module Search
  # Escapes a user-supplied string so it can be embedded in a Solr query.
  #
  # The input is consumed one character at a time. Each character is classified
  # (special character, whitespace, star, backslash, quote or normal character)
  # and fired as an event on a state machine that tracks whether the scanner is
  # inside or outside a double-quoted phrase and whether the previous character
  # was an escaping backslash. Every transition appends the current character
  # to the output, either verbatim or prefixed with a backslash.
  #
  # Usage:
  #   Search::QueryField.sanitize('(foo')  # => '\(foo'
  class QueryField
    include Transitions

    # @param query [Object] the value to sanitize; only Strings are processed
    def initialize(query)
      @index = 0
      @output_string = ''.dup # must be mutable; it is built up with <<
      @current_char = nil
      @query = query
    end

    state_machine do
      # state :idle
      state :beginning_of_word
      state :unescaped_outside_of_quotation
      state :escaped_outside_of_quotation
      state :unescaped_inside_of_quotation
      state :escaped_inside_of_quotation
      state :end_of_string

      # Any character not matched by one of the other events.
      event :normal_character do
        transitions :from => :beginning_of_word, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        # The preceding backslash did not escape anything special, so it is
        # turned into a literal backslash by emitting a second one.
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :escaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # One of + - & | ! ( ) { } [ ] ^ ~ ? : (see #process_char).
      event :special_character do
        transitions :from => :beginning_of_word, :to => :unescaped_outside_of_quotation, :on_transition => :escaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :escaped_print
        # Already escaped by the input's own backslash.
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # "*" is escaped only when it starts a word; elsewhere it is kept as is.
      event :star do
        transitions :from => :beginning_of_word, :to => :unescaped_outside_of_quotation, :on_transition => :escaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # Outside quotes, whitespace returns the machine to :beginning_of_word.
      event :whitespace do
        transitions :from => :beginning_of_word, :to => :beginning_of_word, :on_transition => :unescaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :beginning_of_word, :on_transition => :unescaped_print
        # A backslash directly before whitespace is doubled.
        transitions :from => :escaped_outside_of_quotation, :to => :beginning_of_word, :on_transition => :escaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # A backslash is always printed verbatim; what follows it decides whether
      # a second backslash has to be added.
      event :backslash do
        transitions :from => :beginning_of_word, :to => :escaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :escaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :escaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # An unescaped double quote toggles between inside and outside of a
      # quotation; an escaped one is an ordinary character.
      event :quote do
        transitions :from => :beginning_of_word, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # Fired once after the last character. @current_char is nil at this
      # point, so the print callbacks only emit their fixed prefix.
      event :eos_character do
        transitions :from => :beginning_of_word, :to => :end_of_string
        transitions :from => :unescaped_outside_of_quotation, :to => :end_of_string
        # The input ended with a lone backslash: double it, otherwise the
        # output would end in a dangling escape character.
        transitions :from => :escaped_outside_of_quotation, :to => :end_of_string, :on_transition => :escaped_print
        # Close a quotation that the input left open.
        transitions :from => :unescaped_inside_of_quotation, :to => :end_of_string, :on_transition => :quote_print
        # Same, but first double the trailing backslash so that it does not
        # escape the closing quote.
        transitions :from => :escaped_inside_of_quotation, :to => :end_of_string, :on_transition => :escape_and_quote_print
      end
    end

    # Appends a backslash followed by the current character.
    def escaped_print
      @output_string << "\\#{@current_char}"
    end

    # Appends the current character verbatim.
    def unescaped_print
      @output_string << @current_char
    end

    # Appends a backslash and a double quote, then the current character.
    def escape_and_quote_print
      @output_string << "\\\"#{@current_char}"
    end

    # Appends a double quote followed by the current character.
    def quote_print
      @output_string << "\"#{@current_char}"
    end

    # Classifies @current_char and fires the matching state machine event.
    def process_char
      case @current_char
      when /\+|-|&|\||\!|\(|\)|\{|\}|\[|\]|\^|~|\?|:/
        special_character
      when /\s/
        whitespace
      when "*"
        star
      when "\\"
        backslash
      when '"'
        quote
      else
        normal_character
      end
    end

    # Runs the state machine over the query.
    #
    # @return [String, Object] the escaped string; when the query is not a
    #   String it is returned unchanged
    def sanitize
      return @query unless @query.is_a?(String)

      @query.each_char do |char|
        @current_char = char
        process_char
        @index += 1
      end

      # The end-of-string callbacks interpolate @current_char and rely on it
      # being nil so that nothing but their fixed prefix is appended.
      @current_char = nil
      eos_character
      @output_string
    end

    # Convenience wrapper around +new(query).sanitize+.
    #
    # @param query [Object] the value to sanitize
    # @return [String, Object] see #sanitize
    def self.sanitize(query)
      new(query).sanitize
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Exercises Search::QueryBuilderHelper#sanitize_field_for_solr_query with
# strings, hashes and non-string values.
class TheSolrQuerySanitizer < ActiveSupport::TestCase
  include Search::QueryBuilderHelper

  test "should eat hashes for breakfast" do
    params = {:what => "*", :where => '"""'}
    assert_equal({:what => '\\*', :where => '""""'}, sanitize_field_for_solr_query(params))
  end

  test "should escape a lonely asterisk" do
    assert_equal 'leiter \*', sanitize_field_for_solr_query('leiter *')
    assert_equal '\*', sanitize_field_for_solr_query('*')
  end

  test "should not escape a correctly used asterisk" do
    assert_equal 'manager*', sanitize_field_for_solr_query('manager*')
  end

  test "should escape parantheses" do
    # Already-escaped and unescaped parentheses must give the same result.
    assert_equal '\(peter \)\) a\)\)', sanitize_field_for_solr_query('(peter \)) a))')
    assert_equal '\(peter \)\) a\)\)', sanitize_field_for_solr_query('(peter )) a))')
    assert_equal '\(\)', sanitize_field_for_solr_query('()')
  end

  test "should escape all kind of crazy solr special character wizzardry" do
    '+-&|!(){}[]^~?:'.each_char do |char|
      assert_equal "\\#{char}", sanitize_field_for_solr_query(char)
    end
  end

  test "should close trailing quote if uneven number of quotes" do
    assert_equal '""""', sanitize_field_for_solr_query('"""')
    assert_equal '""', sanitize_field_for_solr_query('""')
    assert_equal '""a""', sanitize_field_for_solr_query('""a"')
  end

  test "should escape backslashes that don't escape a special character" do
    assert_equal '\\\\ \(', sanitize_field_for_solr_query('\ \(')
    assert_equal '\\\\ \(', sanitize_field_for_solr_query('\\\\ \(')
    assert_equal '\\\\\\\\ \(', sanitize_field_for_solr_query('\\\\\ \(')
    assert_equal '\\\\\\\\ \(', sanitize_field_for_solr_query('\\\\\\\\ \(')
    assert_equal '\( \(', sanitize_field_for_solr_query('\( (')
    assert_equal '\\\\\( \(', sanitize_field_for_solr_query('\\\( \(')
  end

  test "should not assplode on empty strings" do
    assert_equal "", sanitize_field_for_solr_query("")
  end

  test "should like escaped 'quotes in quotes'(tm)" do
    assert_equal '"hallo \" ha! \" "', sanitize_field_for_solr_query('"hallo \" ha! \" "')
  end

  test "should handle stars inside and and outside of quotes" do
    assert_equal ' \* " * "', sanitize_field_for_solr_query(' * " * "')
  end

  test "'should love backslashes as last character inside unmatched quotes\\" do
    assert_equal '"ha \\\\"', sanitize_field_for_solr_query('"ha \\')
  end

  test "should escape unescaped backslashes before whitespace" do
    assert_equal ' ha\\\\ ', sanitize_field_for_solr_query(' ha\\ ')
    assert_equal " ha\\\\\t", sanitize_field_for_solr_query(" ha\\\t")
  end

  test "should make sweet sweet love to stuff other than strings (which includes Hendrik)" do
    assert_equal :hendrik, sanitize_field_for_solr_query(:hendrik)
    # assert_nil rather than assert_equal nil, which Minitest 5 deprecates.
    assert_nil sanitize_field_for_solr_query(nil)
    assert_equal 1, sanitize_field_for_solr_query(1)
    assert_equal [:foo, 1, "baz"], sanitize_field_for_solr_query([:foo, 1, "baz"])
  end

  test "should crush wicked qa test posting titles" do
    assert_equal(
      'Job Ad LOGO \(14.01.2009\) \- Job Position \( ‘ ` \| / \\\\ , ; \: \& < > \^ \* \? \) \(äöüß\) \(Ромашка\)',
      sanitize_field_for_solr_query("Job Ad LOGO (14.01.2009) - Job Position ( ‘ ` | / \\ , ; : & < > ^ * ? ) (äöüß) (Ромашка)"))
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment