Last active
April 11, 2018 17:37
-
-
Save KeithHenry/796627eba2278b1b613f4ee6c1cd0799 to your computer and use it in GitHub Desktop.
SQL UDF to create an XML intermediate step to convert the '(A or B) and (C or D)' style complex searches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create function dbo.searchToXml(@input varchar(max)) returns xml | |
as | |
begin | |
set @input = ltrim(rtrim(@input)) | |
-- Mismatched ( and ) means we can't parse with this method | |
if len(replace(@input, '(', '')) <> len(replace(@input, ')', '')) | |
return null; | |
set @input = replace(replace(replace(replace(replace(replace(replace( | |
@input, | |
char(13), ''), -- Replace newlines and tabs | |
char(10), ''), | |
char(9), ''), | |
'&', '&'), -- Replace special chars XML doesn't like | |
'’', '''), | |
'<', '<'), | |
'>', '>') | |
declare @x xml, | |
@current varchar(max), | |
@logicIndex int, | |
@subclause varchar(8000), | |
@logic varchar(3) | |
-- Even number of " means quoted strings | |
if (len(replace(@input, '"', '')) - len(@input)) % 2 = 0 | |
begin | |
set @input = replace(@input, '"', '<word>') | |
declare @nextWord int = -1; | |
declare @ord int = 0; | |
while @nextWord < len(@input) and @nextWord <> 0 | |
begin | |
set @nextWord = charindex('<word>', @input, @nextWord+1) | |
-- If found and odd then make <word> into closing </word> | |
if(@nextWord > 0 and @ord % 2 = 1) | |
set @input = stuff(@input, @nextWord+1, 0, '/') | |
set @ord = @ord + 1 | |
end | |
end | |
-- Replace brackets with XML nodes we can parse | |
set @x = cast('<clause>' + | |
replace(replace( | |
@input, | |
'(', '<clause>'), -- Replace () with <clause> node | |
')', '</clause>') + | |
'</clause>' as xml) | |
-- Give each node an ID | |
declare @id int = 1 | |
while @x.exist('//clause[not(@id)]') = 1 | |
begin | |
set @x.modify('insert (attribute id {sql:variable("@id")}) as first into (//clause[not(@id)])[1]') | |
set @id = @id + 1 | |
end | |
-- If they contant an AND set that as the logic | |
while @x.exist('//clause[not(@logic)][text()[contains(.," AND ")]]') = 1 | |
set @x.modify('insert (attribute logic {"and"}) as first into (//clause[not(@logic)][text()[contains(.," AND ")]])[1]') | |
-- If they contant an OR set that as the logic | |
while @x.exist('//clause[not(@logic)][text()[contains(.," OR ")]]') = 1 | |
set @x.modify('insert (attribute logic {"or"}) as first into (//clause[not(@logic)][text()[contains(.," OR ")]])[1]') | |
-- Repeat for lowercase and/or because XPath is case sensitive and applying translate() is awful | |
while @x.exist('//clause[not(@logic)][text()[contains(.," and ")]]') = 1 | |
set @x.modify('insert (attribute logic {"and"}) as first into (//clause[not(@logic)][text()[contains(.," and ")]])[1]') | |
while @x.exist('//clause[not(@logic)][text()[contains(.," or ")]]') = 1 | |
set @x.modify('insert (attribute logic {"or"}) as first into (//clause[not(@logic)][text()[contains(.," or ")]])[1]') | |
-- Loop the IDs | |
while @id > 0 | |
begin | |
-- Get the current logic, then clear the text nodes | |
set @logic = @x.value('(//clause[@id=sql:variable("@id")]/@logic)[1]', 'varchar(3)') | |
set @current = | |
@x.value('(//clause[@id=sql:variable("@id")]/text())[1]', 'nvarchar(4000)') + | |
isnull(@x.value('(//clause[@id=sql:variable("@id")]/text())[2]', 'nvarchar(4000)'), '') + | |
isnull(@x.value('(//clause[@id=sql:variable("@id")]/text())[3]', 'nvarchar(4000)'), '') + | |
isnull(@x.value('(//clause[@id=sql:variable("@id")]/text())[4]', 'nvarchar(4000)'), '') + | |
isnull(@x.value('(//clause[@id=sql:variable("@id")]/text())[5]', 'nvarchar(4000)'), '') -- Should loop really | |
set @x.modify('delete //clause[@id=sql:variable("@id")]/text()') | |
-- Split the string by the logic keyword, and add each substring as a <word> | |
set @logicIndex = charindex(' ' + @logic + ' ', @current, 0) | |
while @logicIndex > 0 | |
begin | |
set @subclause = rtrim(ltrim( | |
replace(replace(replace(replace(replace( | |
left(@current, @logicIndex-1), | |
'“', ''), '”', ''), '"', ''), '(', ''), ')', '') | |
)) | |
if len(@subclause) > 0 | |
set @x.modify('insert <word>{ xs:string(sql:variable("@subclause")) }</word> as last into (//clause[@id=sql:variable("@id")])[1]') | |
set @current = stuff(@current, 1, @logicIndex + len(' ' + @logic + ' '), '') | |
set @logicIndex = charindex(' ' + @logic + ' ', @current, 0) | |
end | |
-- Add the tail, too | |
set @current = rtrim(ltrim( | |
replace(replace(replace(replace(replace( | |
@current, | |
'“', ''), '”', ''), '"', ''), '(', ''), ')', '') | |
)) | |
if len(@current) > 0 | |
set @x.modify('insert <word>{ xs:string(sql:variable("@current")) }</word> as last into (//clause[@id=sql:variable("@id")])[1]') | |
set @id = @id - 1 | |
end | |
-- remove <word><clause>...</clause></word> nested clauses as we can't handle them - probably due to backets inside delimited words | |
set @x.modify('delete //word/clause') | |
-- Find <clause><clause>...</clause></clause> nested clauses and promote the inner | |
declare @promote xml = @x.query('clause[count(clause)=1][count(word)=0]/clause') | |
while @promote.exist('*') = 1 | |
begin | |
set @x = @promote | |
set @promote = @promote.query('clause[count(clause)=1][count(word)=0]/clause') | |
end | |
return @x | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is designed to turn a boolean search string into XML that we can handle in a structured way.
For instance
(A or B) and (C or D or "E and F")
should become:This then allows
.nodes('//clause')
and.nodes('//word')
to be used a sub-queries to build this XML into hierarchical tables we can query directly.