Last active
October 3, 2022 09:47
-
-
Save wolever/1a5ccf6396f00229b2dc to your computer and use it in GitHub Desktop.
Parse and escape a query string so it's safe to use with Postgres' `to_tsquery(…)`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from nose.tools import assert_equal | |
from nose_parameterized import parameterized | |
from tsquery_escape import tsquery_escape | |
@parameterized([ | |
("1 OR 2", "1 | 2"), | |
("(1) 2", "( 1 ) & 2"), | |
("&", "'&':*"), | |
('"hello world"', "'hello world'"), | |
("not 1", "! 1"), | |
("1 and not (2 or 3)", "1 & ! ( 2 | 3 )"), | |
("not and and 1) or ( 2 not", "! 1 | ( 2 )"), | |
("() 1", "1"), | |
("1 2 3", "1 & 2 & 3"), | |
("'&' |", "'&':* & '|':*"), | |
(") and 1 (2 or", "1 & ( 2 )"), | |
("it's '", "'its':*"), | |
("(1)", "( 1 )"), | |
("1((", "1"), | |
]) | |
def test_tsquery_escape(input, expected): | |
expected = re.sub("([0-9])", r"'\1':*", expected) | |
actual = tsquery_escape(input) | |
assert_equal(actual, expected) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def tsquery_escape(term): | |
""" Escape a query string so it's safe to use with Postgres' | |
``to_tsquery(...)``. Single quotes are ignored, double quoted strings | |
are used as literals, and the logical operators 'and', 'or', 'not', | |
'(', and ')' can be used: | |
>>> tsquery_escape("Hello") | |
"'hello':*" | |
>>> tsquery_escape('"Quoted string"') | |
"'quoted string'" | |
>>> tsquery_escape("multiple terms OR another") | |
"'multiple':* & 'terms':* | 'another':*" | |
>>> tsquery_escape("'\"*|") | |
"'\"*|':*" | |
>>> tsquery_escape('not foo and (bar or "baz")') | |
"! 'foo':* & ( 'bar':* | 'baz' )" | |
""" | |
magic_terms = { | |
"and": "&", | |
"or": "|", | |
"not": "!", | |
"OR": "|", | |
"AND": "&", | |
"NOT": "!", | |
"(": "(", | |
")": ")", | |
} | |
magic_values = set(magic_terms.values()) | |
paren_count = 0 | |
res = [] | |
bits = re.split(r'((?:".*?")|[()])', term) | |
for bit in bits: | |
if not bit: | |
continue | |
split_bits = ( | |
[bit] if bit.startswith('"') and bit.endswith('"') else | |
bit.strip().split() | |
) | |
for bit in split_bits: | |
if not bit: | |
continue | |
if bit in magic_terms: | |
bit = magic_terms[bit] | |
last = res and res[-1] or "" | |
if bit == ")": | |
if last == "(": | |
paren_count -= 1 | |
res.pop() | |
continue | |
if paren_count == 0: | |
continue | |
if last in magic_values and last != "(": | |
res.pop() | |
elif bit == "|" and last == "&": | |
res.pop() | |
elif bit == "!": | |
pass | |
elif bit == "(": | |
pass | |
elif last in magic_values or not last: | |
continue | |
if bit == ")": | |
paren_count -= 1 | |
elif bit == "(": | |
paren_count += 1 | |
res.append(bit) | |
if bit == ")": | |
res.append("&") | |
continue | |
bit = bit.replace("'", "") | |
if not bit: | |
continue | |
if bit.startswith('"') and bit.endswith('"'): | |
res.append(bit.replace('"', "'")) | |
else: | |
res.append("'%s':*" %(bit.replace("'", ""), )) | |
res.append("&") | |
while res and res[-1] in magic_values: | |
last = res[-1] | |
if last == ")": | |
break | |
if last == "(": | |
paren_count -= 1 | |
res.pop() | |
while paren_count > 0: | |
res.append(")") | |
paren_count -= 1 | |
return " ".join(res) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
thank you for sharing this snippet! I think I found two bugs:
The following queries:
"
and""
get converted to:
'
and''
which throws a syntax error
A quick fix would be to replace line 78 with this one:
if not bit or bit in ['""', '"']:
There is probably a better solution