Skip to content

Instantly share code, notes, and snippets.

@knazarov
Created September 23, 2020 08:00
Show Gist options
  • Save knazarov/8102e558bac54cdd255e9025eee8aa11 to your computer and use it in GitHub Desktop.
Save knazarov/8102e558bac54cdd255e9025eee8aa11 to your computer and use it in GitHub Desktop.
Full text search example for Tarantool
#!/usr/bin/env tarantool
local pickle = require('pickle')
local yaml = require('yaml')
--- Build a trigram bit mask ("fingerprint") for a string.
-- The input is lowercased with string.lower, then every window of three
-- consecutive bytes is hashed to one of 64 bit positions; the result is the
-- union of those bits.
-- @param str text to fingerprint (processed byte by byte)
-- @return the string returned by pickle.pack('Q', mask), or "" when the
--         lowercased input is shorter than three bytes
function trivec(str)
    str = string.lower(str)
    if #str < 3 then
        return ""
    end
    local res = 0
    for i = 1, #str - 2 do
        -- A single ranged string.byte call yields all three bytes of the
        -- window, so no temporary one-character substrings are created.
        local b1, b2, b3 = string.byte(str, i, i + 2)
        local val = b1 * 10000 + b2 * 100 + b3
        -- The 1ULL operand makes the bit operations work on 64-bit integers.
        res = bit.bor(res, bit.lshift(1ULL, val % 64))
    end
    return pickle.pack('Q', res)
end
-- Start the database with default settings.
box.cfg({})

-- Space holding the accounts: id, name and the trigram mask of the name.
box.schema.space.create('account', { if_not_exists = true })

local account_space = box.space.account
account_space:format({
    { name = 'id', type = 'unsigned' },
    { name = 'name', type = 'string' },
    { name = 'trivec', type = 'string' },
})

-- Unique primary key over the id field.
account_space:create_index('primary', {
    unique = true,
    parts = { { field = 'id', type = 'unsigned' } },
    if_not_exists = true,
})

-- Non-unique BITSET index over field 3 (the 'trivec' field).
account_space:create_index('hash', {
    unique = false,
    type = 'BITSET',
    parts = { 3, type = 'string' },
    if_not_exists = true,
})
--- Store an account together with the trigram mask of its name.
-- @param id   value for the 'id' field
-- @param name value for the 'name' field; also fed to trivec()
-- @return whatever box.space.account:put returns
function put_user(id, name)
    local mask = trivec(name)
    local tuple = { id, name, mask }
    return box.space.account:put(tuple)
end
--- Find accounts whose name contains the given text, ignoring ASCII case.
-- Candidates are taken from the 'hash' BITSET index (tuples whose mask has
-- every bit of the query's mask set) and then confirmed with a real
-- substring check, because different trigrams can share a bit.
-- @param name_part text to look for
-- @return array of matching tuples (empty when nothing matches)
function find_user(name_part)
    -- Lowercase the query once instead of on every candidate.
    local needle = string.lower(name_part)
    local vec = trivec(name_part)
    local res = {}
    for _, v in box.space.account.index.hash:pairs(vec, {iterator = 'BITS_ALL_SET'}) do
        -- Plain search (4th argument true): the query is user text, not a Lua
        -- pattern, so characters such as '.', '-', '%' or '[' must match
        -- literally instead of being interpreted or raising an error.
        if string.find(string.lower(v.name), needle, 1, true) then
            table.insert(res, v)
        end
    end
    return res
end
-- Demo: store two accounts (ids 1 and 2), then print the YAML-encoded search result.
local demo_names = { "Konstantin Nazarov", "Konstantin Osipov" }
for demo_id = 1, #demo_names do
    put_user(demo_id, demo_names[demo_id])
end
local demo_hits = find_user('onst')
print(yaml.encode(demo_hits))
@filonenko-mikhail
Copy link

filonenko-mikhail commented Nov 3, 2020

----- text search bench ----
Space len: 	69778094
Space whole bsize (Mb): 	5414ULL
Search only first 35 tuples
Searching for 	онст
SEARCH TIME (seconds)	1.3714169999876
RESULT[1]	[6607,"Алистратова Константина",")c\u0003qu\u0007\u00011"]
RESULT[2]	[6608,"Алисултанов Константинина",")�\t0uEQ!"]	 and	33	  more
Searching for 	ova
SEARCH TIME (seconds)	21.423429000017
RESULT[1]	null
RESULT[2]	null	 and	33	  more
Searching for 	Филон
SEARCH TIME (seconds)	1.251500000013
RESULT[1]	[10650,"Апиканов Филонена","�\u0001Q\u00110\u0005a1"]
RESULT[2]	[10651,"Апиканова Филонида","�\u0011Q\u00111\u0005A\u0000"]	 and	33	  more
Searching for 	Кокорин
SEARCH TIME (seconds)	0.70181100000627
RESULT[1]	[122577,"Кокорин Парирух","!\u0013\u0010��\u0002#\u0010"]
RESULT[2]	[122578,"Кокорина Париса","!\u0003\u00101�\u0003\u0003\u0011"]	 and	33	  more
Searching for 	ихаил
SEARCH TIME (seconds)	0.78236699997797
RESULT[1]	[7962,"Амековский Михаилина",";#\u0003\u0011S\u0015\u00051"]
RESULT[2]	[32458,"Бордвинова Михаил",")\u0017\u0002\u0011\u0011\u0015C\u0011"]	 and	33	  more
Searching for 	артеп
SEARCH TIME (seconds)	0.11721799999941
RESULT[1]	null
RESULT[2]	null	 and	33	  more
Searching for 	ооп
SEARCH TIME (seconds)	2.750595999998
RESULT[1]	[6614,"Алита Кооператина",")\u0013\u0011q�\u0003\u0001!"]
RESULT[2]	[58144,"Генько Кооператина","5\u0011\u0011q�\u0003!1"]	 and	33	  more
Searching for 	секун
SEARCH TIME (seconds)	0.32387299998663
RESULT[1]	[196026,"Пасекунов Ильфир","\u000b\u000311!\u0006U�"]
RESULT[2]	[196027,"Пасекунова Ильфруз","\n\u00031�!\u0007U�"]	 and	33	  more
Searching for 	дорош
SEARCH TIME (seconds)	3.8413509999809
RESULT[1]	[285699,"Худорошко Аршалюс","M�\u0010\u0017c\u0003\u00121"]
RESULT[2]	[604174,"Худорошко Аршам","U�\u0010\u0013A\u0003\u00121"]	 and	33	  more
Searching for 	Прин
SEARCH TIME (seconds)	3.3689030000241
RESULT[1]	[19736,"Барбенкова Принсесс","!3 1�\u0017%1"]
RESULT[2]	[19737,"Барбер Принцесса","%3 1�3\u00001"]	 and	33	  more
Searching for 	Ягод
SEARCH TIME (seconds)	0.28668099999777
RESULT[1]	[23616,"Безусый Ягода","�\u0011�\u0019\u0000�!�"]
RESULT[2]	[49042,"Власова-Ягодина Усман","�\u0001p9\u00197Q\u0011"]	 and	33	  more
Searching for 	Жок
SEARCH TIME (seconds)	4.1069439999992
RESULT[1]	[5453,"Алабовская Жокадия","\t\u001a\u0001\u0001VGU1"]
RESULT[2]	[56983,"Гатчинова Жокадия","1\u001b\u0003QUGI\u0000"]	 and	33	  more
---
...

@ochaton
Copy link

ochaton commented Feb 15, 2021

WTF? Why is it working?

@ochaton
Copy link

ochaton commented Feb 15, 2021

Modifying constants in hash function seems to change nothing!

#!/usr/bin/env tarantool
require 'strict'.on()
local utf8 = require 'utf8'

-- Start the database with default settings.
box.cfg({})

--- Create the 'accounts' space with its primary and BITSET indexes.
local function create_accounts_schema()
	local accounts = box.schema.space.create('accounts', {
		format = {
			{ name = 'id',   type = 'unsigned' },
			{ name = 'name', type = 'string'   },
			{ name = 'bpf',  type = 'string'   },
		},
	})

	accounts:create_index('primary', { unique = true, parts = { 'id' } })
	accounts:create_index('bpf', { unique = false, type = 'BITSET', parts = { 'bpf' } })

	print("space accounts created")
end

-- Registered under the key 'schema:v1' so box.once decides whether to run it.
box.once('schema:v1', create_accounts_schema)

-- Table of public functions, published as the global 'api'.
-- NOTE(review): rawset is presumably used to get past the 'strict' module's
-- check on undeclared globals -- confirm.
local api = {}
rawset(_G, 'api', api)

--- Build a 64-bit trigram bit mask ("fingerprint") for a string.
-- Every window of three consecutive bytes is hashed to one of 64 bit
-- positions; the result is the union of those bits.
-- NOTE: masks produced by the previous 32-bit version of this function differ,
-- so tuples stored with it need to be registered again.
-- @param str text to fingerprint (processed byte by byte; callers lowercase it)
-- @return the string returned by pickle.pack('Q', mask), or "" when the input
--         is shorter than three bytes
local function trivec(str)
	if #str < 3 then return "" end

	-- With plain Lua numbers the LuaJIT bit.* functions work on 32 bits: the
	-- shift count is reduced modulo 32 and the result is a signed 32-bit value,
	-- so half of the mask was never used. uint64 cdata operands switch the
	-- same functions to 64-bit arithmetic.
	local ffi = require 'ffi'
	local one = ffi.new('uint64_t', 1)
	local res = ffi.new('uint64_t', 0)
	for i = 1, #str - 2 do
		-- One ranged byte() call returns the three bytes of the window.
		local b1, b2, b3 = str:byte(i, i + 2)
		local val = b1 * 0xffff + b2 * 0xff + b3

		res = bit.bor(res, bit.lshift(one, val % 64))
	end
	return require'pickle'.pack('Q', res)
end

--- Store an account together with the trigram mask of its lowercased name.
-- @param id   value for the 'id' field
-- @param name value for the 'name' field
-- @return whatever box.space.accounts:replace returns
function api.register(id, name)
	local fingerprint = trivec(utf8.lower(name))
	return box.space.accounts:replace({ id, name, fingerprint })
end

--- Find accounts whose name contains the given text, case-insensitively.
-- Candidates come from the 'bpf' BITSET index (tuples whose mask has every
-- bit of the query's mask set) and are then confirmed with a real substring
-- check on the lowercased name.
-- @param name text to look for
-- @return array of matching tuples
function api.find(name)
	name = utf8.lower(name)

	return box.space.accounts.index.bpf
		:pairs({ trivec(name) }, {iterator='BITS_ALL_SET'})
		:take(1000) -- hard limit on index candidates, applied before filtering
		:grep(function(account)
			-- Plain search (4th argument true): the query is user text, not
			-- a Lua pattern, so magic characters must match literally.
			return utf8.lower(account.name):find(name, 1, true) ~= nil
		end)
		:totable()
end

-- Start the console, then exit with status 0 once start() returns.
local console = require 'console'
console.start()
os.exit(0)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment