Last active
September 19, 2018 06:17
-
-
Save healiseu/0856470814b87f78d188dd1ec82d712b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sorted float secondary index: An in-memory data structures comparison of Python, NumPy and Redis data structures | |
# (C) By Athanassios I. Hatzis | |
# 18th of September 2018 | |
# Written to explain issue: https://github.com/RedisLabsModules/RediSearch/issues/493
# | |
# OS Environment: Linux Ubuntu x64 | |
# Processor: Intel(R) Core(TM) i3 CPU 540 @ 3.07GHz | |
# Memory 16 GB | |
from bisect import bisect_left, bisect_right | |
from collections import OrderedDict | |
import numpy as np | |
import time | |
import sys | |
from walrus import Database | |
from redisearch import Client, NumericField, Query | |
class SlicableOrderedDict(OrderedDict):
    """OrderedDict that additionally supports positional slicing.

    ``d[i:j]`` returns a new SlicableOrderedDict containing the entries
    whose insertion position falls in ``[i, j)``.  Non-slice keys behave
    exactly like a normal OrderedDict lookup.
    """

    def __getitem__(self, k):
        if not isinstance(k, slice):
            return OrderedDict.__getitem__(self, k)
        # slice.indices normalizes None / negative bounds against len(self),
        # so open-ended slices such as d[:5] or d[2:] work (the original
        # `k.start <= idx < k.stop` raised TypeError on a None bound).
        start, stop, _ = k.indices(len(self))
        sliced = SlicableOrderedDict()
        for idx, key in enumerate(self.keys()):
            if start <= idx < stop:
                sliced[key] = self[key]
        return sliced
# ********************************************************************************************** | |
# --------------------- Utility Functions ------------------------------------------------------------- | |
# ********************************************************************************************** | |
def ndx_range(lo, hi, ndx):
    """Return the slice of the sorted index with keys strictly between lo and hi.

    Parameters renamed from ``min``/``max`` to avoid shadowing the builtins
    (all calls in this file pass them positionally).

    :param lo: exclusive lower bound of index key values
    :param hi: exclusive upper bound of index key values
    :param ndx: SlicableOrderedDict sorted ascending by key (the float index)
    :return: a SlicableOrderedDict of the entries with lo < key < hi
    :raises ValueError: when no key lies above lo, or every key is below hi's
        leftmost position (propagated from find_gt / find_lt)
    """
    lower_bound = find_gt(ndx, lo)   # position of first key > lo
    upper_bound = find_lt(ndx, hi)   # exclusive stop: position of first key >= hi
    return ndx[lower_bound:upper_bound]
def find_gt(ndx, k):
    """Return the position of the leftmost key strictly greater than k.

    :param ndx: mapping whose keys are in ascending sorted order
    :param k: key value to search above
    :return: index (into the sorted key sequence) of the first key > k
    :raises ValueError: if every key is <= k
    """
    keys = list(ndx.keys())
    pos = bisect_right(keys, k)
    if pos == len(keys):
        raise ValueError
    return pos
def find_lt(ndx, k):
    """Return the exclusive upper slice bound for keys less than k.

    This is the index of the first key >= k (i.e. one past the rightmost
    key < k), which is exactly what ndx_range needs as a slice stop.

    :param ndx: mapping whose keys are in ascending sorted order
    :param k: key value to search below
    :return: bisect_left position of k in the sorted key sequence
    :raises ValueError: if no key is less than k (position would be 0)
    """
    keys = list(ndx.keys())
    pos = bisect_left(keys, k)
    if not pos:
        raise ValueError
    return pos
def bytes2mb(num_bytes):
    """Convert a byte count to mebibytes (MiB), rounded to one decimal place.

    Parameter renamed from ``bytes`` to avoid shadowing the builtin
    (the call in this file is positional).

    :param num_bytes: size in bytes
    :return: size in MiB as a float with one decimal digit
    """
    return round(num_bytes / 1024 ** 2, 1)
def array2dict(arr):
    """Map each element of *arr* to its position: ``{value: index}``.

    Built in a single comprehension; on duplicate values the last
    position wins, matching the original build-then-invert behavior.
    """
    return {value: pos for pos, value in enumerate(arr)}
def array2ordered_dict(arr):
    """Build a sorted secondary index from *arr*.

    Returns a SlicableOrderedDict mapping float value -> original array
    position, with entries ordered ascending by value.
    """
    value_to_pos = array2dict(arr)
    # Keys are unique after inversion, so a plain item sort orders by key
    # (the float value) — equivalent to key=lambda t: t[0].
    return SlicableOrderedDict(sorted(value_to_pos.items()))
def floats_sample(size, bounds):
    """Draw *size* uniform random floats from the half-open interval *bounds*.

    Parameter renamed from ``range`` to avoid shadowing the builtin
    (the call in this file is positional).

    :param size: number of samples to draw
    :param bounds: (low, high) pair; samples fall in [low, high)
    :return: 1-D numpy array of float64 samples
    """
    low, high = bounds
    # np.random.sample yields floats in [0, 1); rescale to [low, high).
    return (high - low) * np.random.sample(size) + low
def redis_dict(arr):
    """Build a Redis-ready score mapping from *arr*.

    :param arr: numpy array (or sequence) of floats
    :return: dict whose keys are the positions as numeric strings and
        whose values are the corresponding floats
    """
    return {str(pos): value for pos, value in enumerate(arr)}
# **********************************************************************************************
########## 1. Create float secondary index with Python/Numpy data structure ####################
# **********************************************************************************************
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
# perf_counter measures wall-clock time; process_time measures CPU time of
# this process only (excludes sleep/IO waits).
t1_start = time.perf_counter()
t2_start = time.process_time()
# ----------------- Create NumPy Array of Float Values ------------------------
sample_size = 250000  # sample size of 250,000 float values
sample_interval = (-5000, 5000)  # sampling range of values min=-5000, max=5000
flt_arr = floats_sample(sample_size, sample_interval)
# ---------------- Create Python Ordered Dictionary to represent sorted float index
# Maps float value -> original array position, ordered ascending by value.
flt_sorted_ndx = array2ordered_dict(flt_arr)
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"Python/NumPy - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"Python/NumPy - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Python/NumPy - elapsed time for 250000 floats : 1 [sec]
# Python/NumPy - cpu process time for 250000 floats: 1 [sec]
# Sanity check: looking the 30th value up in the index yields its array position.
# NOTE(review): this bare comparison only echoes True in an interactive
# (IPython/REPL) session; as a script its result is silently discarded.
29 == flt_sorted_ndx[flt_arr[29]]
# In order to create the redis sorted set keys must be of type string
# these keys are numerical string integers and the values are the float numbers
flt_scores = redis_dict(flt_arr)
# ------------------ Print Memory Size Info ------------------------------
print(f'Size: {flt_arr.size}')
print(f'Data Type: {flt_arr.dtype}')
print(f'Memory Size: {flt_arr.nbytes}')
# sys.getsizeof is shallow: for the dict it counts the container only, not
# the referenced key/value objects, so this total is a lower bound.
in_memory_total = bytes2mb(sys.getsizeof(flt_arr) + sys.getsizeof(flt_sorted_ndx))
print(f'Numpy Array and Ordered Dictionary - Total memory size: {in_memory_total} MB ')
# Size: 250000
# Data Type: float64
# Memory Size: 2000000
# Numpy Array and Ordered Dictionary - Total memory size: 23.5 MB
# ************************************************************************************************
# ############### 2. Create float secondary index with Walrus/Redis Data Structure ###############
# *************************************************************************************************
# -------------------- Create Redis-Walrus Structures --------------------------------------------
# NOTE(review): requires a Redis server listening on localhost:6379;
# database 9 is wiped by flushdb() below.
db = Database(host='localhost', port=6379, db=9)
db.flushdb()
db.dbsize()
# Float Values are stored in a Redis LIST (Walrus Array)
FloatField = db.Array('FloatField')
# Float Index is created and stored with a Redis ZSET
FloatNDX = db.ZSet('FloatNDX')
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
FloatField.extend(flt_arr)
# ZSET insert: member = array position (numeric string), score = float value,
# so Redis keeps the members ordered by float score.
FloatNDX.add(**flt_scores)
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"Walrus-Redis - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"Walrus-Redis - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Walrus-Redis - elapsed time for 250000 floats : 6 [sec]
# Walrus-Redis - cpu process time for 250000 floats: 5 [sec]
# Redis Server Before Insertion
# used_memory_human : 0.99 MB
# used_memory_dataset: 0.05 MB
# Redis Server After Insertion
# used_memory_human : 36.92 MB
# used_memory_dataset: 35.96 MB
# Save RDB policy: after 600 sec (10 min) if at least 10 keys changed
# Redis 4.0.8 (00000000/0) 64 bit
# ************************************************************************************************
# ############### 3. Create float secondary index with RediSearch Module ###############
# *************************************************************************************************
# Creating a client with a given index name - DVS (Data Value Store)
# NOTE(review): requires Redis with the RediSearch module loaded.
redclient = Client('DVS')
# Creating the index definition and schema: one sortable numeric field 'flt'
redclient.create_index([NumericField('flt', sortable=True)])
batchndx = redclient.batch_indexer(chunk_size=500)
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
# Plain loop instead of a throwaway list comprehension: we iterate purely
# for the side effect of queueing documents on the batch indexer.
for n in range(sample_size):
    batchndx.add_document(doc_id=n, flt=flt_arr[n])
batchndx.commit()
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"RediSearch Module - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"RediSearch Module - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# RediSearch Module - elapsed time for 250000 floats (500 chunks) : 25 [sec]
# RediSearch Module - cpu process time for 250000 floats (500 chunks) : 17 [sec]
# Redis Server Before Insertion
# used_memory_human : 0.99 MB
# used_memory_dataset: 0.05 MB
# Redis Server After Insertion
# used_memory_human : 66.62 MB
# used_memory_dataset: 54.12 MB
# Redis 4.0.8 (00000000/0) 64 bit
# Save RDB policy: after 600 sec (10 min) if at least 10 keys changed
print(f"Fields : {redclient.info()['fields']}")
print(f"Total values : {redclient.info()['num_docs']}")
print(f"Total size mb : {redclient.info()['doc_table_size_mb']}")
print(f"Sortable values size mb : {redclient.info()['sortable_values_size_mb']}")
#Fields : [[b'flt', b'type', b'NUMERIC', b'SORTABLE']]
#Total values : 250000
#Total size mb : 20.874872207641602
#Sortable values size mb : 5.7220458984375
# ************************************************************************************************
# +++++++++++++++++++++++++++ SEARCHING BENCHMARKS WITH INDEX FOR 1, 2, 3 Cases +++++++++++++++++
# ***********************************************************************************************
# NOTE(review): this section uses IPython %timeit magics, so it must be run
# inside an IPython/Jupyter session, not as a plain Python script.
val_range = (2300, 2303)  # query window: float values strictly between 2300 and 2303
# 1. Python/NumPy — slice the sorted OrderedDict index via bisect bounds
len(ndx_range(*val_range, flt_sorted_ndx))
ndx_range(*val_range, flt_sorted_ndx)
# 2. Redis/Walrus — ZSET score-range queries
FloatNDX.count(*val_range)
FloatNDX.range_by_score(*val_range, with_scores=True)
# 3. RediSearch Module — numeric-field range query on 'flt'
res = redclient.search('@flt:[2300 2303]')
res.total
res.docs
redclient.search(Query('@flt:[2300 2303]').return_fields('flt').sort_by('flt')).docs
# 2. Python Range Query Time Benchmark
%timeit -r 7 -n 10 ndx_range(*val_range, flt_sorted_ndx)
# 245 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 2. Redis Range Query Time Benchmark
%timeit -r 7 -n 1000 FloatNDX.range_by_score(*val_range, with_scores=True)
# 490 µs ± 82.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# 3. RediSearch Query Time Benchmark
%timeit -r 7 -n 1000 redclient.search('@flt:[2300 2303]')
# 434 µs ± 6.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment