Last active
September 19, 2018 06:17
-
-
Save healiseu/0856470814b87f78d188dd1ec82d712b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sorted float secondary index: An in-memory data structures comparison of Python, NumPy and Redis data structures | |
# (C) By Athanassios I. Hatzis | |
# 18th of September 2018 | |
# Written to explain issue: https://github.com/RedisLabsModules/RediSearch/issues/493
# | |
# OS Environment: Linux Ubuntu x64 | |
# Processor: Intel(R) Core(TM) i3 CPU 540 @ 3.07GHz | |
# Memory 16 GB | |
from bisect import bisect_left, bisect_right | |
from collections import OrderedDict | |
import numpy as np | |
import time | |
import sys | |
from walrus import Database | |
from redisearch import Client, NumericField, Query | |
class SlicableOrderedDict(OrderedDict):
    """OrderedDict that additionally supports positional slicing.

    ``d[i:j]`` returns a new SlicableOrderedDict containing the entries
    whose insertion position falls in ``[i, j)``.  Non-slice keys behave
    exactly like a normal OrderedDict lookup.
    """

    def __getitem__(self, k):
        if not isinstance(k, slice):
            return OrderedDict.__getitem__(self, k)
        # slice.indices normalizes None / negative bounds against len(self),
        # so open-ended slices such as d[:5] or d[2:] work (the original
        # `k.start <= idx < k.stop` raised TypeError on a None bound).
        start, stop, _ = k.indices(len(self))
        sliced = SlicableOrderedDict()
        for idx, key in enumerate(self.keys()):
            if start <= idx < stop:
                sliced[key] = self[key]
        return sliced
# ********************************************************************************************** | |
# --------------------- Utility Functions ------------------------------------------------------------- | |
# ********************************************************************************************** | |
def ndx_range(lo, hi, ndx):
    """Return the slice of the sorted index with keys strictly between lo and hi.

    Parameters renamed from ``min``/``max`` to avoid shadowing the builtins
    (all calls in this file pass them positionally).

    :param lo: exclusive lower bound of index key values
    :param hi: exclusive upper bound of index key values
    :param ndx: SlicableOrderedDict sorted ascending by key (the float index)
    :return: a SlicableOrderedDict of the entries with lo < key < hi
    :raises ValueError: when no key lies above lo, or every key is below hi's
        leftmost position (propagated from find_gt / find_lt)
    """
    lower_bound = find_gt(ndx, lo)   # position of first key > lo
    upper_bound = find_lt(ndx, hi)   # exclusive stop: position of first key >= hi
    return ndx[lower_bound:upper_bound]
def find_gt(ndx, k):
    """Return the position of the leftmost key strictly greater than k.

    :param ndx: mapping whose keys are in ascending sorted order
    :param k: key value to search above
    :return: index (into the sorted key sequence) of the first key > k
    :raises ValueError: if every key is <= k
    """
    keys = list(ndx.keys())
    pos = bisect_right(keys, k)
    if pos == len(keys):
        raise ValueError
    return pos
def find_lt(ndx, k):
    """Return the exclusive upper slice bound for keys less than k.

    This is the index of the first key >= k (i.e. one past the rightmost
    key < k), which is exactly what ndx_range needs as a slice stop.

    :param ndx: mapping whose keys are in ascending sorted order
    :param k: key value to search below
    :return: bisect_left position of k in the sorted key sequence
    :raises ValueError: if no key is less than k (position would be 0)
    """
    keys = list(ndx.keys())
    pos = bisect_left(keys, k)
    if not pos:
        raise ValueError
    return pos
def bytes2mb(num_bytes):
    """Convert a byte count to mebibytes (MiB), rounded to one decimal place.

    Parameter renamed from ``bytes`` to avoid shadowing the builtin
    (the call in this file is positional).

    :param num_bytes: size in bytes
    :return: size in MiB as a float with one decimal digit
    """
    return round(num_bytes / 1024 ** 2, 1)
def array2dict(arr):
    """Map each element of *arr* to its position: ``{value: index}``.

    Built in a single comprehension; on duplicate values the last
    position wins, matching the original build-then-invert behavior.
    """
    return {value: pos for pos, value in enumerate(arr)}
def array2ordered_dict(arr):
    """Build a sorted secondary index from *arr*.

    Returns a SlicableOrderedDict mapping float value -> original array
    position, with entries ordered ascending by value.
    """
    value_to_pos = array2dict(arr)
    # Keys are unique after inversion, so a plain item sort orders by key
    # (the float value) — equivalent to key=lambda t: t[0].
    return SlicableOrderedDict(sorted(value_to_pos.items()))
def floats_sample(size, bounds):
    """Draw *size* uniform random floats from the half-open interval *bounds*.

    Parameter renamed from ``range`` to avoid shadowing the builtin
    (the call in this file is positional).

    :param size: number of samples to draw
    :param bounds: (low, high) pair; samples fall in [low, high)
    :return: 1-D numpy array of float64 samples
    """
    low, high = bounds
    # np.random.sample yields floats in [0, 1); rescale to [low, high).
    return (high - low) * np.random.sample(size) + low
def redis_dict(arr):
    """Build a Redis-ready score mapping from *arr*.

    :param arr: numpy array (or sequence) of floats
    :return: dict whose keys are the positions as numeric strings and
        whose values are the corresponding floats
    """
    return {str(pos): value for pos, value in enumerate(arr)}
# **********************************************************************************************
########## 1. Create float secondary index with Python/Numpy data structure ####################
# **********************************************************************************************
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
# perf_counter measures wall-clock time; process_time measures CPU time of
# this process only (excludes sleep/IO waits).
t1_start = time.perf_counter()
t2_start = time.process_time()
# ----------------- Create NumPy Array of Float Values ------------------------
sample_size = 250000  # sample size of 250,000 float values
sample_interval = (-5000, 5000)  # sampling range of values min=-5000, max=5000
flt_arr = floats_sample(sample_size, sample_interval)
# ---------------- Create Python Ordered Dictionary to represent sorted float index
# Maps float value -> original array position, ordered ascending by value.
flt_sorted_ndx = array2ordered_dict(flt_arr)
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"Python/NumPy - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"Python/NumPy - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Python/NumPy - elapsed time for 250000 floats : 1 [sec]
# Python/NumPy - cpu process time for 250000 floats: 1 [sec]
# Sanity check: looking the 30th value up in the index yields its array position.
# NOTE(review): this bare comparison only echoes True in an interactive
# (IPython/REPL) session; as a script its result is silently discarded.
29 == flt_sorted_ndx[flt_arr[29]]
# In order to create the redis sorted set keys must be of type string
# these keys are numerical string integers and the values are the float numbers
flt_scores = redis_dict(flt_arr)
# ------------------ Print Memory Size Info ------------------------------
print(f'Size: {flt_arr.size}')
print(f'Data Type: {flt_arr.dtype}')
print(f'Memory Size: {flt_arr.nbytes}')
# sys.getsizeof is shallow: for the dict it counts the container only, not
# the referenced key/value objects, so this total is a lower bound.
in_memory_total = bytes2mb(sys.getsizeof(flt_arr) + sys.getsizeof(flt_sorted_ndx))
print(f'Numpy Array and Ordered Dictionary - Total memory size: {in_memory_total} MB ')
# Size: 250000
# Data Type: float64
# Memory Size: 2000000
# Numpy Array and Ordered Dictionary - Total memory size: 23.5 MB
# ************************************************************************************************
# ############### 2. Create float secondary index with Walrus/Redis Data Structure ###############
# *************************************************************************************************
# -------------------- Create Redis-Walrus Structures --------------------------------------------
# NOTE(review): requires a Redis server listening on localhost:6379;
# database 9 is wiped by flushdb() below.
db = Database(host='localhost', port=6379, db=9)
db.flushdb()
db.dbsize()
# Float Values are stored in a Redis LIST (Walrus Array)
FloatField = db.Array('FloatField')
# Float Index is created and stored with a Redis ZSET
FloatNDX = db.ZSet('FloatNDX')
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
FloatField.extend(flt_arr)
# ZSET insert: member = array position (numeric string), score = float value,
# so Redis keeps the members ordered by float score.
FloatNDX.add(**flt_scores)
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"Walrus-Redis - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"Walrus-Redis - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Walrus-Redis - elapsed time for 250000 floats : 6 [sec]
# Walrus-Redis - cpu process time for 250000 floats: 5 [sec]
# Redis Server Before Insertion
# used_memory_human : 0.99 MB
# used_memory_dataset: 0.05 MB
# Redis Server After Insertion
# used_memory_human : 36.92 MB
# used_memory_dataset: 35.96 MB
# Save RDB policy: after 600 sec (10 min) if at least 10 keys changed
# Redis 4.0.8 (00000000/0) 64 bit
# ************************************************************************************************
# ############### 3. Create float secondary index with RediSearch Module ###############
# *************************************************************************************************
# Creating a client with a given index name - DVS (Data Value Store)
# NOTE(review): requires Redis with the RediSearch module loaded.
redclient = Client('DVS')
# Creating the index definition and schema: one sortable numeric field 'flt'
redclient.create_index([NumericField('flt', sortable=True)])
batchndx = redclient.batch_indexer(chunk_size=500)
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
# Plain loop instead of a throwaway list comprehension: we iterate purely
# for the side effect of queueing documents on the batch indexer.
for n in range(sample_size):
    batchndx.add_document(doc_id=n, flt=flt_arr[n])
batchndx.commit()
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"RediSearch Module - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"RediSearch Module - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# RediSearch Module - elapsed time for 250000 floats (500 chunks) : 25 [sec]
# RediSearch Module - cpu process time for 250000 floats (500 chunks) : 17 [sec]
# Redis Server Before Insertion
# used_memory_human : 0.99 MB
# used_memory_dataset: 0.05 MB
# Redis Server After Insertion
# used_memory_human : 66.62 MB
# used_memory_dataset: 54.12 MB
# Redis 4.0.8 (00000000/0) 64 bit
# Save RDB policy: after 600 sec (10 min) if at least 10 keys changed
print(f"Fields : {redclient.info()['fields']}")
print(f"Total values : {redclient.info()['num_docs']}")
print(f"Total size mb : {redclient.info()['doc_table_size_mb']}")
print(f"Sortable values size mb : {redclient.info()['sortable_values_size_mb']}")
#Fields : [[b'flt', b'type', b'NUMERIC', b'SORTABLE']]
#Total values : 250000
#Total size mb : 20.874872207641602
#Sortable values size mb : 5.7220458984375
# ************************************************************************************************
# +++++++++++++++++++++++++++ SEARCHING BENCHMARKS WITH INDEX FOR 1, 2, 3 Cases +++++++++++++++++
# ***********************************************************************************************
# NOTE(review): this section uses IPython %timeit magics, so it must be run
# inside an IPython/Jupyter session, not as a plain Python script.
val_range = (2300, 2303)  # query window: float values strictly between 2300 and 2303
# 1. Python/NumPy — slice the sorted OrderedDict index via bisect bounds
len(ndx_range(*val_range, flt_sorted_ndx))
ndx_range(*val_range, flt_sorted_ndx)
# 2. Redis/Walrus — ZSET score-range queries
FloatNDX.count(*val_range)
FloatNDX.range_by_score(*val_range, with_scores=True)
# 3. RediSearch Module — numeric-field range query on 'flt'
res = redclient.search('@flt:[2300 2303]')
res.total
res.docs
redclient.search(Query('@flt:[2300 2303]').return_fields('flt').sort_by('flt')).docs
# 2. Python Range Query Time Benchmark
%timeit -r 7 -n 10 ndx_range(*val_range, flt_sorted_ndx)
# 245 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 2. Redis Range Query Time Benchmark
%timeit -r 7 -n 1000 FloatNDX.range_by_score(*val_range, with_scores=True)
# 490 µs ± 82.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# 3. RediSearch Query Time Benchmark
%timeit -r 7 -n 1000 redclient.search('@flt:[2300 2303]')
# 434 µs ± 6.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment