Skip to content

Instantly share code, notes, and snippets.

@evansd
Created August 6, 2018 14:18
Show Gist options
  • Save evansd/6b0dfafba20e92dcc463423811a02af5 to your computer and use it in GitHub Desktop.
Save evansd/6b0dfafba20e92dcc463423811a02af5 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import time
import sys
import lmdb
import numpy
import pyarrow
import scipy.sparse
#import zstandard
# Dimensions of every stored matrix. Presumably one row per practice and one
# column per month -- TODO confirm against the code that wrote the database.
PRACTICES = 15384
MONTHS = 60
# Shape used both when rebuilding each sparse matrix and when allocating the
# dense accumulator that the matrices are summed into.
SHAPE = (PRACTICES, MONTHS)
# Disabled experiments, kept for reference: a fixed value dtype and zstandard
# decompression of the stored buffers.
#VALUE_TYPE = numpy.uint16
#decompressor = zstandard.ZstdDecompressor()
def deserialize(buf, dtype):
    """Rebuild one stored sparse matrix from its serialized bytes.

    Args:
        buf: Buffer holding a value serialized with pyarrow.
        dtype: NumPy dtype the resulting matrix should use.

    Returns:
        A ``scipy.sparse.csc_matrix`` of shape ``SHAPE`` and the given dtype.
    """
    # The zstandard decompression step is currently disabled; the buffer is
    # handed to pyarrow as stored.
    # NOTE(review): the deserialized payload is presumably the component
    # arrays of a sparse matrix (e.g. data/indices/indptr) -- confirm against
    # the code that wrote the database.
    return scipy.sparse.csc_matrix(
        pyarrow.deserialize(buf),
        shape=SHAPE,
        dtype=dtype,
    )
# Shared LMDB environment, opened read-only and without locking. The path is
# the data file itself rather than a directory (subdir=False), and max_dbs
# leaves room for the named databases that sum_values() opens by column name.
env = lmdb.open(
    'lmdb-test-real-data.mdb',
    subdir=False,
    readonly=True,
    lock=False,
    max_dbs=4,
    map_size=100000000000,
)
def sum_values(env, column, prefix):
    """Sum every matrix stored under ``column`` whose key starts with ``prefix``.

    Args:
        env: Open LMDB environment holding one named database per column.
        column: Name of the database to read. Must be ``'items'`` or
            ``'quantity'``; any other value raises ``KeyError``.
        prefix: ASCII key prefix selecting which entries are added together.

    Returns:
        The element-wise sum of all matching matrices, with shape ``SHAPE``.
        All zeros when no key matches the prefix.

    Side effects:
        Prints the number of matrices that were summed.
    """
    # numpy.float64 is what the numpy.float_ alias referred to; the alias was
    # removed in NumPy 2.0.
    dtype = {'items': numpy.uint16, 'quantity': numpy.float64}[column]
    db = env.open_db(key=column.encode('ascii'))
    prefix = prefix.encode('ascii')
    # NOTE(review): the accumulator shares the dtype of the stored values, so
    # an 'items' total above 65535 in any cell would wrap around in uint16 --
    # confirm whether that can happen for the prefixes in use.
    values = numpy.zeros(SHAPE, dtype=dtype)
    n = 0
    with env.begin(db=db, buffers=True) as txn:
        cursor = txn.cursor()
        # Seek to the first key >= prefix. This must be a real statement, not
        # an assert: under ``python -O`` an assert is stripped and the scan
        # would start from the first key in the database instead. A False
        # result means no key can match, so the zero matrix is returned.
        if cursor.set_range(prefix):
            for key, next_val in cursor:
                # With buffers=True keys arrive as buffer objects; copy to
                # bytes so startswith() is available.
                if not bytes(key).startswith(prefix):
                    # Keys are sorted, so the first mismatch ends the run.
                    break
                values = values + deserialize(next_val, dtype)
                n += 1
    print(n)
    return values
# Script body: sum the 'items' and 'quantity' matrices for the key prefix
# given on the command line, then print the elapsed time and the
# element-wise quantity/items ratio.
if len(sys.argv) < 2:
    # Exit with a usage message instead of an IndexError traceback.
    sys.exit('usage: {} PREFIX'.format(sys.argv[0]))

# Cells with zero items divide by zero; silence the warnings and let those
# cells come out as inf/nan in the printed ratio.
numpy.seterr(divide='ignore', invalid='ignore')
start = time.time()
items = sum_values(env, 'items', sys.argv[1])
quantity = sum_values(env, 'quantity', sys.argv[1])
ratio = quantity / items
print('Time', time.time() - start)
print(ratio)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment