Skip to content

Instantly share code, notes, and snippets.

@glciampaglia
Last active December 18, 2015 20:49
Show Gist options
  • Save glciampaglia/5843064 to your computer and use it in GitHub Desktop.
Save glciampaglia/5843064 to your computer and use it in GitHub Desktop.
Code for writing a sparse adjacency matrix in COO format to PyTables' CArray representation.
# Adapted from
# http://pytables.github.io/usersguide/libref/homogenous_storage.html#the-carray-class
import os
import numpy
import tables
from itertools import groupby
from operator import itemgetter
from time import time
# Script: copy (row, col, weight) records loaded from a .npy file into an
# N x N compressed PyTables CArray stored in an HDF5 file, printing the
# write throughput as it goes.

# A progress line is printed once per `print_every` written records.
print_every = 1000

# full DBPedia data
fileName = os.path.expanduser('~/data/dbpedia/carray_all.h5')
infileName = os.path.expanduser('~/data/dbpedia/adjacency_all.npy')
N = 3141881

# # test data from 30K edges
# fileName = os.path.expanduser('~/data/dbpedia/carray_test30k.h5')
# infileName = os.path.expanduser('~/data/dbpedia/adjacency_test30k.npy')
# N = 6463

shape = (N, N)  # the output array is square, N on each side
atom = tables.Float64Atom()
filters = tables.Filters(complevel=5, complib='zlib')

totWrittenRecs = 0  # in total
writtenRecs = 0  # since the previous progress report

h5f = tables.open_file(fileName, 'w')
try:
    # The try/finally guarantees the HDF5 file is closed even when loading
    # the input or writing a record fails part-way through.
    ca = h5f.create_carray(h5f.root, 'carray', atom, shape, filters=filters,
                           chunkshape=(1, 100))  # or some other sensible value

    print('Reading data from {}.'.format(infileName))
    coords = numpy.load(infileName)
    # NOTE(review): this presumably expects a structured array of
    # (row, col, weight) records, which an in-place sort orders record by
    # record. On a plain 2-D array the default sort works along the last
    # axis and would reorder the values inside each row -- confirm the
    # dtype of the input file.
    coords.sort()
    print('Sorted data.')

    Nrecs = len(coords)
    print('Writing data to {}.'.format(fileName))
    tic = time()
    lap = tic
    for r, c, w in coords:
        ca[r, c] = w
        writtenRecs += 1
        # Count every record as it is written so that the final total is
        # exact even when Nrecs is not a multiple of `print_every`.
        totWrittenRecs += 1
        if totWrittenRecs % print_every == 0:
            prev_lap, lap = lap, time()
            intertime = lap - prev_lap
            # Guard against a clock that did not advance between two laps.
            if intertime > 0:
                speed = float(writtenRecs) / intertime
            else:
                speed = float('inf')
            p = float(totWrittenRecs) / Nrecs * 100
            print('* {} of {} records ({:.2f}%) written to disk in {:.4f} seconds '
                  '({:.2e} recs/s)'.format(totWrittenRecs, Nrecs, p, intertime,
                                           speed))
            writtenRecs = 0
finally:
    h5f.close()

# The end time is taken after the file is closed, so the total includes the
# time spent in close().
toc = time()
elapsed = toc - tic
if elapsed > 0:
    totSpeed = float(totWrittenRecs) / elapsed
else:
    totSpeed = float('inf')
print('TOTAL: Wrote {} records to disk in {:.2f} seconds ({:.2e} recs/s)'.format(
    totWrittenRecs, elapsed, totSpeed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment