Philipp Singer psinger

Senior Principal Data Scientist at H2O.AI. PhD in computer science. Kaggle #1

psinger / pymc_correlation_1.py

Created March 18, 2018 14:43

	from pymc import Normal, Uniform, MvNormal, Exponential
	from numpy.linalg import inv, det
	from numpy import log, pi, dot
	import numpy as np
	from scipy.special import gammaln

	def _model(data, robust=False):
	# priors might be adapted here to be less flat
	mu = Normal('mu', 0, 0.000001, size=2)
	sigma = Uniform('sigma', 0, 1000, size=2)

psinger / huge_matrices_7.py

Created March 18, 2018 13:54

	#let's get the third row
	a = h5.root
	b = csr_matrix((a.data[a.indptr[3]:a.indptr[3+1]], a.indices[a.indptr[3]:a.indptr[3+1]], np.array([0,len(a.indices[a.indptr[3]:a.indptr[3+1]])])), shape=(1,n))

psinger / huge_matrices_6.py

Created March 18, 2018 13:54

	h5 = tb.open_file('dot.h5', 'r')
	a = csr_matrix((h5.root.data[:], h5.root.indices[:], h5.root.indptr[:]), shape=(l,n))

psinger / huge_matrices_5.py

Created March 18, 2018 13:54

	import numpy as np
	f = tb.open_file('dot.h5', 'w')
	filters = tb.Filters(complevel=5, complib='blosc')
	out_data = f.create_earray(f.root, 'data', tb.Float32Atom(), shape=(0,), filters=filters)
	out_indices = f.create_earray(f.root, 'indices', tb.Int32Atom(),shape=(0,), filters=filters)
	out_indptr = f.create_earray(f.root, 'indptr', tb.Int32Atom(), shape=(0,), filters=filters)
	out_indptr.append(np.array([0])) #this is needed as a first indptr
	max_indptr = 0
	for i in range(0, l, bl):
	res = a[i:min(i+bl, l),:].dot(b)

psinger / gist:9827b22d7c46e7624613941bb8c07f00

Created March 18, 2018 13:53

huge_matrices_4.py

	h5 = tb.open_file('dot.h5', 'r')
	a = csr_matrix((h5.root.data[:], (h5.root.ri[:], h5.root.ci[:])), shape=(l,n))

psinger / huge_matrices_3.py

Created March 18, 2018 13:53

	f = tb.open_file('dot.h5', 'w')
	filters = tb.Filters(complevel=5, complib='blosc')
	out_data = f.create_earray(f.root, 'data', tb.Float32Atom(), shape=(0,), filters=filters)
	out_ri = f.create_earray(f.root, 'ri', tb.Float32Atom(),shape=(0,), filters=filters)
	out_ci = f.create_earray(f.root, 'ci', tb.Float32Atom(), shape=(0,), filters=filters)
	for i in range(0, l, bl):
	res = a.dot(b[:,i:min(i+bl, l)])
	vals = res.data
	ri, ci = res.nonzero()
	out_data.append(vals)

psinger / huge_matrices_2.py

Created March 18, 2018 13:52

	h5 = tb.open_file('dot.h5', 'r')
	a = h5.root.data
	row = a[0,:] #only one row gets loaded into memory
	print row

psinger / huge_matrices_1.py

Created March 18, 2018 13:51

	from scipy.sparse import csr_matrix, rand
	import tables as tb
	a = rand(2000,2000, format='csr') #imagine that many values are stored in this matrix and that sparsity is low
	b = a.T
	l, m, n = a.shape[0], a.shape[1], b.shape[1]

	f = tb.open_file('dot.h5', 'w')
	filters = tb.Filters(complevel=5, complib='blosc')
	out = f.create_carray(f.root, 'data', tb.Float32Atom(), shape=(l, n), filters=filters)

psinger / pl_pymc_analyze.py

Created March 18, 2018 13:35

	def analyze(data, discrete=True, xmin=1.):
	model = mc.MCMC(_model(data,discrete,xmin))
	model.sample(5000)

	print
	print(model.stats()['alpha']['mean'])

	mc.Matplot.plot(model)

psinger / pl_pymc_model.py

Created March 18, 2018 13:34

	import pymc as mc
	from scipy.special import zeta

	def _model(data, discrete=True, xmin=1.):
	alpha = mc.Exponential('alpha', 1. / 1.5)

	@mc.stochastic(observed=True)
	def custom_stochastic(value=data, alpha=alpha, xmin=xmin, discrete=discrete):
	value = value[value >= xmin]
	if discrete == True: