l1 generative implementation with liblbfgs (owl-qn)
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<module 'l1gen' from 'l1gen.py'>"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from __future__ import division, print_function\n",
    "import numpy as np\n",
    "import l1gen; reload(l1gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = np.array([\n",
    " [10,1,1,0],\n",
    " [3,4,1,0],\n",
    " [100,5,1,1],\n",
    " ])\n",
    "Y = np.array([\n",
    " [1,1],\n",
    " [1,0],\n",
    " [0,0],\n",
    " ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "176.0593838622261"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reload(l1gen)\n",
    "ga = np.zeros((2,4))\n",
    "gr = np.zeros((2,4))\n",
    "l1gen.nll_and_grad(ga, gr, np.zeros(4), X,Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3 docs, 4 vocab, 2 doc features\n",
      "iter 1 linestep 0.220519014352 nnz 3 nll 52.3496042347 weight,grad norms 1.0 1.4127133275\n",
      "iter 2 linestep 1.0 nnz 5 nll 52.213327684 weight,grad norms 0.964601505177 0.713841572541\n",
      "iter 3 linestep 1.0 nnz 5 nll 52.0631340249 weight,grad norms 1.15022499658 0.52187511764\n",
      "iter 4 linestep 1.0 nnz 5 nll 51.9296867712 weight,grad norms 1.47215385419 0.456031249589\n",
      "iter 5 linestep 0.5 nnz 4 nll 51.915257173 weight,grad norms 1.48618595493 0.51970623396\n",
      "iter 6 linestep 1.0 nnz 4 nll 51.8979782713 weight,grad norms 1.58596567851 0.214046507048\n",
      "iter 7 linestep 1.0 nnz 4 nll 51.8939263397 weight,grad norms 1.53619163177 0.137203356727\n",
      "iter 8 linestep 1.0 nnz 4 nll 51.8878404999 weight,grad norms 1.5428238106 0.128197750632\n",
      "iter 9 linestep 1.0 nnz 3 nll 51.8675693189 weight,grad norms 1.61039397227 0.258987539685\n",
      "iter 10 linestep 0.5 nnz 3 nll 51.8672078693 weight,grad norms 1.54542810691 0.167567894488\n",
      "iter 11 linestep 0.5 nnz 3 nll 51.864627109 weight,grad norms 1.60153108647 0.305863132123\n",
      "iter 12 linestep 0.25 nnz 3 nll 51.8596814455 weight,grad norms 1.57442574864 0.0803556035899\n",
      "iter 13 linestep 0.5 nnz 3 nll 51.8590432883 weight,grad norms 1.61219786608 0.20931334182\n",
      "iter 14 linestep 0.25 nnz 3 nll 51.8567256235 weight,grad norms 1.59323442547 0.0587179191938\n",
      "iter 15 linestep 0.5 nnz 3 nll 51.8558215284 weight,grad norms 1.61932652776 0.125267022972\n",
      "iter 16 linestep 1.0 nnz 3 nll 51.85424845 weight,grad norms 1.62163185468 0.0324431781205\n",
      "iter 17 linestep 1.0 nnz 3 nll 51.8538655268 weight,grad norms 1.64071107106 0.0504468095772\n",
      "iter 18 linestep 1.0 nnz 3 nll 51.8538224349 weight,grad norms 1.64981177752 0.0453052520606\n",
      "iter 19 linestep 1.0 nnz 3 nll 51.8536577645 weight,grad norms 1.64787971563 0.0123090645768\n",
      "iter 20 linestep 1.0 nnz 3 nll 51.8536525618 weight,grad norms 1.65038464428 0.00962491197395\n",
      "iter 21 linestep 1.0 nnz 3 nll 51.8536441718 weight,grad norms 1.65021184641 0.0028541293054\n",
      "iter 22 linestep 1.0 nnz 3 nll 51.8536437562 weight,grad norms 1.65086769368 0.00213437588594\n",
      "iter 23 linestep 1.0 nnz 3 nll 51.8536432876 weight,grad norms 1.65105094864 0.000877366243098\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[-1.57734523,  0.        ,  0.        ,  0.        ],\n",
       "       [ 0.43409937, -0.22250616,  0.        ,  0.        ]])"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reload(l1gen)\n",
    "l1gen.train(X,Y,l1_penalty=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 loops, best of 3: 11.9 ms per loop\n"
     ]
    }
   ],
   "source": [
    "reload(l1gen)\n",
    "%timeit l1gen.train(X,Y,l1_penalty=1, verbose=False)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
l1gen.py
"""
"L1 SAGE": a BOW text model that assumes words are generated as log-linear
sparse additive combinations of their documents' metadata features.

x = word counts in a doc
y = doc's (non-textual, metadata) feature vector, a.k.a. covariates
Both x and y are observed.

p(x | y)   = (1/Z) exp(beta + y*gamma)
p(x_w | y) = (1/Z) exp(beta_w + y'gamma_{.,w})
where every gamma_{k,w} ~ Laplace(1/lambda)

beta (V): background log-probs (set to the empirical corpus MLE)
gamma (KxV): per-feature weights across the word vocabulary

If y is one-hot, this is a single-membership multiclass model, similar to
Multinomial NB except with a shared background component. If y is not one-hot,
it's more interesting: for example, multiple covariate-lexical effects can be
present at once.

Similar to Eisenstein's "SAGE" model, except it uses L1-regularized MAP
estimation instead of VB under a different hierarchical model. See also
Taddy's "inverse regression" model. This implementation uses OWL-QN, via
PyLibLBFGS, for the convex optimization during training.
"""
# initialized from katie's codebase: https://github.com/slanglab/docprop/blob/master/code/loglin/model.py
from __future__ import division, print_function
import time, json, glob, sys
import numpy as np
from scipy.misc import logsumexp
from lbfgs import LBFGS, LBFGSError, fmin_lbfgs
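
# A small worked instance of the model described in the module docstring: with
# K=2 doc features and a one-hot covariate vector y = [1, 0], the per-doc word
# distribution reduces to
#     p(w | y)  propto  exp(beta_w + gamma_{1,w})
# i.e. the background log-probs shifted by the first feature's row of gamma;
# with y = [1, 1], both rows of gamma are added to beta before normalizing.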

# objective/gradient callback in the form the pylbfgs package expects
def nll_and_grad(gamma, grad, beta, trainX, trainY):
    '''
    gamma: the "corruption" vectors (class-specific vectors), size KxV
    grad:  gradient w.r.t. gamma, also KxV. This will be zeroed and mutated in-place.
    beta:  background logprobs (weights), size V
    '''
    assert grad.shape == gamma.shape
    K, V = gamma.shape
    D = trainY.shape[0]
    assert D != V, "stupid limitation sorry"
    assert beta.shape == (V,)
    assert np.all(np.isfinite(beta)), "bad beta"
    assert np.all(np.isfinite(gamma)), "bad gamma"

    # pyliblbfgs wants 'grad' updated in-place.
    # need to zero it out before incrementally adding in-place.
    grad.fill(0.0)

    # precompute unnormalized LMs for every doc as one big dense matrix.
    # we could do minibatches instead to save memory.
    doc_lprobs = trainY.dot(gamma)
    assert doc_lprobs.shape == (D, V)
    doc_lprobs += beta  ## should broadcast to every row.

    # could use logsumexp(axis) to batch this?
    ll = 0.0
    for i in xrange(D):
        w_lprobs = doc_lprobs[i] - logsumexp(doc_lprobs[i])
        w_probs = np.exp(w_lprobs)
        doclen = np.sum(trainX[i])
        assert np.all(np.isfinite(w_lprobs))
        ll += trainX[i].dot(w_lprobs)
        # dl/dgamma_{j,w} = y_j (x_w - doclen*p(w | y))
        wordpred = doclen * w_probs
        residvec = trainX[i] - wordpred
        assert residvec.shape == (V,)
        grad += np.outer(trainY[i], residvec)
        # same as:
        #   for k in xrange(K): grad[k] += trainY[i,k] * residvec

    # switch to negative sign for minimization. note this is an IN-PLACE mutation.
    grad *= -1.0
    return -ll
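
# A minimal finite-difference sanity check for nll_and_grad, sketched here for
# illustration (the helper name, the random test point, and eps are assumptions,
# not part of the original module). It perturbs each entry of gamma and compares
# the numerical derivative of the returned NLL against the analytic gradient;
# e.g. _check_grad_fd(np.zeros(4), X, Y) with the toy X, Y from the notebook above.
def _check_grad_fd(beta, X, Y, eps=1e-6, seed=0):
    K, V = Y.shape[1], X.shape[1]
    gamma = 0.1 * np.random.RandomState(seed).randn(K, V)  # arbitrary test point
    grad = np.zeros((K, V))
    nll_and_grad(gamma, grad, beta, X, Y)   # fills `grad` in-place
    scratch = np.zeros((K, V))
    numgrad = np.zeros((K, V))
    for k in xrange(K):
        for v in xrange(V):
            gplus = gamma.copy()
            gplus[k, v] += eps
            gminus = gamma.copy()
            gminus[k, v] -= eps
            fplus = nll_and_grad(gplus, scratch, beta, X, Y)
            fminus = nll_and_grad(gminus, scratch, beta, X, Y)
            numgrad[k, v] = (fplus - fminus) / (2.0 * eps)
    # should be tiny (roughly eps-scale) if the analytic gradient is right
    return np.max(np.abs(numgrad - grad))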
def progress(weights, grad, fx, xnorm, gnorm, step, k, num_eval, *args):
    """https://github.com/larsmans/pylbfgs/blob/master/lbfgs/_lowlevel.pyx#L291"""
    print("iter {k} linestep {step} nnz {nnz} nll {fx} weight,grad norms {xnorm} {gnorm}".format(nnz=np.sum(weights != 0), **locals()))
    # return non-zero to stop the optimizer? or is this buggy?
    # if k > 5: return -1
    return 0
#########################

def train(trainX, trainY, l1_penalty=1e-5, epsilon=1e-3, delta=1e-3, verbose=True):
    """
    Inputs:
        trainX (D,V): for each doc, vector of word counts
        trainY (D,K): for each doc, vector of doc covariates, e.g. binary vector
    where
        D = number of docs
        V = number of wordtypes in vocabulary
        K = number of doc features

    Returns:
        The 'gamma' (K x V) weights matrix of sparse differences of words'
        log-probabilities relative to the background distribution. See the
        module's docs for details on the model.

    Model hyperparams:
        l1_penalty: the 'lambda' in the penalty term lambda*||gamma||_1.
            Make this higher, like 1 or 10, for more sparsity.

    Training hyperparams:
        epsilon: convergence threshold, relative grad norm
        delta: convergence threshold, relative change in objective
    """
    D1, V = trainX.shape
    D2, K = trainY.shape
    assert D1 == D2, "matrices should both have 1 row per doc"
    D = D1
    if verbose:
        print("{} docs, {} vocab, {} doc features".format(D, V, K))

    bb = LBFGS()
    bb.orthantwise_c = l1_penalty
    bb.linesearch = 'wolfe'
    # looks like it's an OR between the two convergence criteria:
    # https://github.com/larsmans/pylbfgs/blob/master/liblbfgs/lbfgs.c#L498
    bb.epsilon = epsilon
    bb.delta = delta

    corpus_wordcounts = trainX.sum(0)
    assert np.all(corpus_wordcounts > 0), "can't handle zero counts in input"
    corpus_wordprob = corpus_wordcounts / corpus_wordcounts.sum()
    beta = np.log(corpus_wordprob)

    weight_init = np.zeros((K, V))
    cb = progress if verbose else lambda *a: 0
    opt = bb.minimize(nll_and_grad, weight_init, cb, [beta, trainX, trainY])
    return opt
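
# Minimal usage sketch, mirroring the notebook above (the __main__ guard itself
# is illustrative and not part of the original module): a toy corpus of 3 docs,
# 4 word types, and 2 doc features, trained with l1_penalty=1.
if __name__ == "__main__":
    X = np.array([
        [10, 1, 1, 0],
        [3, 4, 1, 0],
        [100, 5, 1, 1],
    ])
    Y = np.array([
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    gamma = train(X, Y, l1_penalty=1)
    print(gamma)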