l1 generative implementation with liblbfgs (owl-qn)
@brendano, last active April 30, 2019 01:56
Demo notebook (Python 2.7.12 kernel):

In [88]:
from __future__ import division, print_function
import l1gen; reload(l1gen)

Out[88]:
<module 'l1gen' from 'l1gen.py'>

In [89]:
X = np.array([
    [10,1,1,0],
    [3,4,1,0],
    [100,5,1,1],
    ])
Y = np.array([
    [1,1],
    [1,0],
    [0,0],
    ])

In [90]:
reload(l1gen)
ga = np.zeros((2,4))
gr = np.zeros((2,4))
l1gen.nll_and_grad(ga, gr, np.zeros(4), X,Y)

Out[90]:
176.0593838622261

In [91]:
reload(l1gen)
l1gen.train(X,Y,l1_penalty=1)

3 docs, 4 vocab, 2 doc features
iter 1 linestep 0.220519014352 nnz 3 nll 52.3496042347 weight,grad norms 1.0 1.4127133275
iter 2 linestep 1.0 nnz 5 nll 52.213327684 weight,grad norms 0.964601505177 0.713841572541
iter 3 linestep 1.0 nnz 5 nll 52.0631340249 weight,grad norms 1.15022499658 0.52187511764
iter 4 linestep 1.0 nnz 5 nll 51.9296867712 weight,grad norms 1.47215385419 0.456031249589
iter 5 linestep 0.5 nnz 4 nll 51.915257173 weight,grad norms 1.48618595493 0.51970623396
iter 6 linestep 1.0 nnz 4 nll 51.8979782713 weight,grad norms 1.58596567851 0.214046507048
iter 7 linestep 1.0 nnz 4 nll 51.8939263397 weight,grad norms 1.53619163177 0.137203356727
iter 8 linestep 1.0 nnz 4 nll 51.8878404999 weight,grad norms 1.5428238106 0.128197750632
iter 9 linestep 1.0 nnz 3 nll 51.8675693189 weight,grad norms 1.61039397227 0.258987539685
iter 10 linestep 0.5 nnz 3 nll 51.8672078693 weight,grad norms 1.54542810691 0.167567894488
iter 11 linestep 0.5 nnz 3 nll 51.864627109 weight,grad norms 1.60153108647 0.305863132123
iter 12 linestep 0.25 nnz 3 nll 51.8596814455 weight,grad norms 1.57442574864 0.0803556035899
iter 13 linestep 0.5 nnz 3 nll 51.8590432883 weight,grad norms 1.61219786608 0.20931334182
iter 14 linestep 0.25 nnz 3 nll 51.8567256235 weight,grad norms 1.59323442547 0.0587179191938
iter 15 linestep 0.5 nnz 3 nll 51.8558215284 weight,grad norms 1.61932652776 0.125267022972
iter 16 linestep 1.0 nnz 3 nll 51.85424845 weight,grad norms 1.62163185468 0.0324431781205
iter 17 linestep 1.0 nnz 3 nll 51.8538655268 weight,grad norms 1.64071107106 0.0504468095772
iter 18 linestep 1.0 nnz 3 nll 51.8538224349 weight,grad norms 1.64981177752 0.0453052520606
iter 19 linestep 1.0 nnz 3 nll 51.8536577645 weight,grad norms 1.64787971563 0.0123090645768
iter 20 linestep 1.0 nnz 3 nll 51.8536525618 weight,grad norms 1.65038464428 0.00962491197395
iter 21 linestep 1.0 nnz 3 nll 51.8536441718 weight,grad norms 1.65021184641 0.0028541293054
iter 22 linestep 1.0 nnz 3 nll 51.8536437562 weight,grad norms 1.65086769368 0.00213437588594
iter 23 linestep 1.0 nnz 3 nll 51.8536432876 weight,grad norms 1.65105094864 0.000877366243098

Out[91]:
array([[-1.57734523,  0.        ,  0.        ,  0.        ],
       [ 0.43409937, -0.22250616,  0.        ,  0.        ]])

In [92]:
reload(l1gen)
%timeit l1gen.train(X,Y,l1_penalty=1, verbose=False)

100 loops, best of 3: 11.9 ms per loop
"""
"L1 SAGE": a BOW text model that assumes words are generated as log-linear
sparse additive combinations of their documents' metadata features.
x = word counts in a doc
y = doc's (non-textual, metadata) feature a.k.a. covariate vector
Both x and y are observed.
p(x | y) = (1/Z) exp(beta + y*gamma)
p(x_w | y) = (1/Z) exp(beta_w + y'gamma_{.,w})
where every gamma_{k,w} ~ Laplace(1/lambda)
beta (V): background log-probs (set to empirical corpus MLE)
gamma (KxV): per-feature weights across word vocabulary
If y is one-hot, it's a single-membership multiclass model, similar to
Multinomial NB except with a shared background component. If y is not one-hot,
it's more interesting: for example, multiple covariate-lexical effects can be
present at once.
Similar to Eisenstein's "SAGE" model, except uses L1 regularization or MAP
estimation, instead of VB under a different hierarchical model. See also
Taddy's "inverse regression" model. This implmentation uses OWL-QN, via
PyLibLBFGS, for the convex optimization training.
"""
# initialized from katie's codebase: https://github.com/slanglab/docprop/blob/master/code/loglin/model.py
from __future__ import division, print_function
import time, json, glob, sys
import numpy as np
from scipy.misc import logsumexp
from lbfgs import LBFGS, LBFGSError, fmin_lbfgs
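
# A minimal worked example of the model above (an illustrative sketch only;
# this helper is not part of the original module and is not used elsewhere):
# a doc's word distribution is softmax(beta + y.dot(gamma)), i.e. the
# background log-probs shifted by the doc's covariate effects.
def _example_doc_distribution():
    """Toy numbers: V=3 word types, K=1 covariate. Returns p(w | y), which sums to 1."""
    beta = np.log(np.array([0.5, 0.3, 0.2]))   # background log-probs (corpus MLE)
    gamma = np.array([[1.0, 0.0, -1.0]])       # K x V sparse covariate effects
    y = np.array([1.0])                        # this doc's covariate vector
    logits = beta + y.dot(gamma)               # unnormalized log-probs, shape (V,)
    return np.exp(logits - logsumexp(logits))  # normalize: p(w | y)
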
# objective callback for the pylbfgs package: fills `grad` in-place and returns the NLL
def nll_and_grad(gamma, grad, beta, trainX, trainY):
    '''
    gamma:: the "corruption" vectors (class-specific vectors), size KxV
    grad:: gradient w.r.t. gamma, also KxV. This will be zeroed and mutated in-place.
    beta:: background log-probs (weights)
    '''
    assert grad.shape == gamma.shape
    K, V = gamma.shape
    D = trainY.shape[0]
    assert D != V, "stupid limitation sorry"
    assert beta.shape == (V,)
    assert np.all(np.isfinite(beta)), "bad beta"
    assert np.all(np.isfinite(gamma)), "bad gamma"

    # pyliblbfgs wants 'grad' updated in-place.
    # need to zero it out before incrementally adding in-place.
    grad.fill(0.0)

    # precompute unnormalized LMs for every doc as one big dense matrix.
    # we could do minibatches instead to save memory.
    doc_lprobs = trainY.dot(gamma)
    assert doc_lprobs.shape == (D, V)
    doc_lprobs += beta  ## should broadcast to every row.

    # could use logsumexp(axis) to batch this?
    ll = 0.0
    for i in xrange(D):
        w_lprobs = doc_lprobs[i] - logsumexp(doc_lprobs[i])
        w_probs = np.exp(w_lprobs)
        doclen = np.sum(trainX[i])
        assert np.all(np.isfinite(w_lprobs))
        ll += trainX[i].dot(w_lprobs)
        # dl/dgamma_{k,w} = y_k (x_w - doclen*p(w | y))
        wordpred = doclen * w_probs
        residvec = trainX[i] - wordpred
        assert residvec.shape == (V,)
        grad += np.outer(trainY[i], residvec)
        # same as:
        # for k in xrange(K): grad[k] += trainY[i,k] * residvec

    # switch to negative for minimization. note this is IN-PLACE mutation
    grad *= -1.0
    return -ll
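
# A finite-difference check of the analytic gradient above (an illustrative
# sketch, not part of the original module): perturb each gamma_{k,w} and compare
# the central-difference slope of the NLL against what nll_and_grad fills into
# `grad`. The returned max absolute difference should be tiny (around 1e-5 or smaller).
def _finite_difference_check(beta, X, Y, eps=1e-6):
    K, V = Y.shape[1], X.shape[1]
    gamma = 0.1 * np.random.RandomState(0).randn(K, V)   # arbitrary test point
    grad = np.zeros((K, V))
    nll_and_grad(gamma, grad, beta, X, Y)                 # analytic gradient
    numgrad = np.zeros((K, V))
    scratch = np.zeros((K, V))                            # reused; nll_and_grad zeros it each call
    for k in xrange(K):
        for w in xrange(V):
            gp = gamma.copy(); gp[k, w] += eps
            gm = gamma.copy(); gm[k, w] -= eps
            fp = nll_and_grad(gp, scratch, beta, X, Y)
            fm = nll_and_grad(gm, scratch, beta, X, Y)
            numgrad[k, w] = (fp - fm) / (2 * eps)
    return np.max(np.abs(grad - numgrad))
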
def progress(weights, grad, fx, xnorm, gnorm, step, k, num_eval, *args):
    """https://github.com/larsmans/pylbfgs/blob/master/lbfgs/_lowlevel.pyx#L291"""
    print("iter {k} linestep {step} nnz {nnz} nll {fx} weight,grad norms {xnorm} {gnorm}".format(nnz=np.sum(weights != 0), **locals()))
    # return non-zero to stop the optimizer? or is this buggy?
    # if k>5: return -1
    return 0
#########################
def train(trainX, trainY, l1_penalty=1e-5, epsilon=1e-3, delta=1e-3, verbose=True):
    """
    Inputs:
        trainX (D,V): for each doc, vector of word counts
        trainY (D,K): for each doc, vector of doc covariates, e.g. a binary vector
    where
        D = number of docs
        V = number of wordtypes in the vocabulary
        K = number of doc features
    Returns:
        The 'gamma' (K x V) weights matrix of sparse differences of words'
        log-probabilities relative to the background distribution. See the
        module's docs for details on the model.
    Model hyperparams:
        l1_penalty: the 'lambda' in the penalty term lambda*||gamma||_1.
            Make this higher, like 1 or 10, for more sparsity.
    Training hyperparams:
        epsilon: convergence threshold, relative grad norm
        delta: convergence threshold, relative change in objective
    """
    D1, V = trainX.shape
    D2, K = trainY.shape
    assert D1 == D2, "matrices should both have 1 row per doc"
    D = D1
    if verbose:
        print("{} docs, {} vocab, {} doc features".format(D, V, K))

    bb = LBFGS()
    bb.orthantwise_c = l1_penalty
    bb.linesearch = 'wolfe'
    # looks like it's an OR between the two convergence criteria.
    # https://github.com/larsmans/pylbfgs/blob/master/liblbfgs/lbfgs.c#L498
    bb.epsilon = epsilon
    bb.delta = delta

    corpus_wordcounts = trainX.sum(0)
    assert np.all(corpus_wordcounts > 0), "can't handle zero counts in input"
    corpus_wordprob = corpus_wordcounts / corpus_wordcounts.sum()
    beta = np.log(corpus_wordprob)

    weight_init = np.zeros((K, V))
    cb = progress if verbose else lambda *a: 0
    opt = bb.minimize(nll_and_grad, weight_init, cb, [beta, trainX, trainY])
    return opt
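
# Usage sketch mirroring the notebook above (illustrative; this __main__ block is
# not part of the original module): 3 docs, 4 word types, 2 doc covariates.
# A higher l1_penalty (e.g. 1 or 10) drives more entries of gamma to exactly zero.
if __name__ == '__main__':
    X = np.array([
        [10, 1, 1, 0],
        [3, 4, 1, 0],
        [100, 5, 1, 1],
        ])
    Y = np.array([
        [1, 1],
        [1, 0],
        [0, 0],
        ])
    gamma = train(X, Y, l1_penalty=1)
    print(gamma)
    print("gradient check, max abs diff:",
          _finite_difference_check(np.log(X.sum(0) / X.sum()), X, Y))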