l1 generative implementation with liblbfgs (owl-qn)
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<module 'l1gen' from 'l1gen.py'>"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from __future__ import division, print_function\n",
    "import numpy as np\n",
    "import l1gen; reload(l1gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = np.array([\n",
    " [10,1,1,0],\n",
    " [3,4,1,0],\n",
    " [100,5,1,1],\n",
    " ])\n",
    "Y = np.array([\n",
    " [1,1],\n",
    " [1,0],\n",
    " [0,0],\n",
    " ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "176.0593838622261"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reload(l1gen)\n",
    "ga = np.zeros((2,4))\n",
    "gr = np.zeros((2,4))\n",
    "l1gen.nll_and_grad(ga, gr, np.zeros(4), X,Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3 docs, 4 vocab, 2 doc features\n",
      "iter 1 linestep 0.220519014352 nnz 3 nll 52.3496042347 weight,grad norms 1.0 1.4127133275\n",
      "iter 2 linestep 1.0 nnz 5 nll 52.213327684 weight,grad norms 0.964601505177 0.713841572541\n",
      "iter 3 linestep 1.0 nnz 5 nll 52.0631340249 weight,grad norms 1.15022499658 0.52187511764\n",
      "iter 4 linestep 1.0 nnz 5 nll 51.9296867712 weight,grad norms 1.47215385419 0.456031249589\n",
      "iter 5 linestep 0.5 nnz 4 nll 51.915257173 weight,grad norms 1.48618595493 0.51970623396\n",
      "iter 6 linestep 1.0 nnz 4 nll 51.8979782713 weight,grad norms 1.58596567851 0.214046507048\n",
      "iter 7 linestep 1.0 nnz 4 nll 51.8939263397 weight,grad norms 1.53619163177 0.137203356727\n",
      "iter 8 linestep 1.0 nnz 4 nll 51.8878404999 weight,grad norms 1.5428238106 0.128197750632\n",
      "iter 9 linestep 1.0 nnz 3 nll 51.8675693189 weight,grad norms 1.61039397227 0.258987539685\n",
      "iter 10 linestep 0.5 nnz 3 nll 51.8672078693 weight,grad norms 1.54542810691 0.167567894488\n",
      "iter 11 linestep 0.5 nnz 3 nll 51.864627109 weight,grad norms 1.60153108647 0.305863132123\n",
      "iter 12 linestep 0.25 nnz 3 nll 51.8596814455 weight,grad norms 1.57442574864 0.0803556035899\n",
      "iter 13 linestep 0.5 nnz 3 nll 51.8590432883 weight,grad norms 1.61219786608 0.20931334182\n",
      "iter 14 linestep 0.25 nnz 3 nll 51.8567256235 weight,grad norms 1.59323442547 0.0587179191938\n",
      "iter 15 linestep 0.5 nnz 3 nll 51.8558215284 weight,grad norms 1.61932652776 0.125267022972\n",
      "iter 16 linestep 1.0 nnz 3 nll 51.85424845 weight,grad norms 1.62163185468 0.0324431781205\n",
      "iter 17 linestep 1.0 nnz 3 nll 51.8538655268 weight,grad norms 1.64071107106 0.0504468095772\n",
      "iter 18 linestep 1.0 nnz 3 nll 51.8538224349 weight,grad norms 1.64981177752 0.0453052520606\n",
      "iter 19 linestep 1.0 nnz 3 nll 51.8536577645 weight,grad norms 1.64787971563 0.0123090645768\n",
      "iter 20 linestep 1.0 nnz 3 nll 51.8536525618 weight,grad norms 1.65038464428 0.00962491197395\n",
      "iter 21 linestep 1.0 nnz 3 nll 51.8536441718 weight,grad norms 1.65021184641 0.0028541293054\n",
      "iter 22 linestep 1.0 nnz 3 nll 51.8536437562 weight,grad norms 1.65086769368 0.00213437588594\n",
      "iter 23 linestep 1.0 nnz 3 nll 51.8536432876 weight,grad norms 1.65105094864 0.000877366243098\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[-1.57734523,  0.        ,  0.        ,  0.        ],\n",
       "       [ 0.43409937, -0.22250616,  0.        ,  0.        ]])"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reload(l1gen)\n",
    "l1gen.train(X,Y,l1_penalty=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 loops, best of 3: 11.9 ms per loop\n"
     ]
    }
   ],
   "source": [
    "reload(l1gen)\n",
    "%timeit l1gen.train(X,Y,l1_penalty=1, verbose=False)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
l1gen.py
"""
"L1 SAGE": a BOW text model that assumes words are generated as log-linear
sparse additive combinations of their documents' metadata features.

x = word counts in a doc
y = doc's (non-textual, metadata) feature vector, a.k.a. covariates
Both x and y are observed.

p(x | y)   = (1/Z) exp(beta + y*gamma)
p(x_w | y) = (1/Z) exp(beta_w + y'gamma_{.,w})
where every gamma_{k,w} ~ Laplace(1/lambda)

beta (V): background log-probs (set to the empirical corpus MLE)
gamma (KxV): per-feature weights across the word vocabulary

If y is one-hot, this is a single-membership multiclass model, similar to
Multinomial NB except with a shared background component. If y is not one-hot,
it's more interesting: for example, multiple covariate-lexical effects can be
present at once.

Similar to Eisenstein's "SAGE" model, except it uses L1-regularized MAP
estimation instead of VB under a different hierarchical model. See also
Taddy's "inverse regression" model. This implementation uses OWL-QN, via
PyLibLBFGS, for the convex optimization during training.
"""
# initialized from katie's codebase: https://github.com/slanglab/docprop/blob/master/code/loglin/model.py
from __future__ import division, print_function
import time, json, glob, sys
import numpy as np
from scipy.misc import logsumexp
from lbfgs import LBFGS, LBFGSError, fmin_lbfgs
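
# A small worked instance of the model described in the module docstring: with
# K=2 doc features and a one-hot covariate vector y = [1, 0], the per-doc word
# distribution reduces to
#     p(w | y)  propto  exp(beta_w + gamma_{1,w})
# i.e. the background log-probs shifted by the first feature's row of gamma;
# with y = [1, 1], both rows of gamma are added to beta before normalizing.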

# objective/gradient callback in the form the pylbfgs package expects
def nll_and_grad(gamma, grad, beta, trainX, trainY):
    '''
    gamma: the "corruption" vectors (class-specific vectors), size KxV
    grad:  gradient w.r.t. gamma, also KxV. This will be zeroed and mutated in-place.
    beta:  background logprobs (weights), size V
    '''
    assert grad.shape == gamma.shape
    K, V = gamma.shape
    D = trainY.shape[0]
    assert D != V, "stupid limitation sorry"
    assert beta.shape == (V,)
    assert np.all(np.isfinite(beta)), "bad beta"
    assert np.all(np.isfinite(gamma)), "bad gamma"

    # pyliblbfgs wants 'grad' updated in-place.
    # need to zero it out before incrementally adding in-place.
    grad.fill(0.0)

    # precompute unnormalized LMs for every doc as one big dense matrix.
    # we could do minibatches instead to save memory.
    doc_lprobs = trainY.dot(gamma)
    assert doc_lprobs.shape == (D, V)
    doc_lprobs += beta  ## should broadcast to every row.

    # could use logsumexp(axis) to batch this?
    ll = 0.0
    for i in xrange(D):
        w_lprobs = doc_lprobs[i] - logsumexp(doc_lprobs[i])
        w_probs = np.exp(w_lprobs)
        doclen = np.sum(trainX[i])
        assert np.all(np.isfinite(w_lprobs))
        ll += trainX[i].dot(w_lprobs)
        # dl/dgamma_{j,w} = y_j (x_w - doclen*p(w | y))
        wordpred = doclen * w_probs
        residvec = trainX[i] - wordpred
        assert residvec.shape == (V,)
        grad += np.outer(trainY[i], residvec)
        # same as:
        #   for k in xrange(K): grad[k] += trainY[i,k] * residvec

    # switch to negative sign for minimization. note this is an IN-PLACE mutation.
    grad *= -1.0
    return -ll
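
# A minimal finite-difference sanity check for nll_and_grad, sketched here for
# illustration (the helper name, the random test point, and eps are assumptions,
# not part of the original module). It perturbs each entry of gamma and compares
# the numerical derivative of the returned NLL against the analytic gradient;
# e.g. _check_grad_fd(np.zeros(4), X, Y) with the toy X, Y from the notebook above.
def _check_grad_fd(beta, X, Y, eps=1e-6, seed=0):
    K, V = Y.shape[1], X.shape[1]
    gamma = 0.1 * np.random.RandomState(seed).randn(K, V)  # arbitrary test point
    grad = np.zeros((K, V))
    nll_and_grad(gamma, grad, beta, X, Y)   # fills `grad` in-place
    scratch = np.zeros((K, V))
    numgrad = np.zeros((K, V))
    for k in xrange(K):
        for v in xrange(V):
            gplus = gamma.copy()
            gplus[k, v] += eps
            gminus = gamma.copy()
            gminus[k, v] -= eps
            fplus = nll_and_grad(gplus, scratch, beta, X, Y)
            fminus = nll_and_grad(gminus, scratch, beta, X, Y)
            numgrad[k, v] = (fplus - fminus) / (2.0 * eps)
    # should be tiny (roughly eps-scale) if the analytic gradient is right
    return np.max(np.abs(numgrad - grad))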
def progress(weights, grad, fx, xnorm, gnorm, step, k, num_eval, *args):
    """https://github.com/larsmans/pylbfgs/blob/master/lbfgs/_lowlevel.pyx#L291"""
    print("iter {k} linestep {step} nnz {nnz} nll {fx} weight,grad norms {xnorm} {gnorm}".format(nnz=np.sum(weights != 0), **locals()))
    # return non-zero to stop the optimizer? or is this buggy?
    # if k > 5: return -1
    return 0
#########################

def train(trainX, trainY, l1_penalty=1e-5, epsilon=1e-3, delta=1e-3, verbose=True):
    """
    Inputs:
        trainX (D,V): for each doc, vector of word counts
        trainY (D,K): for each doc, vector of doc covariates, e.g. binary vector
    where
        D = number of docs
        V = number of wordtypes in vocabulary
        K = number of doc features

    Returns:
        The 'gamma' (K x V) weights matrix of sparse differences of words'
        log-probabilities relative to the background distribution. See the
        module's docs for details on the model.

    Model hyperparams:
        l1_penalty: the 'lambda' in the penalty term lambda*||gamma||_1.
            Make this higher, like 1 or 10, for more sparsity.

    Training hyperparams:
        epsilon: convergence threshold, relative grad norm
        delta: convergence threshold, relative change in objective
    """
    D1, V = trainX.shape
    D2, K = trainY.shape
    assert D1 == D2, "matrices should both have 1 row per doc"
    D = D1
    if verbose:
        print("{} docs, {} vocab, {} doc features".format(D, V, K))

    bb = LBFGS()
    bb.orthantwise_c = l1_penalty
    bb.linesearch = 'wolfe'
    # looks like it's an OR between the two convergence criteria:
    # https://github.com/larsmans/pylbfgs/blob/master/liblbfgs/lbfgs.c#L498
    bb.epsilon = epsilon
    bb.delta = delta

    corpus_wordcounts = trainX.sum(0)
    assert np.all(corpus_wordcounts > 0), "can't handle zero counts in input"
    corpus_wordprob = corpus_wordcounts / corpus_wordcounts.sum()
    beta = np.log(corpus_wordprob)

    weight_init = np.zeros((K, V))
    cb = progress if verbose else lambda *a: 0
    opt = bb.minimize(nll_and_grad, weight_init, cb, [beta, trainX, trainY])
    return opt
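
# Minimal usage sketch, mirroring the notebook above (the __main__ guard itself
# is illustrative and not part of the original module): a toy corpus of 3 docs,
# 4 word types, and 2 doc features, trained with l1_penalty=1.
if __name__ == "__main__":
    X = np.array([
        [10, 1, 1, 0],
        [3, 4, 1, 0],
        [100, 5, 1, 1],
    ])
    Y = np.array([
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    gamma = train(X, Y, l1_penalty=1)
    print(gamma)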