Created March 24, 2014 20:17
Preliminary attempt at sparse learning in creg2. Non-sparse counterpart code is included for comparison.
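Both driver scripts below take the same three tab-separated inputs on the command line: a file of labels with their JSON feature maps, a file of training instances (id, JSON input features, JSON neighborhood of candidate labels), and a file of gold labels. A sketch of an invocation, with hypothetical file names (the gist does not name the scripts):

python driver_dense.py label_feats.tsv train_instances.tsv gold_labels.tsv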
iologreg.py (dense implementation):
import numpy as np
import random
import math
import sys

INFINITY = float('inf')

def logadd(a, b):
    """
    Compute log(exp(a) + exp(b)) stably, without overflowing.
    """
    if a == -INFINITY:
        return b
    if b == -INFINITY:
        return a
    if b < a:  # b - a < 0
        return a + math.log1p(math.exp(b - a))
    else:      # a - b < 0
        return b + math.log1p(math.exp(a - b))

class IOLogisticRegression:
    """
    Logistic regression with features on both inputs and outputs (labels).
    Minimize regularized log-loss:
        L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
        p(y|x, w) = exp(w[y].x) / (sum_y' exp(w[y'].x))
    Parameters
    ----------
    l1: float, default=0
        L1 regularization strength (not yet applied)
    l2: float, default=0
        L2 regularization strength (not yet applied)
    """
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = l1
        self.l2 = l2

    def gradient(self, x, n, y, y_feats, W, G):
        """Accumulate the gradient for one instance into G; return the instance's log-loss."""
        z = -INFINITY
        log_probs = np.zeros(self.num_labels)
        xw = x.dot(W)
        found = False
        for yi in n:
            if yi == y: found = True
            u = xw.dot(y_feats[yi])
            log_probs[yi] = u
            z = logadd(z, u)
        if not found:
            print '[ERROR] for training instance', x, 'gold label', y, 'not found in neighborhood', n
            raise Exception('gold label missing from neighborhood')
        loss = -(log_probs[y] - z)
        for yi in n:
            # p(yi|x) - 1{yi == y}, times the outer product of input and label features
            delta = math.exp(log_probs[yi] - z) - (yi == y)
            G += np.outer(x, y_feats[yi]) * delta
        return loss

    def fit(self, infeats, outfeats, X, N, Y, y_feats, num_labels, iterations=300, minibatch_size=1000, eta=1.0):
        minibatch_size = min(minibatch_size, X.shape[0])
        self.num_labels = num_labels
        self.y_feats = y_feats
        self.W = np.zeros(shape=(infeats, outfeats))
        G = np.zeros(shape=(infeats, outfeats))
        # AdaGrad accumulator; the tiny floor avoids dividing by zero on the first update
        H = np.ones(shape=(infeats, outfeats)) * 1e-300
        for i in range(iterations):
            sys.stderr.write('Iteration: %d\n' % i)
            G.fill(0)
            loss = 0
            for s in random.sample(range(X.shape[0]), minibatch_size):
                loss += self.gradient(X[s], N[s], Y[s], y_feats, self.W, G)
            # TODO: add the l2 penalty to the loss and gradient
            sys.stderr.write('  Loss = %f\n' % loss)
            G /= minibatch_size
            # AdaGrad update: per-parameter step sizes eta / sqrt(H)
            H += np.square(G)
            self.W -= np.divide(G, np.sqrt(H)) * eta
        return self

    def predict_(self, x, n, probs):
        """Fill probs with the posterior over the neighborhood n for one instance x."""
        probs.fill(0.0)
        z = -INFINITY
        xw = x.dot(self.W)
        for y in n:
            u = xw.dot(self.y_feats[y])
            probs[y] = u
            z = logadd(z, u)
        for y in n:
            probs[y] = math.exp(probs[y] - z)

    def predict(self, X, N):
        return self.predict_proba(X, N).argmax(axis=1)

    def predict_proba(self, X, N):
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        for (x, n, p) in zip(X, N, post):
            self.predict_(x, n, p)
        return post
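The logadd helper duplicates numpy's np.logaddexp, which handles the same -INFINITY base cases; a quick sanity check one could run, assuming the file above is saved as iologreg.py (the name the driver below imports):

import numpy as np
from iologreg import logadd
for a, b in [(0.0, 0.0), (-1000.0, 0.0), (float('-inf'), -2.5)]:
    assert abs(logadd(a, b) - np.logaddexp(a, b)) < 1e-12
print 'logadd agrees with np.logaddexp'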
iologreg_sparse.py (sparse counterpart):
import numpy as np
import scipy.sparse
import random
import math
import sys

INFINITY = float('inf')

def logadd(a, b):
    """
    Compute log(exp(a) + exp(b)) stably, without overflowing.
    """
    if a == -INFINITY:
        return b
    if b == -INFINITY:
        return a
    if b < a:  # b - a < 0
        return a + math.log1p(math.exp(b - a))
    else:      # a - b < 0
        return b + math.log1p(math.exp(a - b))

class IOLogisticRegression:
    """
    Sparse counterpart of the dense IOLogisticRegression above: the rows of X
    and the label feature vectors are scipy.sparse matrices.
    Minimize regularized log-loss:
        L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
        p(y|x, w) = exp(w[y].x) / (sum_y' exp(w[y'].x))
    Parameters
    ----------
    l1: float, default=0
        L1 regularization strength (not yet applied)
    l2: float, default=0
        L2 regularization strength (not yet applied)
    """
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = l1
        self.l2 = l2

    def gradient(self, x_, n, y, y_feats, W, infeats, outfeats):
        """Return (log-loss, sparse gradient) for one instance; x_ and the rows of y_feats are sparse."""
        z = -INFINITY
        log_probs = np.zeros(self.num_labels)
        xw = x_.dot(W)
        found = False
        for yi in n:
            if yi == y: found = True
            u = (xw * y_feats[yi].T)[0, 0]  # 1x1 product -> scalar score
            log_probs[yi] = u
            z = logadd(z, u)
        if not found:
            print '[ERROR] for training instance', x_, 'gold label', y, 'not found in neighborhood', n
            raise Exception('gold label missing from neighborhood')
        loss = -(log_probs[y] - z)
        G = scipy.sparse.csr_matrix((infeats, outfeats))
        for yi in n:
            # p(yi|x) - 1{yi == y}, times the sparse outer product of input and label features
            delta = math.exp(log_probs[yi] - z) - (yi == y)
            G = G + (x_.T * y_feats[yi]) * delta
        return loss, G

    def fit(self, infeats, outfeats, X_, N, Y, y_feats, num_labels, iterations=300, minibatch_size=1000, eta=1.0):
        minibatch_size = min(minibatch_size, X_.shape[0])
        self.num_labels = num_labels
        self.y_feats = y_feats
        self.W = np.zeros(shape=(infeats, outfeats))
        # AdaGrad accumulator, kept dense; the tiny floor avoids dividing by zero on the first update
        H = np.ones(shape=(infeats, outfeats)) * 1e-300
        for i in range(iterations):
            sys.stderr.write('Iteration: %d\n' % i)
            G = scipy.sparse.csr_matrix((infeats, outfeats))  # reset the gradient each iteration
            loss = 0
            for s in random.sample(range(X_.shape[0]), minibatch_size):
                thisloss, thisG = self.gradient(X_[s], N[s], Y[s], y_feats, self.W, infeats, outfeats)
                loss += thisloss
                G = G + thisG
            # TODO: add the l2 penalty to the loss and gradient
            sys.stderr.write('  Loss = %f\n' % loss)
            G = G / minibatch_size
            Gsq = G.copy()
            Gsq.data **= 2  # square only the stored nonzeros
            H += Gsq.toarray()
            # AdaGrad update; densifying G here is cheap relative to the dense W itself
            self.W -= np.divide(G.toarray(), np.sqrt(H)) * eta
        return self

    def predict_(self, x, n, probs):
        """Fill probs with the posterior over the neighborhood n for one sparse instance x."""
        probs.fill(0.0)
        z = -INFINITY
        xw = x.dot(self.W)
        for y in n:
            u = (xw * self.y_feats[y].T)[0, 0]
            probs[y] = u
            z = logadd(z, u)
        for y in n:
            probs[y] = math.exp(probs[y] - z)

    def predict(self, X, N):
        return self.predict_proba(X, N).argmax(axis=1)

    def predict_proba(self, X, N):
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        for (x, n, p) in zip(X, N, post):
            self.predict_(x, n, p)
        return post
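The elementwise square in the sparse fit leans on the fact that a csr_matrix keeps its nonzeros in a flat .data array, so a unary elementwise operation can be applied in place without materializing the zeros. A standalone illustration of the trick:

import numpy as np
import scipy.sparse

A = scipy.sparse.csr_matrix(np.array([[0.0, 2.0], [3.0, 0.0]]))
Asq = A.copy()
Asq.data **= 2           # squares only the stored nonzeros
print Asq.toarray()      # [[ 0.  4.]
                         #  [ 9.  0.]]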
Driver for the dense version (the gist does not show the script's filename):
import sys
import json
from sklearn import feature_extraction
from iologreg import IOLogisticRegression

features = []
labels = {}
invlabels = {}
# read labels and their associated features
for line in open(sys.argv[1]):
    (label, f) = line.strip().split('\t')
    invlabels[len(labels)] = label
    labels[label] = len(labels)
    features.append(json.loads(f))
label_dict = feature_extraction.DictVectorizer()
label_features = label_dict.fit_transform(features).toarray()
sys.stderr.write('        LABELS: %s\n' % ' '.join(labels.keys()))
sys.stderr.write('LABEL-FEATURES: %s\n' % ' '.join(label_dict.get_feature_names()))
out_dim = len(label_dict.get_feature_names())

ids = {}
X = []
N = []
# read training instances and their neighborhoods
for line in open(sys.argv[2]):
    (id, xfeats, n) = line.strip().split('\t')
    ids[id] = len(ids)
    X.append(json.loads(xfeats))
    neighborhood = json.loads(n)['N']
    if len(neighborhood) == 0:
        sys.stderr.write('[ERROR] empty neighborhood in line:\n%s' % line)
        sys.exit(1)
    if len(neighborhood) == 1:
        sys.stderr.write('[WARNING] neighborhood for id="%s" is singleton: %s\n' % (id, str(neighborhood)))
    N.append([labels[x] for x in neighborhood])
X_dict = feature_extraction.DictVectorizer()
X = X_dict.fit_transform(X).toarray()
sys.stderr.write('       rows(X): %d\n' % X.shape[0])
sys.stderr.write('INPUT-FEATURES: %s\n' % ' '.join(X_dict.get_feature_names()))
in_dim = len(X_dict.get_feature_names())

# read gold labels
Y = [0 for x in xrange(X.shape[0])]
for line in open(sys.argv[3]):
    (id, y) = line.strip().split('\t')
    Y[ids[id]] = labels[y]

assert X.shape[0] == len(N)
assert len(Y) == X.shape[0]

model = IOLogisticRegression()
model.fit(in_dim, out_dim, X, N, Y, label_features, len(labels), iterations=1000, minibatch_size=10)
D = model.predict_proba(X, N)
for row in D:
    dist = {}
    for i in range(len(row)):
        if row[i] > 0.0: dist[invlabels[i]] = row[i]
    print dist
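For reference, the input format implied by the parsing above, with made-up labels and features (the gist ships no data; fields are tab-separated, and the file names are the hypothetical ones from the invocation sketch near the top):

label_feats.tsv (argv[1]):
NOUN	{"is_noun": 1}
VERB	{"is_verb": 1}

train_instances.tsv (argv[2]):
ex1	{"suffix=ing": 1}	{"N": ["NOUN", "VERB"]}

gold_labels.tsv (argv[3]):
ex1	NOUN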
Driver for the sparse version:
import sys
import json
from sklearn import feature_extraction
from iologreg_sparse import IOLogisticRegression

features = []
labels = {}
invlabels = {}
# read labels and their associated features
for line in open(sys.argv[1]):
    (label, f) = line.strip().split('\t')
    invlabels[len(labels)] = label
    labels[label] = len(labels)
    features.append(json.loads(f))
label_dict = feature_extraction.DictVectorizer()
label_features = label_dict.fit_transform(features).tocsr()  # stay sparse
sys.stderr.write('        LABELS: %s\n' % ' '.join(labels.keys()))
sys.stderr.write('LABEL-FEATURES: %s\n' % ' '.join(label_dict.get_feature_names()))
out_dim = len(label_dict.get_feature_names())

ids = {}
X = []
N = []
# read training instances and their neighborhoods
for line in open(sys.argv[2]):
    (id, xfeats, n) = line.strip().split('\t')
    ids[id] = len(ids)
    X.append(json.loads(xfeats))
    neighborhood = json.loads(n)['N']
    if len(neighborhood) == 0:
        sys.stderr.write('[ERROR] empty neighborhood in line:\n%s' % line)
        sys.exit(1)
    if len(neighborhood) == 1:
        sys.stderr.write('[WARNING] neighborhood for id="%s" is singleton: %s\n' % (id, str(neighborhood)))
    N.append([labels[x] for x in neighborhood])
X_dict = feature_extraction.DictVectorizer()
X_ = X_dict.fit_transform(X).tocsr()  # stay sparse
sys.stderr.write('       rows(X): %d\n' % X_.shape[0])
sys.stderr.write('INPUT-FEATURES: %s\n' % ' '.join(X_dict.get_feature_names()))
in_dim = len(X_dict.get_feature_names())

# read gold labels
Y = [0 for x in xrange(X_.shape[0])]
for line in open(sys.argv[3]):
    (id, y) = line.strip().split('\t')
    Y[ids[id]] = labels[y]

assert X_.shape[0] == len(N)
assert len(Y) == X_.shape[0]

model = IOLogisticRegression()
model.fit(in_dim, out_dim, X_, N, Y, label_features, len(labels), iterations=1000, minibatch_size=10)
D = model.predict_proba(X_, N)
for row in D:
    dist = {}
    for i in range(len(row)):
        if row[i] > 0.0: dist[invlabels[i]] = row[i]
    print dist
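Run on the hypothetical data above, either driver prints one posterior per training instance, restricted to that instance's neighborhood, so the output would look something like:

{'NOUN': 0.73, 'VERB': 0.27}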