Created
May 27, 2013 05:30
-
-
Save johnsmith17th/5655323 to your computer and use it in GitHub Desktop.
Text Classifiers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
@author: John Smith | |
''' | |
class Classifier(object): | |
''' base classifier ''' | |
def __init__(self, **kwargs): | |
pass | |
def loadm(self, data): | |
''' load model from dictionary object ''' | |
pass | |
def savem(self): | |
''' get dictionary object of model ''' | |
return { } | |
def load(self, filename): | |
''' load model from file ''' | |
import json | |
finp = open(filename, 'r') | |
text = finp.read() | |
finp.close() | |
data = json.loads(text) | |
self.loadm(data) | |
pass | |
def save(self, filename): | |
''' save model to file ''' | |
import json | |
data = self.savem() | |
text = json.dumps(data, ensure_ascii = False, indent = 4) | |
fout = open(filename, 'w') | |
fout.write(text) | |
fout.close() | |
pass | |
def model(self, X, Y): | |
''' build classifier model ''' | |
''' X: the feature weight array of samples ''' | |
''' Y: the label array of samples ''' | |
pass | |
def predict(self, x): | |
''' to predict the label of sample ''' | |
pass | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
@author: John Smith | |
''' | |
from base import Classifier | |
class NaiveBayes(Classifier): | |
''' base naive bayes classifier ''' | |
@staticmethod | |
def sampling(terms, docs): | |
X = [] # features | |
Y = [] # labels | |
for d in docs: | |
d.prepare() | |
X.append([d.seglist.count(t) for t in terms]) | |
Y.append(1 if d.cat == '-' else -1) | |
return (X, Y) | |
def savem(self): | |
data = {'probc': self.probc, | |
'probf': self.probf, | |
'cats': self.cats, | |
'numx': self.numx, | |
'numf': self.numf, | |
'numc': self.numc, | |
} | |
return data | |
def loadm(self, data): | |
self.probc = data['probc'] | |
self.probf = data['probf'] | |
self.cats = data['cats'] | |
self.numx = data['numx'] | |
self.numf = data['numf'] | |
self.numc = data['numc'] | |
def model(self, X, Y): | |
Classifier.model(self, X, Y) | |
self.probc = [] | |
self.probf = [] | |
self.cats = list(set(Y)) | |
self.numx = len(X) # number of sample | |
self.numf = len(X[0]) # number of feature | |
self.numc = len(self.cats) # number of category | |
class BernoulliBayes(NaiveBayes): | |
''' bernoulli naive bayes classifier ''' | |
def model(self, X, Y): | |
NaiveBayes.model(self, X, Y) | |
sit = range(self.numx) | |
cit = range(self.numc) | |
fit = range(self.numf) | |
sta1 = lambda c: sum(1 for y in Y if y == c) # N(cat) | |
sta2 = lambda f, c: sum(1 for i in sit if Y[i] == c and X[i][f] > 0) # N(feature, cat) | |
for c in cit: | |
ci = self.cats[c] | |
nc = sta1(ci) # N(c) | |
pc = float(nc) / float(self.numx) # P(c) | |
pf = [0] * self.numf | |
for f in fit: | |
fn = sta2(f, ci) # N(f, c) | |
pf[f] = float(1 + fn) / float(self.numc + nc) | |
self.probc.append(pc) | |
self.probf.append(pf) | |
def predict(self, x): | |
p = [0] * self.numc | |
cit = range(self.numc) | |
fit = range(self.numf) | |
for c in cit: | |
pi = 1 | |
for f in fit: | |
if x[f] > 0: pi *= self.probf[c][f] | |
else: pi *= (1.0 - self.probf[c][f]) | |
p[c] = pi * self.probc[c] | |
return self.cats[p.index(max(p))] | |
class MultinomialBayes(NaiveBayes): | |
''' multinomial naive bayes classifier ''' | |
def model(self, X, Y): | |
NaiveBayes.model(self, X, Y) | |
sit = range(self.numx) | |
cit = range(self.numc) | |
fit = range(self.numf) | |
sta1 = lambda c: sum(1 for y in Y if y == c) # N(cat) | |
sta2 = lambda f, c: sum(X[i][f] for i in sit if Y[i] == c) # F(feature, cat) | |
for c in cit: | |
ci = self.cats[c] | |
nc = sta1(ci) # N(c) | |
pc = float(nc) / float(self.numx) # P(c) | |
pf = [0] * self.numf | |
for f in fit: | |
pf[f] = sta2(f, ci) | |
ss = sum(pf) + self.numx | |
for f in fit: | |
pf[f] = float(1 + pf[f]) / float(ss) | |
self.probc.append(pc) | |
self.probf.append(pf) | |
def predict(self, x): | |
p = [0] * self.numc | |
cit = range(self.numc) | |
fit = range(self.numf) | |
fact = lambda x: x and x * fact(x - 1) or 1 | |
for c in cit: | |
pi = 1 | |
for f in fit: | |
nf = x[f] | |
pi *= float(self.probf[c][f] ** nf) / float(fact(nf)) | |
p[c] = pi * self.probc[c] | |
return self.cats[p.index(max(p))] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
@author: Smith | |
''' | |
from bayes import NaiveBayes | |
from nnetwork import NeuralNetwork | |
from utils import udoc | |
import random | |
class BayesNetwork(object): | |
def __init__(self, features): | |
tar = {+1: [+1], -1:[-1]} | |
self.features = features | |
self.bayes = {} | |
for k in self.features: | |
self.bayes[k] = NaiveBayes(len(self.features[k]), [+1, -1]) | |
self.network = NeuralNetwork(tar, len(self.bayes), len(self.bayes), 1, 0.01) | |
pass | |
def model(self, docs): | |
sf = {} | |
for k in self.features: | |
s = udoc.sampling(self.features[k], docs, {'+': +1, '-': -1}) | |
sf[k] = s | |
self.bayes[k].model(s) | |
for i in range(len(docs)): | |
d = docs[i] | |
c = +1 if d.cat == '+' else -1 | |
sx = [] | |
fx = [] | |
for k in self.features: | |
s = sf[k][i] | |
fx.append(self.bayes[k].predict(s)) | |
sx.append((fx, c)) | |
self.network.model(sx) | |
def predict(self, doc): | |
sx = [] | |
c = +1 if doc.cat == '+' else -1 | |
for k in self.features: | |
s = udoc.sampling_one(self.features[k], doc, {'+': +1, '-': -1}) | |
sx.append(self.bayes[k].predict(s)) | |
return self.network.predict((sx, c)) | |
def validate(self, docs, fold = 10): | |
p, r, a, f = [0] * 4 | |
s = list(docs) | |
random.shuffle(s) | |
for k in range(fold): | |
trainset = [x for i, x in enumerate(s) if i % fold != k] | |
testset = [x for i, x in enumerate(s) if i % fold == k] | |
(pi, ri, ai, fi) = self.test(trainset, testset) | |
print (pi, ri, ai, fi) | |
p += pi; r += ri; a += ai; f += fi; | |
m = float(fold) | |
print (p / m, r / m, a / m, f / m) | |
return (p / m, r / m, a / m, f / m) | |
def test(self, trainset, testset): | |
self.model(trainset) | |
pc = [(self.predict(x), +1 if x.cat == '+' else -1) for x in testset] | |
mp, np, nr, ma = [0] * 4 | |
for x in pc: | |
if x[0] == -1 and x[1] == -1: mp += 1 | |
if x[0] == -1: np += 1 | |
if x[1] == -1: nr += 1 | |
if x[0] == x[1]: ma += 1 | |
p = (float(mp) / float(np)) if np > 0 else 0 # precision | |
r = (float(mp) / float(nr)) if nr > 0 else 0 # recall | |
a = float(ma) / float(len(pc)) # accuracy | |
f = self.f_mesure(p, r, 1.0) # f-mesure | |
return (p, r, a, f) | |
def f_mesure(self, p, r, beta = 1): | |
b = beta ** 2 | |
try: f = float((b + 1) * p * r) / float(b * p + r) | |
except: f = 0 | |
return f |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
@author: John Smith | |
''' | |
from base import Classifier | |
class AdaBoost(Classifier): | |
def __init__(self, classifier, T): | |
self.classifier = classifier | |
self.T = T | |
pass | |
def model(self, X, Y): | |
import copy, math | |
self.alpha = [] | |
self.classifiers = [] | |
N = len(X) | |
nit = range(N) | |
w = [1.0 / N] * N | |
for t in range(self.T): | |
cls = copy.deepcopy(self.classifier) | |
cls.model(X, Y) | |
Yp = [cls.predict(x) for x in X] | |
e = sum(w[i] for i in nit if Yp[i] != Y[i]) / float(N) | |
a = 0.5 * math.log((1 - e) / e) | |
for i in nit: | |
if Yp[i] == Y[i]: w[i] *= math.exp(-a) | |
else: w[i] *= math.exp(a) | |
z = sum(w) * 1.0 | |
for i in nit: w[i] /= z | |
self.alpha.append(a) | |
self.classifiers.append(cls) | |
def predict(self, x): | |
p = sum(self.alpha[t] * self.classifiers[t].predict(x) for t in range(self.T)) | |
return 1 if p > 0 else -1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
@author: John Smith | |
''' | |
import math, random | |
from base import Classifier | |
class NeuralNetwork(Classifier): | |
''' neural network classifier ''' | |
@staticmethod | |
def sampling(terms, docs): | |
X = [] # features | |
Y = [] # labels | |
for d in docs: | |
d.prepare() | |
X.append([d.seglist.count(t) for t in terms]) | |
Y.append(1 if d.cat == '-' else -1) | |
return (X, Y) | |
def __init__(self, target, i, h, o, r = 0.01): | |
self.target = target | |
# network output for each category | |
# e.g. {+1: [+1, +1, +1], [-1: [-1, -1, -1], ...} | |
self.ni = i # number of input nodes | |
self.nh = h # number of hidden nodes | |
self.no = o # number of output nodes | |
self.rate = r # learning rate | |
def savem(self): | |
data = {'target': self.target, 'rate': self.rate, | |
'ni': self.ni, 'nh': self.nh, 'no': self.no, | |
'vi': self.vi, 'vh': self.vh, 'vo': self.vo, | |
'wi': self.wi, 'wo': self.wo | |
} | |
return data | |
def loadm(self, data): | |
self.target = data['target'] | |
self.rate = data['rate'] | |
self.ni = data['ni'] | |
self.nh = data['nh'] | |
self.no = data['no'] | |
self.vi = data['vi'] | |
self.vh = data['vh'] | |
self.vo = data['vo'] | |
self.wi = data['wi'] | |
self.wo = data['wo'] | |
def model(self, X, Y): | |
ri = range(self.ni) | |
rh = range(self.nh) | |
ro = range(self.no) | |
rx = range(len(X)) | |
self.vi = [1.0] * self.ni | |
# values of input nodes | |
self.vh = [1.0] * self.nh | |
# values of hidden nodes | |
self.vo = [1.0] * self.no | |
# values of output nodes | |
self.wi = [[random.uniform(-self.rate, self.rate) for j in ri] for x in rh] | |
# weight matrix for input nodes to hidden nodes | |
self.wo = [[random.uniform(-self.rate, self.rate) for j in rh] for x in ro] | |
# weight matrix for hidden nodes to output nodes | |
for i in rx: | |
self.forword(X[i]) | |
self.backpropagate(self.target[Y[i]]) | |
# train the network | |
def predict(self, x): | |
output = self.forword(x) | |
d = {t: self.euclidean(output, self.target[t]) for t in self.target} | |
return min(d, key = d.get) | |
def forword(self, inputs): | |
for i in range(self.ni): | |
self.vi[i] = inputs[i] | |
# activate inputs nodes | |
for i in range(self.nh): | |
self.vh[i] = self.sigmod(self.dotproduct(self.vi, self.wi[i])) | |
# activate hidden nodes | |
for i in range(self.no): | |
self.vo[i] = self.sigmod(self.dotproduct(self.vh, self.wo[i])) | |
# activate output nodes | |
return self.vo[:] | |
def backpropagate(self, outputs): | |
ri = range(self.ni) | |
rh = range(self.nh) | |
ro = range(self.no) | |
o_deltas = [0.0] * self.no | |
for i in ro: | |
e = outputs[i] - self.vo[i] | |
o_deltas[i] = self.dsigmoid(self.vo[i]) * e | |
# calculate error for output | |
h_deltas = [0.0] * self.nh | |
for i in rh: | |
e = 0.0 | |
for j in ro: | |
e += o_deltas[j] * self.wo[j][i] | |
h_deltas[i] = self.dsigmoid(self.vh[i]) * e | |
# calculate error for hidden | |
for i in ro: | |
for j in rh: | |
self.wo[i][j] += self.rate * o_deltas[i] * self.vh[j] | |
# update output weights | |
for i in rh: | |
for j in ri: | |
self.wi[i][j] += self.rate * h_deltas[i] * self.vi[j] | |
# update input weights | |
e = sum([0.5 * (outputs[i] - self.vo[i]) ** 2 for i in ro]) | |
#print e | |
return e | |
def sigmod(self, x): | |
#return 1.0 / (1 + math.exp(-x)) | |
return math.tanh(x) | |
def dsigmoid(self, x): | |
#return x * (1.0 - x) | |
return 1.0 - x ** 2 | |
def dotproduct(self, x, y): | |
# calculate dot product of two vectors | |
return math.fsum(x[i] * y[i] for i in range(len(x))) | |
def euclidean(self, x, y): | |
# calculate euclidean distance of two vector | |
return math.sqrt(math.fsum([math.pow(x[i] - y[i], 2) for i in range(len(x))])) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
@author: John Smith | |
''' | |
class CrossValidation(object): | |
def shuffle(self, samples): | |
import random | |
(X, Y) = samples | |
array = range(len(X)) | |
random.shuffle(array) | |
samples = ([X[i] for i in array], [Y[i] for i in array]) | |
return samples | |
def validate(self, classifier, samples, cat, fold = 10): | |
p, r, a, f = [0] * 4 | |
X, Y = self.shuffle(samples) | |
sit = range(len(X)) | |
for k in range(fold): | |
trainx = [X[i] for i in sit if i % fold != k] | |
trainy = [Y[i] for i in sit if i % fold != k] | |
testx = [X[i] for i in sit if i % fold == k] | |
testy = [Y[i] for i in sit if i % fold == k] | |
(pi, ri, ai, fi) = self.test(classifier, trainx, trainy, testx, testy, cat) | |
self.printf(pi, ri, ai, fi) | |
p += pi; r += ri; a += ai; f += fi; | |
m = float(fold) | |
self.printf(p / m, r / m, a / m, f / m) | |
return (p / m, r / m, a / m, f / m) | |
def test(self, classifier, trainx, trainy, testx, testy, cat): | |
classifier.model(trainx, trainy) | |
pc = [(classifier.predict(testx[i]), testy[i]) for i in range(len(testx))] | |
mp, np, nr, ma = [0] * 4 | |
for x in pc: | |
if x[0] == cat and x[1] == cat: mp += 1 | |
if x[0] == cat: np += 1 | |
if x[1] == cat: nr += 1 | |
if x[0] == x[1]: ma += 1 | |
p = (float(mp) / float(np)) if np > 0 else 0 # precision | |
r = (float(mp) / float(nr)) if nr > 0 else 0 # recall | |
a = float(ma) / float(len(pc)) # accuracy | |
f = self.f_mesure(p, r, 1.0) # f-mesure | |
return (p, r, a, f) | |
def f_mesure(self, p, r, beta = 1): | |
b = beta ** 2 | |
try: f = float((b + 1) * p * r) / float(b * p + r) | |
except: f = 0 | |
return f | |
def printf(self, p, r, a, f): | |
t = (p * 100.0, r * 100.0, a * 100.0, f * 100.0) | |
print 'P: %0.4f%% R: %0.4f%% A: %0.4f%% F: %0.4f%%' % t |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
@author: Smith | |
''' | |
from base import Classifier | |
class BalanceWinnow(Classifier): | |
@staticmethod | |
def sampling(terms, docs): | |
X = [] | |
Y = [] | |
for d in docs: | |
d.prepare() | |
X.append([1 if t in d.segset else 0 for t in terms]) | |
Y.append(1 if d.cat == '-' else 0) | |
return (X, Y) | |
@staticmethod | |
def theta(terms, docs): | |
a = [sum(d.seglist.count(t) for t in terms) for d in docs] | |
return sum(a) / len(a) | |
def __init__(self, alpha, beta, theta): | |
self.alpha = alpha | |
self.beta = beta | |
self.theta = theta | |
def savem(self): | |
data = {'numf': self.numf, | |
'alpha': self.alpha, | |
'beta': self.beta, | |
'theta': self.theta, | |
'w_pos': self.w_pos, | |
'w_neg': self.w_neg, | |
} | |
return data | |
def loadm(self, data): | |
self.features = data['numf'] | |
self.alpha = data['alpha'] | |
self.beta = data['beta'] | |
self.theta = data['theta'] | |
self.w_pos = data['w_pos'] | |
self.w_neg = data['w_neg'] | |
def model(self, X, Y): | |
self.numf = len(X[0]) | |
self.w_pos = [2] * self.numf | |
self.w_neg = [1] * self.numf | |
sit = range(len(X)) | |
fit = range(self.numf) | |
for i in sit: | |
p = self.predict(X[i]) | |
if p == 1 and p != Y[i]: | |
# decrease the weights | |
for f in fit: | |
if X[i][f] != 0: | |
self.w_pos[f] *= self.beta | |
self.w_neg[f] *= self.alpha | |
elif p == 0 and p != Y[i]: | |
# increase the weights | |
for f in fit: | |
if X[i][f] != 0: | |
self.w_pos[f] *= self.alpha | |
self.w_neg[f] *= self.beta | |
def predict(self, x): | |
r = range(self.numf) | |
s = sum((self.w_pos[i] - self.w_neg[i]) * x[i] for i in r) | |
return 1 if s > self.theta else 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment