Skip to content

Instantly share code, notes, and snippets.

@karimkhanp
Created October 19, 2015 09:19
Show Gist options
  • Select an option

  • Save karimkhanp/dba84d95d035c3177ab3 to your computer and use it in GitHub Desktop.

Select an option

Save karimkhanp/dba84d95d035c3177ab3 to your computer and use it in GitHub Desktop.
from __future__ import division
from math import log, exp
from operator import mul
from collections import Counter
import os
import pylab
import cPickle
class MyDict(dict):
    """
    Dictionary of counts in which a key that was never stored reads as 0.

    Reading an absent key does not insert it, so membership tests and
    len() only reflect keys that were explicitly assigned.
    """

    def __missing__(self, key):
        # dict's item lookup calls this hook only when the key is absent.
        return 0
# Per-feature counts for the positive and the negative class; a feature that
# was never counted reads as 0.
pos, neg = MyDict(), MyDict()
# Feature names that classify() is allowed to score on.
features = set()
# totals[0] / totals[1]: sum of all counts in pos / neg.
totals = [0, 0]
# Every 7-bit ASCII character that is neither a letter nor a digit.
delchars = ''.join(chr(code) for code in range(128) if not chr(code).isalnum())
# Pickle cache written and read by train().
CDATA_FILE = "countdata.pickle"
# Pickle cache written and read by feature_selection_trials().
FDATA_FILE = "reduceddata.pickle"
def negate_sequence(text):
    """
    Turn text into a flat list of unigram, bigram and trigram features,
    prefixing words that follow a negation with "not_".

    Words are split on whitespace, stripped of leading/trailing "?.,!:;"
    and lower-cased. A word containing "not", "n't" or "no" (matched as a
    case-sensitive substring of the raw word) toggles the negation state
    for the words after it; any word containing one of the punctuation
    characters switches negation off again.
    """
    punctuation = "?.,!:;"
    markers = ("not", "n't", "no")
    tokens = []
    negating = False
    last = None          # previous unigram
    before_last = None   # unigram before the previous one
    for raw in text.split():
        token = raw.strip(punctuation).lower()
        if negating:
            token = "not_" + token
        tokens.append(token)
        if last:
            pair = last + " " + token
            tokens.append(pair)
            if before_last:
                tokens.append(before_last + " " + pair)
            before_last = last
        last = token
        # The negation state is updated after the current word was emitted,
        # so the negation word itself is never prefixed by its own effect.
        if any(marker in raw for marker in markers):
            negating = not negating
        if any(mark in raw for mark in punctuation):
            negating = False
    return tokens
def train():
    """
    Fill the global pos / neg counts and totals.

    If CDATA_FILE exists the counts are loaded from it and nothing else is
    done. Otherwise up to 12500 files from each of ./aclImdb/train/pos and
    ./aclImdb/train/neg are read; every distinct feature of a review adds 1
    to the count of the review's class and 1 to the "not_"-prefixed feature
    of the opposite class. The counts are then pruned, totals is recomputed
    and the result is written to CDATA_FILE.
    """
    global pos, neg, totals
    retrain = False

    # Load counts if they already exist.
    # NOTE: unpickling can execute arbitrary code; the cache must only ever
    # be a file written by this program.
    if not retrain and os.path.isfile(CDATA_FILE):
        with open(CDATA_FILE) as cache:
            pos, neg, totals = cPickle.load(cache)
        return

    limit = 12500
    # (directory, counts of the review's class, counts of the other class)
    corpora = (
        ("./aclImdb/train/pos", pos, neg),
        ("./aclImdb/train/neg", neg, pos),
    )
    for directory, own, other in corpora:
        for fname in os.listdir(directory)[:limit]:
            # Close every review file as soon as it has been read.
            with open(directory + "/" + fname) as review:
                text = review.read()
            # set(): a feature counts at most once per review.
            for word in set(negate_sequence(text)):
                own[word] += 1
                other['not_' + word] += 1

    prune_features()

    totals[0] = sum(pos.values())
    totals[1] = sum(neg.values())

    with open(CDATA_FILE, 'w') as cache:
        cPickle.dump((pos, neg, totals), cache)
def classify(text):
    """
    Return True if text scores higher for the positive class than for the
    negative class, False otherwise.

    Only features contained in the global features set are scored. A text
    without any such feature is reported as positive (True).
    """
    known = {token for token in negate_sequence(text) if token in features}
    if not known:
        return True
    pos_denominator = 2 * totals[0]
    neg_denominator = 2 * totals[1]
    # Per class: sum over features of log((count + 1) / (2 * class total)).
    positive_score = 0
    negative_score = 0
    for token in known:
        positive_score += log((pos[token] + 1) / pos_denominator)
        negative_score += log((neg[token] + 1) / neg_denominator)
    return positive_score > negative_score
def classify2(text):
    """
    Classify text using whatever counts are currently loaded in pos / neg.

    Same scoring as classify(), except that a feature is used when it is a
    key of pos or of neg rather than a member of the features set. Returns
    True for positive, False for negative, and True when the text contains
    no known feature.
    """
    known = {token for token in negate_sequence(text)
             if token in pos or token in neg}
    if not known:
        return True
    pos_denominator = 2 * totals[0]
    neg_denominator = 2 * totals[1]
    # Per class: sum over features of log((count + 1) / (2 * class total)).
    positive_score = 0
    negative_score = 0
    for token in known:
        positive_score += log((pos[token] + 1) / pos_denominator)
        negative_score += log((neg[token] + 1) / neg_denominator)
    return positive_score > negative_score
def classify_demo(text):
words = set(word for word in negate_sequence(text) if word in pos or word in neg)
if (len(words) == 0):
print "No features to compare on"
return True
pprob, nprob = 0, 0
# print "\nResult with log"
for word in words:
pp = (pos[word] + 1) / (2 * totals[0])
np = (neg[word] + 1) / (2 * totals[1])
#print "%15s %.9f %.9f" % (word, exp(pp), exp(np))
pprob += pp
nprob += np
# print pp, ", ", np
print "Positive prob : ", pprob
print "Negative prob : ", nprob
p = (pprob/(pprob + nprob))*100
n = (nprob/(pprob + nprob))*100
print "Positive : ", p
print "Negative : ", n
# print ("Positive" if pprob > nprob else "Negative"), "log-diff = %.9f" % abs(pprob - nprob)
#
# print "\nResult without log"
# pprob1, nprob1 = 0, 0
# for word in words:
# pp1 = (pos[word] + 1) / (2 * totals[0])
# np1 = (neg[word] + 1) / (2 * totals[1])
# #print "%15s %.9f %.9f" % (word, exp(pp), exp(np))
# pprob1 += pp1
# nprob1 += np1
# print pp1, ", ", np1
#
# print ("Positive" if pprob > nprob else "Negative"), "Pprob : %f " % pprob1, "nprob : %f " % nprob1
def MI(word):
    """
    Compute the weighted mutual information of a term.

    Uses the global pos / neg counts and totals. Returns 0 for a term that
    has no count in either class.
    """
    grand_total = totals[0] + totals[1]
    occurrences = pos[word] + neg[word]
    if occurrences == 0:
        return 0
    elsewhere = grand_total - occurrences
    score = 0
    # Negative class first, then positive; a class in which the term has no
    # count contributes nothing.
    for count, class_total in ((neg[word], totals[1]), (pos[word], totals[0])):
        if count > 0:
            absent = class_total - count
            # term does not occur in this class
            score += absent / grand_total * log(absent * grand_total / elsewhere / class_total)
            # term occurs in this class
            score += count / grand_total * log(count * grand_total / occurrences / class_total)
    return score
def get_relevant_features():
    """
    Return (pos, neg, totals) restricted to the selected features.

    The two count dictionaries are fresh MyDict objects holding only the
    keys that are members of the global features set; the third element is
    the list [sum of kept pos counts, sum of kept neg counts].
    """
    kept_pos = MyDict()
    for term in pos:
        if term in features:
            kept_pos[term] = pos[term]
    kept_neg = MyDict()
    for term in neg:
        if term in features:
            kept_neg[term] = neg[term]
    kept_totals = [sum(kept_pos.values()), sum(kept_neg.values())]
    return (kept_pos, kept_neg, kept_totals)
def prune_features():
    """
    Drop rare features from the global pos and neg counts in place.

    A key is removed when its count is at most 1 in both pos and neg.
    """
    # Collect the keys first so that nothing is deleted while iterating.
    rare_in_pos = [term for term in list(pos)
                   if pos[term] <= 1 and neg[term] <= 1]
    for term in rare_in_pos:
        del pos[term]
    # Keys just removed from pos read as 0 here, which still satisfies <= 1.
    rare_in_neg = [term for term in list(neg)
                   if neg[term] <= 1 and pos[term] <= 1]
    for term in rare_in_neg:
        del neg[term]
def feature_selection_trials():
"""
Select top k features. Vary k and plot data
"""
global pos, neg, totals, features
retrain = True
if not retrain and os.path.isfile(FDATA_FILE):
pos, neg, totals = cPickle.load(open(FDATA_FILE))
return
words = list(set(pos.keys() + neg.keys()))
print "Total no of features:", len(words)
words.sort(key=lambda w: -MI(w))
num_features, accuracy = [], []
bestk = 0
limit = 500
path = "./aclImdb/test/"
step = 500
start = 20000
best_accuracy = 0.0
for w in words[:start]:
features.add(w)
for k in xrange(start, 40000, step):
for w in words[k:k+step]:
features.add(w)
correct = 0
size = 0
for file in os.listdir(path + "pos")[:limit]:
correct += classify(open(path + "pos/" + file).read()) == True
size += 1
for file in os.listdir(path + "neg")[:limit]:
correct += classify(open(path + "neg/" + file).read()) == False
size += 1
num_features.append(k+step)
accuracy.append(correct / size)
if (correct / size) > best_accuracy:
bestk = k
print k+step, correct / size
features = set(words[:bestk])
cPickle.dump(get_relevant_features(), open(FDATA_FILE, 'w'))
pylab.plot(num_features, accuracy)
pylab.show()
def test_pang_lee():
"""
Tests the Pang Lee dataset
"""
total, correct = 0, 0
for fname in os.listdir("txt_sentoken/pos"):
correct += int(classify2(open("txt_sentoken/pos/" + fname).read()) == True)
total += 1
for fname in os.listdir("txt_sentoken/neg"):
correct += int(classify2(open("txt_sentoken/neg/" + fname).read()) == False)
total += 1
print "accuracy: %f" % (correct / total)
if __name__ == '__main__':
    train()
    # Interactive demo: score one sentence per prompt until input ends.
    while True:
        try:
            text = raw_input("Please enter the sentence : ")
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: stop instead of dying with a traceback.
            break
        classify_demo(text)
    # Other entry points, disabled by default:
    # feature_selection_trials()
    # test_pang_lee()
    # classify_demo(open("pos_example").read())
    # classify_demo(open("neg_example").read())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment