Created
October 19, 2015 09:19
-
-
Save karimkhanp/dba84d95d035c3177ab3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import division | |
| from math import log, exp | |
| from operator import mul | |
| from collections import Counter | |
| import os | |
| import pylab | |
| import cPickle | |
class MyDict(dict):
    """Dictionary that returns 0 (instead of raising KeyError) for missing keys.

    Used as a sparse counter for feature frequencies, so lookups of
    unseen features contribute a count of zero.
    """

    def __getitem__(self, key):
        # dict.get never recurses into __getitem__ and does a single
        # hash lookup, unlike the original "if key in self" double lookup.
        return self.get(key, 0)
# Global count tables: pos[w] / neg[w] hold how many positive / negative
# training documents contain feature w (missing keys read as 0 via MyDict).
pos = MyDict()
neg = MyDict()
# Features kept after feature selection; classify() scores only these.
features = set()
# totals[0] / totals[1]: sums of all counts in pos / neg — the
# denominators of the Laplace-smoothed probabilities.
totals = [0, 0]
# Every non-alphanumeric ASCII character, intended for stripping tokens.
# NOTE(review): currently unused — negate_sequence strips `delims` instead.
delchars = ''.join(c for c in map(chr, range(128)) if not c.isalnum())
# Pickle cache of the raw counts built by train().
CDATA_FILE = "countdata.pickle"
# Pickle cache of the counts restricted to the selected feature set.
FDATA_FILE = "reduceddata.pickle"
def negate_sequence(text):
    """
    Detects negations and transforms negated words into "not_" form.

    Splits `text` on whitespace, lower-cases each token and strips the
    sentence delimiters, and prefixes tokens with "not_" while inside a
    negation scope (opened by any token containing "not"/"n't"/"no",
    closed by any token containing a delimiter).  Emits, for each token,
    the unigram plus the bigram and trigram ending at it.

    Returns a list of unigram/bigram/trigram feature strings.
    """
    negation = False
    delims = "?.,!:;"
    result = []
    words = text.split()
    prev = None
    pprev = None
    for word in words:
        # stripped = word.strip(delchars)
        stripped = word.strip(delims).lower()
        negated = "not_" + stripped if negation else stripped
        result.append(negated)
        # Bug fix: `bigram` used to leak across iterations, so when
        # `prev` was falsy (e.g. a punctuation-only token stripped to "")
        # a stale bigram could be spliced into a trigram.  Build the
        # trigram only from the bigram formed in THIS iteration.
        if prev:
            bigram = prev + " " + negated
            result.append(bigram)
            if pprev:
                trigram = pprev + " " + bigram
                result.append(trigram)
        pprev = prev
        prev = negated
        # A negation word flips the scope; a clause delimiter closes it.
        if any(neg in word for neg in ["not", "n't", "no"]):
            negation = not negation
        if any(c in word for c in delims):
            negation = False
    return result
def train():
    """
    Build the pos/neg count tables from the aclImdb training set.

    For every feature produced by negate_sequence(), counts the number
    of positive and negative documents containing it; the "not_" form of
    each feature is credited to the opposite class.  Counts are pruned
    and cached in CDATA_FILE; an existing cache is loaded instead.
    """
    global pos, neg, totals
    retrain = False
    # Load counts if they already exist.  'rb': pickles are binary data.
    if not retrain and os.path.isfile(CDATA_FILE):
        with open(CDATA_FILE, 'rb') as f:
            pos, neg, totals = cPickle.load(f)
        return

    limit = 12500
    # `fname` instead of `file`: don't shadow the builtin; `with` closes
    # each review file (the original leaked one handle per document).
    for fname in os.listdir("./aclImdb/train/pos")[:limit]:
        with open("./aclImdb/train/pos/" + fname) as f:
            text = f.read()
        # set(): a feature counts once per document, not per occurrence.
        for word in set(negate_sequence(text)):
            pos[word] += 1
            # The negated form of a positive feature is negative evidence.
            neg['not_' + word] += 1
    for fname in os.listdir("./aclImdb/train/neg")[:limit]:
        with open("./aclImdb/train/neg/" + fname) as f:
            text = f.read()
        for word in set(negate_sequence(text)):
            neg[word] += 1
            pos['not_' + word] += 1

    prune_features()
    totals[0] = sum(pos.values())
    totals[1] = sum(neg.values())
    countdata = (pos, neg, totals)
    # 'wb': the original wrote the pickle in text mode ('w'), which can
    # corrupt it on some platforms.
    with open(CDATA_FILE, 'wb') as f:
        cPickle.dump(countdata, f)
def classify(text):
    """Naive Bayes verdict: True if `text` looks positive, else False.

    Only features surviving selection (the global `features` set) are
    scored; with no known feature the classifier defaults to positive.
    """
    known = set(w for w in negate_sequence(text) if w in features)
    if not known:
        return True
    # Laplace-smoothed log-likelihood of the features under each class.
    p_score = 0.0
    n_score = 0.0
    for w in known:
        p_score += log((pos[w] + 1) / (2 * totals[0]))
        n_score += log((neg[w] + 1) / (2 * totals[1]))
    return p_score > n_score
def classify2(text):
    """
    For classification from pretrained data.

    Same decision rule as classify(), but scores every feature present
    in either count table instead of only the selected `features` set.
    Returns True (positive) by default when no feature is known.
    """
    candidates = negate_sequence(text)
    known = set(w for w in candidates if w in pos or w in neg)
    if not known:
        return True
    # Compare Laplace-smoothed log-likelihoods under the two classes.
    p_score = sum(log((pos[w] + 1) / (2 * totals[0])) for w in known)
    n_score = sum(log((neg[w] + 1) / (2 * totals[1])) for w in known)
    return p_score > n_score
def classify_demo(text):
    """
    Interactive demo: print positive/negative scores for `text`.

    Scores every feature of `text` present in either count table.
    NOTE(review): despite the old "Result with log" label below, pp/np
    are raw Laplace-smoothed probabilities — no log is taken — and the
    printed percentages are simply their normalized shares.

    Returns True (after a notice) when no known feature is found;
    otherwise prints the scores and returns None.
    """
    words = set(word for word in negate_sequence(text) if word in pos or word in neg)
    if (len(words) == 0):
        print "No features to compare on"
        return True
    pprob, nprob = 0, 0
    # print "\nResult with log"
    for word in words:
        # Laplace-smoothed probability of `word` under each class.
        pp = (pos[word] + 1) / (2 * totals[0])
        np = (neg[word] + 1) / (2 * totals[1])
        #print "%15s %.9f %.9f" % (word, exp(pp), exp(np))
        pprob += pp
        nprob += np
        # print pp, ", ", np
    print "Positive prob : ", pprob
    print "Negative prob : ", nprob
    # Normalize the two sums into percentage shares for display.
    p = (pprob/(pprob + nprob))*100
    n = (nprob/(pprob + nprob))*100
    print "Positive : ", p
    print "Negative : ", n
    # print ("Positive" if pprob > nprob else "Negative"), "log-diff = %.9f" % abs(pprob - nprob)
    #
    # print "\nResult without log"
    # pprob1, nprob1 = 0, 0
    # for word in words:
    #     pp1 = (pos[word] + 1) / (2 * totals[0])
    #     np1 = (neg[word] + 1) / (2 * totals[1])
    #     #print "%15s %.9f %.9f" % (word, exp(pp), exp(np))
    #     pprob1 += pp1
    #     nprob1 += np1
    #     print pp1, ", ", np1
    #
    # print ("Positive" if pprob > nprob else "Negative"), "Pprob : %f " % pprob1, "nprob : %f " % nprob1
def MI(word):
    """
    Compute the weighted mutual information of a term.

    Sums, for each class in which `word` occurs, the contribution of
    both its presence and its absence to the term/class mutual
    information (natural log).  Unseen terms score 0.

    Returns a float.
    """
    T = totals[0] + totals[1]
    W = pos[word] + neg[word]
    I = 0
    if W == 0:
        return 0
    if W == T:
        # Term accounts for every count: the "doesn't occur" terms would
        # divide by (T - W) == 0.  Such a term carries no class signal,
        # so score 0 (the original raised ZeroDivisionError here).
        return 0
    if neg[word] > 0:
        if neg[word] < totals[1]:
            # doesn't occur in -ve.  Guarded: neg[word] == totals[1]
            # would take log(0) (original raised ValueError).
            I += (totals[1] - neg[word]) / T * log((totals[1] - neg[word]) * T / (T - W) / totals[1])
        # occurs in -ve
        I += neg[word] / T * log(neg[word] * T / W / totals[1])
    if pos[word] > 0:
        if pos[word] < totals[0]:
            # doesn't occur in +ve (same log(0) guard as above).
            I += (totals[0] - pos[word]) / T * log((totals[0] - pos[word]) * T / (T - W) / totals[0])
        # occurs in +ve
        I += pos[word] / T * log(pos[word] * T / W / totals[0])
    return I
def get_relevant_features():
    """Return (pos, neg, totals) restricted to the selected `features` set.

    Builds MyDict copies so missing keys still read as 0, with totals
    recomputed over the surviving counts.
    """
    selected_pos = MyDict()
    for key, count in pos.items():
        if key in features:
            selected_pos[key] = count
    selected_neg = MyDict()
    for key, count in neg.items():
        if key in features:
            selected_neg[key] = count
    selected_totals = [sum(selected_pos.values()), sum(selected_neg.values())]
    return (selected_pos, selected_neg, selected_totals)
def prune_features():
    """
    Remove features that appear only once.

    A key is dropped from a table only when BOTH its pos and neg counts
    are <= 1; keys already deleted from the other table read as 0 via
    MyDict, so the second pass stays consistent.
    """
    global pos, neg
    # list(...) snapshots the keys: deleting entries while iterating a
    # live key view raises RuntimeError on Python 3 and is fragile in
    # general (harmless no-op change on Python 2, where keys() copies).
    for k in list(pos.keys()):
        if pos[k] <= 1 and neg[k] <= 1:
            del pos[k]
    for k in list(neg.keys()):
        if neg[k] <= 1 and pos[k] <= 1:
            del neg[k]
| def feature_selection_trials(): | |
| """ | |
| Select top k features. Vary k and plot data | |
| """ | |
| global pos, neg, totals, features | |
| retrain = True | |
| if not retrain and os.path.isfile(FDATA_FILE): | |
| pos, neg, totals = cPickle.load(open(FDATA_FILE)) | |
| return | |
| words = list(set(pos.keys() + neg.keys())) | |
| print "Total no of features:", len(words) | |
| words.sort(key=lambda w: -MI(w)) | |
| num_features, accuracy = [], [] | |
| bestk = 0 | |
| limit = 500 | |
| path = "./aclImdb/test/" | |
| step = 500 | |
| start = 20000 | |
| best_accuracy = 0.0 | |
| for w in words[:start]: | |
| features.add(w) | |
| for k in xrange(start, 40000, step): | |
| for w in words[k:k+step]: | |
| features.add(w) | |
| correct = 0 | |
| size = 0 | |
| for file in os.listdir(path + "pos")[:limit]: | |
| correct += classify(open(path + "pos/" + file).read()) == True | |
| size += 1 | |
| for file in os.listdir(path + "neg")[:limit]: | |
| correct += classify(open(path + "neg/" + file).read()) == False | |
| size += 1 | |
| num_features.append(k+step) | |
| accuracy.append(correct / size) | |
| if (correct / size) > best_accuracy: | |
| bestk = k | |
| print k+step, correct / size | |
| features = set(words[:bestk]) | |
| cPickle.dump(get_relevant_features(), open(FDATA_FILE, 'w')) | |
| pylab.plot(num_features, accuracy) | |
| pylab.show() | |
| def test_pang_lee(): | |
| """ | |
| Tests the Pang Lee dataset | |
| """ | |
| total, correct = 0, 0 | |
| for fname in os.listdir("txt_sentoken/pos"): | |
| correct += int(classify2(open("txt_sentoken/pos/" + fname).read()) == True) | |
| total += 1 | |
| for fname in os.listdir("txt_sentoken/neg"): | |
| correct += int(classify2(open("txt_sentoken/neg/" + fname).read()) == False) | |
| total += 1 | |
| print "accuracy: %f" % (correct / total) | |
if __name__ == '__main__':
    # Build (or load the cached) count tables, then classify sentences
    # typed by the user in an endless interactive loop (Ctrl-C to exit).
    train()
    while True:
        text = raw_input("Please enter the sentence : ")
        classify_demo(text)
    # Alternative entry points, left for reference:
    # feature_selection_trials()
    # test_pang_lee()
    # classify_demo(open("pos_example").read())
    # classify_demo(open("neg_example").read())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment