spiderChow · December 15, 2017 07:59
diff --git a/reader.py b/reader.py
 from sklearn import preprocessing


 def reader(data_file):
     """
     Load a labelled sentence file and encode its categories as integers.

     The file is consumed in groups of three lines: the first line of each
     group is taken as the raw category, the second as the raw sentence and
     the third is ignored.

     :param data_file: path of the text file to read
     :return: sens:List: stripped raw sentences
              labels:List: integer label of every category line
              labels_encoder:LabelEncoder() from sklearn, fitted on the categories
     """
     with open(data_file) as handle:
         rows = handle.readlines()

     # stride 3 from offset 0 -> category lines, from offset 1 -> sentence lines
     cats = [row.strip() for row in rows[0::3]]
     sens = [row.strip() for row in rows[1::3]]

     # turn the raw category strings into integer ids; fit() hands back the
     # encoder itself, so the fitted object is what gets returned to callers
     labels_encoder = preprocessing.LabelEncoder().fit(cats)
     labels = labels_encoder.transform(cats).tolist()  # numpy array -> plain list

     return sens, labels, labels_encoder
diff --git a/sklearn_text.py b/sklearn_text.py
 import logging
 import numpy as np
 from optparse import OptionParser
 import sys
 from time import time
 import matplotlib.pyplot as plt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_extraction.text import HashingVectorizer
 from sklearn.feature_selection import SelectFromModel
 from sklearn.feature_selection import SelectKBest, chi2
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import RidgeClassifier
 from sklearn.pipeline import Pipeline
 from sklearn.svm import LinearSVC
 from sklearn.linear_model import SGDClassifier
 from sklearn.linear_model import Perceptron
 from sklearn.linear_model import PassiveAggressiveClassifier
 from sklearn.naive_bayes import BernoulliNB, MultinomialNB
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.neighbors import NearestCentroid
 from sklearn.ensemble import RandomForestClassifier

 from Reader import reader
 from bench import benchmark


 # Configure the root logger to write INFO-and-above records to classifier.log.
 logging.basicConfig(
     filename="classifier.log",
     format='%(asctime)s %(levelname)s %(message)s',
     level=logging.INFO,
 )

 # Layout shared by the two dedicated log files below.
 fmt = "%(asctime)-15s %(levelname)s :: %(message)s"
 formatter = logging.Formatter(fmt=fmt)

 # Parameter logger: appends to para.log.
 param_logger = logging.getLogger("paramLogger")
 param_logger.setLevel(logging.INFO)
 fh_param = logging.FileHandler(filename="para.log", mode="a")
 fh_param.setLevel(logging.INFO)
 fh_param.setFormatter(formatter)
 param_logger.addHandler(fh_param)

 # Result logger: a stream handler first, then result.log (appended).
 result_logger = logging.getLogger("resultLogger")
 result_logger.setLevel(logging.INFO)
 # stream=None makes StreamHandler fall back to sys.stderr; note that no
 # formatter is attached to this handler, only to the two file handlers.
 sh = logging.StreamHandler(stream=None)
 sh.setLevel(logging.INFO)
 fh_resu = logging.FileHandler(filename="result.log", mode="a")
 fh_resu.setLevel(logging.INFO)
 fh_resu.setFormatter(formatter)
 result_logger.addHandler(sh)
 result_logger.addHandler(fh_resu)


 # parse commandline arguments
 op = OptionParser()
 op.add_option("--report",
              action="store_true", dest="print_report",
              help="Print a detailed classification report.")
 op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")
 op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")
 op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class"
                   " for every classifier.")
 op.add_option("--all_categories",
              action="store_true", dest="all_categories",
              help="Whether to use all categories or not.")
 op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")
 op.add_option("--n_features",
              action="store", type=int, default=2 ** 16,
              help="n_features when using the hashing vectorizer.")
 op.add_option("--filtered",
              action="store_true",
              help="Remove newsgroup information that is easily overfit: "
                   "headers, signatures, and quoting.")

 (opts, args) = op.parse_args()

 # Load the corpus: raw sentences, integer labels and the fitted label encoder.
 _DATA_FILE = "V_0.0.7_generated_question_intent_3000.txt"
 sens, labels, labels_encoder = reader(data_file=_DATA_FILE)
 target_names = list(labels_encoder.classes_)
 param_logger.info("target_names are {}".format(target_names))

 # Split the corpus into train and test: 33% is held out, and the fixed
 # random_state keeps the split the same from run to run.
 X_raw_train, X_raw_test, y_train, y_test = train_test_split(
     sens,
     labels,
     test_size=0.33,
     random_state=42,
 )

 # Extract features with a sparse TF-IDF vectorizer.  It is fitted on the
 # training sentences only and then applied unchanged to the test sentences.
 vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
 X_train = vectorizer.fit_transform(X_raw_train)
 X_test = vectorizer.transform(X_raw_test)
 feature_names = vectorizer.get_feature_names()

 param_logger.info("feature_name's length is {}".format(len(feature_names)))


 def _run_group(title, estimators):
     """Log a separator line and *title*, then benchmark every estimator.

     Each estimator is passed to ``benchmark`` together with the module-level
     train/test matrices, labels, target names and parsed options.

     :param title: heading written to the result log for this group
     :param estimators: estimators to benchmark, in order
     :return: list with one ``benchmark`` return value per estimator
     """
     result_logger.info('=' * 80)
     result_logger.info(title)
     return [
         benchmark(est, X_train, y_train, X_test, y_test, target_names, opts)
         for est in estimators
     ]


 # One entry per benchmark() call, in execution order.
 results = []

 # Stand-alone classifiers, each logged under its own heading.
 for clf, name in (
         (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
         (Perceptron(n_iter=50), "Perceptron"),
         (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
         (KNeighborsClassifier(n_neighbors=10), "kNN"),
         (RandomForestClassifier(n_estimators=100), "Random forest")):
     results.extend(_run_group(name, [clf]))

 # For each penalty: the Liblinear model first, then the SGD model.
 for penalty in ["l2", "l1"]:
     results.extend(_run_group("%s penalty" % penalty.upper(), [
         LinearSVC(penalty=penalty, dual=False, tol=1e-3),
         SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty),
     ]))

 # SGD with Elastic Net penalty
 results.extend(_run_group("Elastic-Net penalty", [
     SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"),
 ]))

 # NearestCentroid without threshold
 results.extend(_run_group("NearestCentroid (aka Rocchio classifier)", [
     NearestCentroid(),
 ]))

 # Sparse Naive Bayes classifiers
 results.extend(_run_group("Naive Bayes", [
     MultinomialNB(alpha=.01),
     BernoulliNB(alpha=.01),
 ]))

 # L1-penalised LinearSVC selects the features, an L2-penalised LinearSVC
 # classifies on them.
 # The smaller C, the stronger the regularization.
 # The more regularization, the more sparsity.
 results.extend(_run_group("LinearSVC with L1-based feature selection", [
     Pipeline([
         ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                                         tol=1e-3))),
         ('classification', LinearSVC(penalty="l2"))]),
 ]))

 # # make some plots
 #
 # indices = np.arange(len(results))
 #
 # results = [[x[i] for x in results] for i in range(4)]
 #
 # clf_names, score, training_time, test_time = results
 # training_time = np.array(training_time) / np.max(training_time)
 # test_time = np.array(test_time) / np.max(test_time)

 # plt.figure(figsize=(12, 8))
 # plt.title("Score")
 # plt.barh(indices, score, .2, label="score", color='navy')
 # plt.barh(indices + .3, training_time, .2, label="training time",
 #          color='c')
 # plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
 # plt.yticks(())
 # plt.legend(loc='best')
 # plt.subplots_adjust(left=.25)
 # plt.subplots_adjust(top=.95)
 # plt.subplots_adjust(bottom=.05)
 #
 # for i, c in zip(indices, clf_names):
 #     plt.text(-.3, i, c)
 #
 # plt.show()
	from sklearn import preprocessing


	def reader(data_file):
		"""
		Load a labelled sentence file and encode its categories as integers.

		The file is consumed in groups of three lines: the first line of each
		group is taken as the raw category, the second as the raw sentence and
		the third is ignored.

		:param data_file: path of the text file to read
		:return: sens:List: stripped raw sentences
		         labels:List: integer label of every category line
		         labels_encoder:LabelEncoder() from sklearn, fitted on the categories
		"""
		with open(data_file) as handle:
			rows = handle.readlines()

		# stride 3 from offset 0 -> category lines, from offset 1 -> sentence lines
		cats = [row.strip() for row in rows[0::3]]
		sens = [row.strip() for row in rows[1::3]]

		# turn the raw category strings into integer ids; fit() hands back the
		# encoder itself, so the fitted object is what gets returned to callers
		labels_encoder = preprocessing.LabelEncoder().fit(cats)
		labels = labels_encoder.transform(cats).tolist()  # numpy array -> plain list

		return sens, labels, labels_encoder
	import logging
	import numpy as np
	from optparse import OptionParser
	import sys
	from time import time
	import matplotlib.pyplot as plt
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.feature_extraction.text import HashingVectorizer
	from sklearn.feature_selection import SelectFromModel
	from sklearn.feature_selection import SelectKBest, chi2
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import RidgeClassifier
	from sklearn.pipeline import Pipeline
	from sklearn.svm import LinearSVC
	from sklearn.linear_model import SGDClassifier
	from sklearn.linear_model import Perceptron
	from sklearn.linear_model import PassiveAggressiveClassifier
	from sklearn.naive_bayes import BernoulliNB, MultinomialNB
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.neighbors import NearestCentroid
	from sklearn.ensemble import RandomForestClassifier

	from Reader import reader
	from bench import benchmark


	# Configure the root logger to write INFO-and-above records to classifier.log.
	logging.basicConfig(
		filename="classifier.log",
		format='%(asctime)s %(levelname)s %(message)s',
		level=logging.INFO,
	)

	# Layout shared by the two dedicated log files below.
	fmt = "%(asctime)-15s %(levelname)s :: %(message)s"
	formatter = logging.Formatter(fmt=fmt)

	# Parameter logger: appends to para.log.
	param_logger = logging.getLogger("paramLogger")
	param_logger.setLevel(logging.INFO)
	fh_param = logging.FileHandler(filename="para.log", mode="a")
	fh_param.setLevel(logging.INFO)
	fh_param.setFormatter(formatter)
	param_logger.addHandler(fh_param)

	# Result logger: a stream handler first, then result.log (appended).
	result_logger = logging.getLogger("resultLogger")
	result_logger.setLevel(logging.INFO)
	# stream=None makes StreamHandler fall back to sys.stderr; note that no
	# formatter is attached to this handler, only to the two file handlers.
	sh = logging.StreamHandler(stream=None)
	sh.setLevel(logging.INFO)
	fh_resu = logging.FileHandler(filename="result.log", mode="a")
	fh_resu.setLevel(logging.INFO)
	fh_resu.setFormatter(formatter)
	result_logger.addHandler(sh)
	result_logger.addHandler(fh_resu)


	# parse commandline arguments
	op = OptionParser()
	op.add_option("--report",
	action="store_true", dest="print_report",
	help="Print a detailed classification report.")
	op.add_option("--chi2_select",
	action="store", type="int", dest="select_chi2",
	help="Select some number of features using a chi-squared test")
	op.add_option("--confusion_matrix",
	action="store_true", dest="print_cm",
	help="Print the confusion matrix.")
	op.add_option("--top10",
	action="store_true", dest="print_top10",
	help="Print ten most discriminative terms per class"
	" for every classifier.")
	op.add_option("--all_categories",
	action="store_true", dest="all_categories",
	help="Whether to use all categories or not.")
	op.add_option("--use_hashing",
	action="store_true",
	help="Use a hashing vectorizer.")
	op.add_option("--n_features",
	action="store", type=int, default=2 ** 16,
	help="n_features when using the hashing vectorizer.")
	op.add_option("--filtered",
	action="store_true",
	help="Remove newsgroup information that is easily overfit: "
	"headers, signatures, and quoting.")

	(opts, args) = op.parse_args()

	# Load the corpus: raw sentences, integer labels and the fitted label encoder.
	_DATA_FILE = "V_0.0.7_generated_question_intent_3000.txt"
	sens, labels, labels_encoder = reader(data_file=_DATA_FILE)
	target_names = list(labels_encoder.classes_)
	param_logger.info("target_names are {}".format(target_names))

	# Split the corpus into train and test: 33% is held out, and the fixed
	# random_state keeps the split the same from run to run.
	X_raw_train, X_raw_test, y_train, y_test = train_test_split(
		sens,
		labels,
		test_size=0.33,
		random_state=42,
	)

	# Extract features with a sparse TF-IDF vectorizer.  It is fitted on the
	# training sentences only and then applied unchanged to the test sentences.
	vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
	X_train = vectorizer.fit_transform(X_raw_train)
	X_test = vectorizer.transform(X_raw_test)
	feature_names = vectorizer.get_feature_names()

	param_logger.info("feature_name's length is {}".format(len(feature_names)))


	def _run_group(title, estimators):
		"""Log a separator line and *title*, then benchmark every estimator.

		Each estimator is passed to ``benchmark`` together with the module-level
		train/test matrices, labels, target names and parsed options.

		:param title: heading written to the result log for this group
		:param estimators: estimators to benchmark, in order
		:return: list with one ``benchmark`` return value per estimator
		"""
		result_logger.info('=' * 80)
		result_logger.info(title)
		return [
			benchmark(est, X_train, y_train, X_test, y_test, target_names, opts)
			for est in estimators
		]


	# One entry per benchmark() call, in execution order.
	results = []

	# Stand-alone classifiers, each logged under its own heading.
	for clf, name in (
			(RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
			(Perceptron(n_iter=50), "Perceptron"),
			(PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
			(KNeighborsClassifier(n_neighbors=10), "kNN"),
			(RandomForestClassifier(n_estimators=100), "Random forest")):
		results.extend(_run_group(name, [clf]))

	# For each penalty: the Liblinear model first, then the SGD model.
	for penalty in ["l2", "l1"]:
		results.extend(_run_group("%s penalty" % penalty.upper(), [
			LinearSVC(penalty=penalty, dual=False, tol=1e-3),
			SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty),
		]))

	# SGD with Elastic Net penalty
	results.extend(_run_group("Elastic-Net penalty", [
		SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"),
	]))

	# NearestCentroid without threshold
	results.extend(_run_group("NearestCentroid (aka Rocchio classifier)", [
		NearestCentroid(),
	]))

	# Sparse Naive Bayes classifiers
	results.extend(_run_group("Naive Bayes", [
		MultinomialNB(alpha=.01),
		BernoulliNB(alpha=.01),
	]))

	# L1-penalised LinearSVC selects the features, an L2-penalised LinearSVC
	# classifies on them.
	# The smaller C, the stronger the regularization.
	# The more regularization, the more sparsity.
	results.extend(_run_group("LinearSVC with L1-based feature selection", [
		Pipeline([
			('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
				tol=1e-3))),
			('classification', LinearSVC(penalty="l2"))]),
	]))

	# # make some plots
	#
	# indices = np.arange(len(results))
	#
	# results = [[x[i] for x in results] for i in range(4)]
	#
	# clf_names, score, training_time, test_time = results
	# training_time = np.array(training_time) / np.max(training_time)
	# test_time = np.array(test_time) / np.max(test_time)

	# plt.figure(figsize=(12, 8))
	# plt.title("Score")
	# plt.barh(indices, score, .2, label="score", color='navy')
	# plt.barh(indices + .3, training_time, .2, label="training time",
	# color='c')
	# plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
	# plt.yticks(())
	# plt.legend(loc='best')
	# plt.subplots_adjust(left=.25)
	# plt.subplots_adjust(top=.95)
	# plt.subplots_adjust(bottom=.05)
	#
	# for i, c in zip(indices, clf_names):
	# plt.text(-.3, i, c)
	#
	# plt.show()