Created
November 21, 2018 18:22
Federalist papers classification
#!/usr/bin/env python
import numpy as np
import pandas as pd
from sklearn import svm
FILE_NAME = 'mosteller-wallace-federalist-papers.csv'
STOP_WORDS = ['a', 'an']
# Rows are samples, cols are [...word_count, AUTHOR, CODE_NUMBER].
# Discard the CODE_NUMBER col because it's useless.
df = pd.read_csv(FILE_NAME).iloc[:, :-1]
# In an exciting twist, rather than stop words being omitted, they're
# the only words we _do_ include.
df = df.loc[:, STOP_WORDS + ['AUTHOR']]
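# A quick illustration of that selection (toy data, not the real CSV):
# df.loc with a column list keeps only the named columns, in that order,
# so every non-stop-word count column is dropped in one step.
import pandas as pd  # repeated so this snippet stands alone
_toy = pd.DataFrame({'a': [3, 1], 'an': [0, 2], 'the': [5, 5],
                     'AUTHOR': ['HAMILTON', 'MADISON']})
assert list(_toy.loc[:, ['a', 'an', 'AUTHOR']].columns) == ['a', 'an', 'AUTHOR']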
# Partition the dataset into X_train and X_test in a special way:
# samples whose label is 'unknown' ('HAMILTON OR MADISON') will be
# X_test, and everything written by those two men is X_train.
# Filter using boolean indexing: https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing
mask_unknown = df['AUTHOR'] == 'HAMILTON OR MADISON'
df_unknown = df[mask_unknown]
df_known = df[(df['AUTHOR'] == 'HAMILTON') | (df['AUTHOR'] == 'MADISON')]
X_train, y_train = df_known.iloc[:, :-1], df_known.iloc[:, -1:].values.ravel()
X_test = df_unknown.iloc[:, :-1]
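# The same boolean-indexing pattern on a tiny made-up frame, for illustration:
# comparing a column to a scalar yields a boolean Series, and indexing the
# frame with that Series keeps only the rows where it's True.
import pandas as pd  # repeated so this snippet stands alone
_demo = pd.DataFrame({'AUTHOR': ['HAMILTON', 'MADISON', 'HAMILTON OR MADISON']})
assert len(_demo[_demo['AUTHOR'] == 'HAMILTON OR MADISON']) == 1
assert len(_demo[(_demo['AUTHOR'] == 'HAMILTON') | (_demo['AUTHOR'] == 'MADISON')]) == 2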
# Rando classifier, from https://scikit-learn.org/stable/tutorial/basic/tutorial.html#learning-and-predicting
clf_svm = svm.SVC(gamma=0.001, C=100.)
clf_svm.fit(X_train, y_train)
from sklearn import naive_bayes
# A given author has some 'true' probability of producing a given sample
# (AKA feature vector AKA word count histogram for a document), and the task
# is to try to estimate it. If we could, then we could substitute that value
# directly into Bayes' theorem as the 'likelihood' term, p(x | C_k). Along
# with the easily-estimated (substantiated below) 'prior' term p(C_k), the
# 'posterior' p(C_k | x) is computable. That sounds helpful, so let's try
# to get there.
#
# Main idea: each author (class, C_k) has his own personal multinomial
# distribution that characterizes the kinds of bags of words he's likely to
# produce when he sits down at his writing desk and produces documents.
#
# A multinomial distribution says what the probability of producing a given
# sample is, but to construct it you need its special parameter sauce: the
# author's probability of using each word, (p1, ..., pn). To estimate this,
# each word's relative frequency in the training set is used. This simple
# technique has a fancy name: maximum likelihood estimation.
#
# The pmf for that distribution _is_ the p(x | C_k) that appears in Bayes'
# theorem. This likelihood term, along with the prior term p(C_k) (itself
# estimated simply from the relative frequency of C_k within y_train), allows
# the computation of that thicc posterior: p(C_k | x).
#
# Finally, add in a decision rule like 'For a given sample, compute the
# posterior and pick the class whose posterior is greatest', AKA maximum
# a posteriori. With that, a classifier is born.
#
# See
# - https://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes
# - https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes
# - https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Constructing_a_classifier_from_the_probability_model
# - https://en.wikipedia.org/wiki/Multinomial_distribution
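# To make the recipe above concrete, here's the same computation by hand on
# tiny made-up counts (two classes, two words). One caveat: scikit-learn's
# MultinomialNB also applies Laplace smoothing (alpha=1.0 by default), which
# this bare maximum-likelihood sketch omits.
import numpy as np  # repeated so this snippet stands alone
_Xc = np.array([[3, 1],   # two documents by class 0
                [2, 2],
                [1, 4]])  # one document by class 1
_yc = np.array([0, 0, 1])
# Likelihood params (p1, ..., pn) per class: each word's relative frequency
# within that class's pooled word counts.
_theta = np.stack([_Xc[_yc == k].sum(axis=0) / _Xc[_yc == k].sum() for k in (0, 1)])
# Prior p(C_k): relative frequency of each class among the labels.
_prior = np.array([(_yc == k).mean() for k in (0, 1)])
# Log-posterior (up to a constant) for a new sample x = [1, 3];
# the MAP decision rule picks the argmax over classes.
_x = np.array([1, 3])
_log_post = np.log(_prior) + _x @ np.log(_theta).T
assert _log_post.argmax() == 1  # the word-2-heavy sample matches class 1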
clf_nb = naive_bayes.MultinomialNB()
clf_nb.fit(X_train, y_train)
print('(SVM, MultinomialNB)')
print(np.stack((clf_svm.predict(X_test), clf_nb.predict(X_test)), axis=1))
# TODO: Try cross-validation: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
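# One way the TODO could go (a hedged sketch on synthetic stand-in counts; in
# this script you'd pass X_train and y_train instead): cross_val_score refits
# the estimator on k-1 folds and scores it on the held-out fold, k times.
import numpy as np
from sklearn import naive_bayes
from sklearn.model_selection import cross_val_score
_X_demo = np.array([[i, j] for i in range(1, 6) for j in range(1, 6)])  # fake word counts
_y_demo = np.where(_X_demo[:, 0] > _X_demo[:, 1], 'HAMILTON', 'MADISON')
_scores = cross_val_score(naive_bayes.MultinomialNB(), _X_demo, _y_demo, cv=3)
print('CV accuracy per fold:', _scores)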