# Forked from charlesDADI/digit_version_charles_dadi.py (created October 1, 2015)
# http://scikit-learn.org/stable/auto_examples/applications/face_recognition.html#example-applications-face-recognition-py
from numpy import genfromtxt, savetxt
from sklearn import neighbors, datasets, decomposition
from sklearn.cross_validation import train_test_split
from sklearn.datasets import fetch_lfw_people, fetch_olivetti_faces
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import RandomizedPCA, SparsePCA, MiniBatchSparsePCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC

import timeit
start = timeit.default_timer()  # start of the overall run-time measurement

from time import time
import logging
import pylab as pl
train_path = '/home/cerveau2charles/Dropbox/Ivan-Charles-Dan/digit recognizer/train.csv'
test_path = '/home/cerveau2charles/Dropbox/Ivan-Charles-Dan/digit recognizer/test.csv'
submission_path = '/home/cerveau2charles/Dropbox/Ivan-Charles-Dan/digit recognizer/'

# Create the training & test sets, skipping the header row with [1:]
dataset = genfromtxt(open(train_path, 'r'), delimiter=',', dtype='f8')[1:, :]
target = dataset[:, 0]   # first column is the digit label
train = dataset[:, 1:]   # remaining columns are the pixel values
test = genfromtxt(open(test_path, 'r'), delimiter=',', dtype='f8')[1:]
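# Optional sanity-check sketch: the Kaggle digit-recognizer files should
# flatten each image to 28x28 = 784 pixel columns. This is an assumption
# about the standard competition data, not something verified elsewhere here.
assert train.shape[1] == 784, "each training image should have 784 pixels"
assert test.shape[1] == 784, "test images should have the same pixel layout"
print("train: %d samples, test: %d samples" % (train.shape[0], test.shape[0]))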
n_components = 100

# Dimension-reduction methods (name, estimator, center-data flag)
estimators = [
    ('Eigenfaces - RandomizedPCA',
     decomposition.RandomizedPCA(n_components=n_components, iterated_power=20,
                                 whiten=True),
     True),
    ('Non-negative components - NMF',
     decomposition.NMF(n_components=n_components, init='nndsvda', beta=5.0,
                       tol=5e-3, sparseness='components'),
     False),
    ('Independent components - FastICA',
     decomposition.FastICA(n_components=n_components, whiten=True),
     True),
    ('Sparse comp. - MiniBatchSparsePCA',
     decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8,
                                      n_iter=100, batch_size=3),
     True),
    ('MiniBatchDictionaryLearning',
     decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                               n_iter=50, batch_size=3),
     True),
    ('Cluster centers - MiniBatchKMeans',
     MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20,
                     max_iter=50),
     True),
    ('Factor Analysis components - FA',
     decomposition.FactorAnalysis(n_components=n_components, max_iter=2),
     True),
]
which_pca = 0  # index into the estimators list above
print("Extracting the top %d %s..." % (n_components, estimators[which_pca][0]))
t0 = time()
pca = estimators[which_pca][1].fit(train)
train_time = (time() - t0)
print("done in %0.3fs" % train_time)
t0 = time()
train_pca = pca.transform(train)
test_pca = pca.transform(test)
print("done in %0.3fs" % (time() - t0))
################################################################################
# Grid search with k-fold cross-validation to tune parameters for SVC
# (loose search first, then a fine search around the best region, following
# Hsu, Chang & Lin, 'A Practical Guide to Support Vector Classification')
# loose search -- results: gamma=2**-5, C=2**5
# tuned_parameters = [{'kernel': ['rbf'],
#                      'gamma': [2**-15, 2**-11, 2**-7, 2**-5, 2**-3, 2**-1],
#                      'C': [2**-3, 2**-1, 2, 2**3, 2**5, 2**7, 2**9, 2**11, 2**13, 2**15]}]
# fine search
# tuned_parameters = [{'kernel': ['rbf'],
#                      'gamma': [2**-9, 2**-8, 2**-7, 2**-6, 2**-5, 2**-4],
#                      'C': [2**3, 2**4, 2**5, 2**6, 2**7, 2**8, 2**9, 2**10]}]
# parameters for LinearSVC
# tuned_parameters = [{'loss': ['l2'],
#                      'penalty': ['l1', 'l2'],
#                      'C': [2**-3, 2**-1, 2, 2**3, 2**5, 2**7, 2**9, 2**11, 2**13, 2**15]}]
# parameters for a decision tree
# tuned_parameters = [{'max_depth': [8, 10, 12, 15, 18],
#                      'min_samples_split': [3, 4, 5, 6, 7, 8, 9]}]
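# A sketch of the two-stage (loose, then fine) search described above -- an
# assumption about how one might wire it up, not code from the original;
# loose_parameters stands for the loose grid in the comments:
# loose = GridSearchCV(SVC(kernel='rbf'), loose_parameters, cv=3).fit(train_pca, target)
# g, C = loose.best_params_['gamma'], loose.best_params_['C']
# fine_parameters = [{'kernel': ['rbf'],
#                     'gamma': [g / 4, g / 2, g, g * 2, g * 4],
#                     'C': [C / 4, C / 2, C, C * 2, C * 4]}]
# clf = GridSearchCV(SVC(kernel='rbf'), fine_parameters, cv=3).fit(train_pca, target)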
###############################################################################
# Train a SVM classification model
print("Fitting the classifier to the training set")
t0 = time()
tuned_parameters = [{'kernel': ['linear', 'rbf'],
                     'gamma': [2**-5, 2**-4],
                     'C': [2**3, 2**4, 2**5, 2**6, 2**7, 2**8]}]
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), tuned_parameters)
clf = clf.fit(train_pca, target)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)
savetxt(submission_path + 'PCA' + estimators[which_pca][0] + '_SVM',
        clf.predict(test_pca), delimiter=',', fmt='%f')
print("done in %0.3fs" % (time() - t0))
###############################################################################
# Train a KNN classification model
print("Fitting the classifier to the training set")
t0 = time()
tuned_parameters = [{'n_neighbors': [5, 6]}]
clf = GridSearchCV(neighbors.KNeighborsClassifier(n_neighbors=5),
                   tuned_parameters, cv=3, n_jobs=-1).fit(train_pca, target)
print('Best parameters set found on development set:')
print(clf.best_estimator_)
savetxt(submission_path + 'PCA' + estimators[which_pca][0] + '_KNN',
        clf.predict(test_pca), delimiter=',', fmt='%f')
print("done in %0.3fs" % (time() - t0))
###############################################################################
# Train an AdaBoost multi-class model
print("Fitting the classifier to the training set")
t0 = time()
clf = AdaBoostClassifier(n_estimators=100)
clf = clf.fit(train_pca, target)
print("done in %0.3fs" % (time() - t0))
savetxt(submission_path + 'PCA' + estimators[which_pca][0] + '_AdaBoost',
        clf.predict(test_pca), delimiter=',', fmt='%f')
# Create and train a random forest directly on the raw pixels.
# Multi-core CPUs can use: rf = RandomForestClassifier(n_estimators=100, n_jobs=2)
# rf = RandomForestClassifier(n_estimators=100)
# rf.fit(train, target)
# savetxt(submission_path + 'randomForest', rf.predict(test), delimiter=',', fmt='%f')
# n_neighbors = 5
# clf = neighbors.KNeighborsClassifier(n_neighbors)
# clf.fit(train, target)
# savetxt(submission_path + 'knn', clf.predict(test), delimiter=',', fmt='%f')
###############################################################################
# Compute a randomized PCA (eigenfaces) on the training set (treated as an
# unlabeled dataset): unsupervised feature extraction / dimensionality reduction
n_components = 33
print("Extracting the top %d eigenfaces from %d images"
      % (n_components, train.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(train)
print("done in %0.3fs" % (time() - t0))
print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
train_pca = pca.transform(train)
test_pca = pca.transform(test)
print("done in %0.3fs" % (time() - t0))
###############################################################################
# Train a random forest classification model
print("Fitting the classifier to the training set")
t0 = time()
clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)  # 500 trees, all CPU cores
clf.fit(train_pca, target)
print("done in %0.3fs" % (time() - t0))
savetxt(submission_path + 'randomForest', clf.predict(test_pca), delimiter=',', fmt='%f')
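# Optional sketch: Kaggle's digit-recognizer submissions are usually a
# two-column CSV with an ImageId,Label header, which the raw savetxt calls
# above do not produce; this formatting is an assumption about the expected
# layout, not part of the original script.
# predictions = clf.predict(test_pca).astype(int)
# with open(submission_path + 'submission.csv', 'w') as f:
#     f.write('ImageId,Label\n')
#     for i, label in enumerate(predictions, start=1):
#         f.write('%d,%d\n' % (i, label))

# Close out the run-time measurement started at the top of the script.
stop = timeit.default_timer()
print("total run time: %0.3fs" % (stop - start))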