import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
# Read the training and test tables (comma-separated text files) into
# DataFrames; the training file is read first, then the test file.
data_train, data_test = map(
    pd.read_csv, ("data/datatraining.txt", "data/datatest.txt")
)
Index([u'date', u'Temperature', u'Humidity', u'Light', u'CO2',
u'HumidityRatio', u'Occupancy'],
dtype='object')
|
Temperature |
Humidity |
Light |
CO2 |
HumidityRatio |
Occupancy |
count |
8143.000000 |
8143.000000 |
8143.000000 |
8143.000000 |
8143.000000 |
8143.000000 |
mean |
20.619084 |
25.731507 |
119.519375 |
606.546243 |
0.003863 |
0.212330 |
std |
1.016916 |
5.531211 |
194.755805 |
314.320877 |
0.000852 |
0.408982 |
min |
19.000000 |
16.745000 |
0.000000 |
412.750000 |
0.002674 |
0.000000 |
25% |
19.700000 |
20.200000 |
0.000000 |
439.000000 |
0.003078 |
0.000000 |
50% |
20.390000 |
26.222500 |
0.000000 |
453.500000 |
0.003801 |
0.000000 |
75% |
21.390000 |
30.533333 |
256.375000 |
638.833333 |
0.004352 |
0.000000 |
max |
23.180000 |
39.117500 |
1546.333333 |
2028.500000 |
0.006476 |
1.000000 |
|
date |
Temperature |
Humidity |
Light |
CO2 |
HumidityRatio |
Occupancy |
1 |
2015-02-04 17:51:00 |
23.18 |
27.2720 |
426.0 |
721.25 |
0.004793 |
1 |
2 |
2015-02-04 17:51:59 |
23.15 |
27.2675 |
429.5 |
714.00 |
0.004783 |
1 |
3 |
2015-02-04 17:53:00 |
23.15 |
27.2450 |
426.0 |
713.50 |
0.004779 |
1 |
4 |
2015-02-04 17:54:00 |
23.15 |
27.2000 |
426.0 |
708.25 |
0.004772 |
1 |
5 |
2015-02-04 17:55:00 |
23.10 |
27.2000 |
426.0 |
704.50 |
0.004757 |
1 |
# Split the training table into a label vector and a feature matrix.
# 'Occupancy' is the target; the 'date' timestamp column is discarded so that
# only the numeric sensor columns remain as features.
out_train = data_train['Occupancy'].values
data_train = data_train.drop(labels=['date', 'Occupancy'], axis=1).values

# Report the shapes of the input (feature) and output (label) arrays.
# A single pre-formatted argument makes print(...) emit the same text under
# both the Python 2 print statement and the Python 3 print function; the
# spacing reproduces the separator the original comma-separated print inserted.
print('Giriş :  %s' % (data_train.shape,))
print('Çıkış : %s' % (out_train.shape,))
Giriş : (8143L, 5L)
Çıkış : (8143L,)
# Split the test table the same way as the training table: 'Occupancy'
# becomes the label vector, 'date' is discarded, the rest are features.
out_test = data_test['Occupancy'].values
data_test = data_test.drop(labels=['date', 'Occupancy'], axis=1)
# Remember the feature column names before converting to a bare array;
# the plots further down use them as x tick labels.
inLabels = data_test.columns.tolist()
data_test = data_test.values
['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
# Fit a small, shallow random forest (25 trees, depth <= 3) on the training
# features and predict occupancy for the test features.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature sub-sampling) so the confusion matrix, accuracy and
# feature importances below are reproducible from run to run.
classifier = RandomForestClassifier(n_estimators=25, max_depth=3,
                                    random_state=0)
classifier = classifier.fit(data_train, out_train)
predictions = classifier.predict(data_test)
# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes. Left as the final expression so a notebook displays it.
sklearn.metrics.confusion_matrix(out_test, predictions)
array([[1640, 53],
[ 5, 967]])
# Fraction of test samples whose predicted class matches the true label.
sklearn.metrics.accuracy_score(y_true=out_test, y_pred=predictions)
0.97823639774859283
# Show the fitted forest's feature importance scores (one value per input
# feature; presumably in the same column order as inLabels).
print(classifier.feature_importances_)
[ 0.06791678 0.01400845 0.61079644 0.28089006 0.02638828]
# Bar chart of the random forest feature importances, one bar per feature,
# labelled with the feature column names.
x_pos = list(range(len(inLabels)))
max_y = max(classifier.feature_importances_)

plt.bar(x_pos, classifier.feature_importances_, align='center')
plt.xticks(x_pos, inLabels)
# Leave 10% headroom above the tallest bar.
plt.ylim([0, 1.1 * max_y])
plt.grid()
plt.ylabel('Importance')
plt.title('Importance of features')
plt.show()

from sklearn.linear_model import LassoLarsCV
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler is fitted on the training split only
# and the same transform is then applied to both splits.
scaler = StandardScaler().fit(data_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)

# Fit a Lasso (LARS) regression with 10-fold cross-validation against the
# occupancy labels; its predictions are continuous scores, not classes.
regModel = LassoLarsCV(cv=10).fit(data_train, out_train)
rPredictions = regModel.predict(data_test)

# Turn the continuous scores into hard 0/1 decisions at a fixed cut-off
# (scores equal to the cut-off count as 1).
thresh = 0.5
rPredictions = np.where(rPredictions >= thresh, 1.0, 0.0)

# Confusion matrix on the test split; left as the final expression so a
# notebook displays it.
sklearn.metrics.confusion_matrix(out_test, rPredictions)
array([[1637, 56],
[ 2, 970]])
# Fraction of test samples whose thresholded Lasso prediction matches the
# true label.
sklearn.metrics.accuracy_score(y_true=out_test, y_pred=rPredictions)
0.97823639774859283
# Bar chart of the fitted Lasso coefficients, one bar per feature, followed
# by the raw coefficient values.
x_pos = list(range(len(inLabels)))
plt.bar(x_pos, regModel.coef_, align='center')
plt.grid()
max_y = max(regModel.coef_)
min_y = min(regModel.coef_)
# Regression coefficients are signed, so the axis must not assume a floor of
# zero: extend it below zero when any coefficient is negative and keep zero
# in view when all of them are. With only non-negative coefficients this is
# the same [0, max_y*1.1] range as before. 10% padding on each side.
plt.ylim([min(min_y, 0) * 1.1, max(max_y, 0) * 1.1])
plt.ylabel('Importance')
plt.xticks(x_pos, inLabels)
plt.title('Importance of features')
plt.show()
print(regModel.coef_)

[ 0. 0. 0.30397557 0.06647666 0. ]
# Sweep the decision threshold applied to the Lasso scores from 0.999 down to
# 0.001, recording train and test accuracy at every step, then plot both
# curves and report the threshold with the best mean accuracy.
rPredictions = regModel.predict(data_test)
ptrain = regModel.predict(data_train)

xpos = []       # thresholds tried, in sweep order
accTest = []    # test accuracy at each threshold
accTrain = []   # training accuracy at each threshold
maxAc = 0.0     # best mean (train + test) accuracy seen so far
maxTh = 0       # threshold at which maxAc was reached

for i in range(999, 0, -1):
    thresh = i / 1000.0
    # Binarise the continuous scores at this threshold (ties count as 1).
    tp1 = np.where(rPredictions >= thresh, 1.0, 0.0)
    tp2 = np.where(ptrain >= thresh, 1.0, 0.0)
    a1 = sklearn.metrics.accuracy_score(out_test, tp1)
    a2 = sklearn.metrics.accuracy_score(out_train, tp2)
    xpos.append(thresh)
    accTest.append(a1)
    accTrain.append(a2)
    # NOTE(review): the selection criterion averages in the *test* accuracy,
    # so the reported test score at the chosen threshold is optimistic.
    a = (a1 + a2) / 2
    # Strict comparison: on ties the first (highest) threshold is kept.
    if a > maxAc:
        maxAc = a
        maxTh = thresh

plt.plot(xpos, accTest, 'r', xpos, accTrain, 'b')
plt.legend(('Test', 'Train'))
plt.grid()
plt.show()

i_max = xpos.index(maxTh)
# Single pre-formatted argument so print(...) emits the same text under both
# the Python 2 print statement and the Python 3 print function; the spacing
# reproduces the separators the original comma-separated prints inserted.
print("Maksimum Accuracy Mean :  %s  with threshold value : %s"
      % (maxAc, maxTh))
print("Maksimum Accuracy for Training Data :  %s ... for Testing : %s"
      % (accTrain[i_max], accTest[i_max]))

Maksimum Accuracy Mean : 0.983284967878 with threshold value : 0.602
Maksimum Accuracy for Training Data : 0.988333538008 ... for Testing : 0.978236397749