occupy.md

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

# Read the training and test splits into DataFrames.
train_path, test_path = "data/datatraining.txt", "data/datatest.txt"
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)

# Display the column names of the training frame.
data_train.columns

Index([u'date', u'Temperature', u'Humidity', u'Light', u'CO2',
       u'HumidityRatio', u'Occupancy'],
      dtype='object')

# Display summary statistics (count, mean, std, quartiles, ...) per numeric column.
data_train.describe()

	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
count	8143.000000	8143.000000	8143.000000	8143.000000	8143.000000	8143.000000
mean	20.619084	25.731507	119.519375	606.546243	0.003863	0.212330
std	1.016916	5.531211	194.755805	314.320877	0.000852	0.408982
min	19.000000	16.745000	0.000000	412.750000	0.002674	0.000000
25%	19.700000	20.200000	0.000000	439.000000	0.003078	0.000000
50%	20.390000	26.222500	0.000000	453.500000	0.003801	0.000000
75%	21.390000	30.533333	256.375000	638.833333	0.004352	0.000000
max	23.180000	39.117500	1546.333333	2028.500000	0.006476	1.000000

# Display the first five rows of the training frame.
data_train.head(5)

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
1	2015-02-04 17:51:00	23.18	27.2720	426.0	721.25	0.004793	1
2	2015-02-04 17:51:59	23.15	27.2675	429.5	714.00	0.004783	1
3	2015-02-04 17:53:00	23.15	27.2450	426.0	713.50	0.004779	1
4	2015-02-04 17:54:00	23.15	27.2000	426.0	708.25	0.004772	1
5	2015-02-04 17:55:00	23.10	27.2000	426.0	704.50	0.004757	1

# Prepare the training split: discard the timestamp, separate the target
# column from the features, and convert both to plain NumPy arrays.
data_train.drop(labels=['date'], axis=1, inplace=True)
# pop() removes 'Occupancy' from the frame and returns it in one step.
out_train = data_train.pop('Occupancy').values
data_train = data_train.values

# %-formatting inside print(...) emits the same text on Python 2 and 3,
# unlike the bare `print a, b` statement, which is a syntax error on Python 3.
print('Giriş :  %s' % (data_train.shape,))
print('Çıkış : %s' % (out_train.shape,))

Giriş :  (8143L, 5L)
Çıkış : (8143L,)

# Prepare the test split: discard the timestamp, pull the target column out
# of the frame, remember the remaining feature names, then switch to arrays.
del data_test['date']
out_test = data_test.pop('Occupancy').values
# Captured before the frame is replaced by its array, which carries no names.
inLabels = list(data_test.columns)
data_test = data_test.values

# Display the feature names taken from the test frame's columns.
inLabels

['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']

from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics

# Fit a small, shallow random forest on the training arrays.
# random_state pins the forest's internal randomness so that re-running the
# notebook yields the same model; the exact counts below may therefore differ
# slightly from results produced by an earlier unseeded run.
classifier = RandomForestClassifier(
    n_estimators=25, max_depth=3, random_state=0
).fit(data_train, out_train)

# Classify the test split and display the confusion matrix
# (rows: true class, columns: predicted class).
predictions = classifier.predict(data_test)
sklearn.metrics.confusion_matrix(out_test, predictions)

array([[1640,   53],
       [   5,  967]])

# Display the fraction of test samples whose prediction matches out_test.
sklearn.metrics.accuracy_score(out_test,predictions)

0.97823639774859283

# Print the per-feature importance scores of the fitted classifier.
print(classifier.feature_importances_)

[ 0.06791678  0.01400845  0.61079644  0.28089006  0.02638828]

# Bar chart of the classifier's feature importances, one bar per feature name.
importances = classifier.feature_importances_
x_pos = [position for position, _ in enumerate(inLabels)]
max_y = max(importances)

plt.bar(x_pos, importances, align='center')
plt.xticks(x_pos, inLabels)
# Leave 10% headroom above the tallest bar.
plt.ylim([0, max_y * 1.1])
plt.ylabel('Importance')
plt.title('Importance of features')
plt.grid()
plt.show()

from sklearn.linear_model import LassoLarsCV
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(data_train)
data_train, data_test = scaler.transform(data_train), scaler.transform(data_test)

# Fit a cross-validated Lasso (LARS) regression on the scaled training data.
# This is a regression model, so its output is a continuous score that has to
# be thresholded to obtain a 0/1 occupancy label.
regModel = LassoLarsCV(cv=10).fit(data_train, out_train)

thresh = 0.5
# Single-pass binarisation: scores >= thresh become 1.0, everything else 0.0.
# Unlike two successive in-place masks, this does not depend on statement
# order and stays correct for any threshold value.
rPredictions = np.where(regModel.predict(data_test) >= thresh, 1.0, 0.0)

# Display the confusion matrix (rows: true class, columns: predicted class).
sklearn.metrics.confusion_matrix(out_test, rPredictions)

array([[1637,   56],
       [   2,  970]])

# Display the fraction of test samples whose thresholded prediction matches out_test.
sklearn.metrics.accuracy_score(out_test,rPredictions)

0.97823639774859283

# Bar chart of the fitted regression coefficients, one bar per feature name.
coefs = regModel.coef_
x_pos = list(range(len(inLabels)))
plt.bar(x_pos, coefs, align='center')
plt.grid()
# Regression coefficients can be negative, so the axis must be able to extend
# below zero; both limits always include 0, where the bars start.
max_y = max(0, max(coefs))
min_y = min(0, min(coefs))
# Skip the explicit limits when every coefficient is zero, since identical
# lower and upper limits are degenerate; matplotlib's defaults apply instead.
if max_y > min_y:
    plt.ylim([min_y * 1.1, max_y * 1.1])
plt.ylabel('Importance')
plt.xticks(x_pos, inLabels)
plt.title('Importance of features')
plt.show()
print(coefs)

[ 0.          0.          0.30397557  0.06647666  0.        ]

# Sweep the decision threshold applied to the regression scores and report
# the threshold with the best mean of training and test accuracy.
#
# NOTE(review): the selection criterion includes the test accuracy, so the
# test split influences the chosen threshold and the reported test accuracy
# is optimistic. Selecting on training data or a held-out validation split
# would avoid this; behaviour is kept as-is here.
rPredictions = regModel.predict(data_test)
ptrain = regModel.predict(data_train)

# Thresholds 0.999, 0.998, ..., 0.001 (descending).
xpos = [i / 1000.0 for i in range(999, 0, -1)]

# Binarise with one comparison per threshold instead of copying the score
# arrays and masking them in place on every iteration.
accTest = [
    sklearn.metrics.accuracy_score(out_test, (rPredictions >= t).astype(int))
    for t in xpos
]
accTrain = [
    sklearn.metrics.accuracy_score(out_train, (ptrain >= t).astype(int))
    for t in xpos
]
meanAcc = [(a_test + a_train) / 2 for a_test, a_train in zip(accTest, accTrain)]

# argmax returns the first maximum, i.e. the highest threshold among ties,
# and removes the need to look a float up in the threshold list afterwards.
i_max = int(np.argmax(meanAcc))
maxAc = meanAcc[i_max]
maxTh = xpos[i_max]

plt.plot(xpos, accTest, 'r', xpos, accTrain, 'b')
plt.legend(('Test', 'Train'))
plt.grid()
plt.show()

# %-formatting inside print(...) emits the same text on Python 2 and 3.
print("Maksimum Accuracy Mean :  %s  with threshold value : %s" % (maxAc, maxTh))
print("Maksimum Accuracy for Training Data :  %s ... for Testing : %s"
      % (accTrain[i_max], accTest[i_max]))

Maksimum Accuracy Mean :  0.983284967878  with threshold value : 0.602
Maksimum Accuracy for Training Data :  0.988333538008 ... for Testing : 0.978236397749

birolkuyumcu/occupy.md

birolkuyumcu commented Jul 8, 2016