Created
November 24, 2019 01:37
-
-
Save alik604/d9dd9e774c12da2fd4bf51a0ff3edbd5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the NSL-KDD train/test splits and label-encode the categorical columns.
# NOTE(review): `labels` (the column-name list) is defined elsewhere in the file.
# BUG FIX: hosts were scrape-garbled ("raw.githubusercontent.com") -> raw.githubusercontent.com.
train = pd.read_csv('https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.csv')
test = pd.read_csv('https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest%2B.csv')
train.columns, test.columns = labels, labels

# Encode on the combined frame so every category is seen exactly once;
# 'difficulty_level' is per-record metadata, not a feature.
# (positional axis was removed in pandas 2.0 -> use axis=1 keyword)
combined_data = pd.concat([train, test]).drop('difficulty_level', axis=1)

le = LabelEncoder()
vector = combined_data['attack_type']
print("Attack Vectors:", set(list(vector)))  # print() keeps it on a single line

# Encode the target and the three categorical feature columns to integers.
# Reusing one LabelEncoder is safe here: fit_transform refits per column.
combined_data['attack_type'] = le.fit_transform(vector)
combined_data['protocol_type'] = le.fit_transform(combined_data['protocol_type'])
combined_data['service'] = le.fit_transform(combined_data['service'])
combined_data['flag'] = le.fit_transform(combined_data['flag'])
# Train/test split, then row-wise L2 normalization.
data_x = combined_data.drop('attack_type', axis=1)
data_y = combined_data.loc[:, ['attack_type']]
# del combined_data  # free mem

# sklearn's normalize() is per-row (L2), so normalizing after the split leaks
# nothing across it; doing it post-split just keeps the pipeline explicit.
X_train, X_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=.5, random_state=42)  # TODO: revisit test_size

# BUG FIX: wrapping normalize() output in a bare DataFrame dropped the column
# names (RangeIndex), which later turns the RFE feature whitelist into
# meaningless integers. Carry the original column names through.
X_train = pd.DataFrame(normalize(X_train), columns=data_x.columns)
X_test = pd.DataFrame(normalize(X_test), columns=data_x.columns)
# Benchmark three tree learners plus a hard-voting ensemble on all 41 features.
DTC = DecisionTreeClassifier()
RFC = RandomForestClassifier(n_estimators=25, random_state=1)
# max_features='auto' was removed in sklearn 1.3; for classifiers it meant 'sqrt'.
ETC = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_features='sqrt', bootstrap=False)

x = X_train
# Series.ravel() is deprecated; go through a numpy array for the 1-D target.
y = y_train['attack_type'].to_numpy().ravel()

# Predict without any feature selection.
# BUG FIX: the original re-assigned RFC *after* building eclf, so the loop fit a
# different RandomForest object than the one the ensemble cloned. Build eclf once.
eclf = VotingClassifier(estimators=[('lr', DTC), ('rf', RFC), ('et', ETC)], voting='hard')
for clf, label in zip([DTC, RFC, ETC, eclf],
                      ['DecisionTreeClassifier', 'RandomForestClassifier',
                       'ExtraTreesClassifier', 'Ensemble']):
    _ = clf.fit(x, y)
    pred = clf.score(X_test, y_test)
    print("Acc: %0.10f [%s]" % (pred, label))
'''
wow! 99% already....
41 dimensions
Acc: 0.9907079467 [DecisionTreeClassifier]
Acc: 0.9934955627 [RandomForestClassifier]
Acc: 0.9922431555 [ExtraTreesClassifier]
Acc: 0.9935628964 [Ensemble]
'''
# Recursive feature elimination, then truncated SVD on the surviving columns.
# BUG FIX: `n` was never defined (NameError). n=30 makes n_components = n-20 = 10,
# matching the "10 dimensions" result recorded below in this file.
n = 30
# Keyword arg: positional n_features_to_select was removed in modern sklearn.
rfe = RFE(DTC, n_features_to_select=n).fit(x, y)
desiredIndices = np.where(rfe.support_)[0]  # boolean mask -> column positions
whitelist = X_train.columns.values[desiredIndices]

svd = TruncatedSVD(n_components=n - 20)
_ = svd.fit(X_train[whitelist])  # fit on train only (or fit_transform and skip the first transform below)
X_train_svd, X_test_svd = svd.transform(X_train[whitelist]), svd.transform(X_test[whitelist])
# Benchmark again after feature reduction, with fresh (unfitted) estimators.
DTC = DecisionTreeClassifier()
RFC = RandomForestClassifier(n_estimators=25, random_state=1)
# max_features='auto' was removed in sklearn 1.3; for classifiers it meant 'sqrt'.
ETC = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_features='sqrt', bootstrap=False)
eclf = VotingClassifier(estimators=[('lr', DTC), ('rf', RFC), ('et', ETC)], voting='hard')

for clf, label in zip([DTC, RFC, ETC, eclf],
                      ['DecisionTreeClassifier', 'RandomForestClassifier',
                       'ExtraTreesClassifier', 'Ensemble']):
    # BUG FIX: the original fit/scored `eclf` on every iteration, so all four
    # printed rows were the (re-fit) ensemble under the wrong labels.
    # Fit the current `clf`, and pass a 1-D target instead of a DataFrame.
    _ = clf.fit(X_train_svd, y_train['attack_type'].to_numpy().ravel())
    pred = clf.score(X_test_svd, y_test)
    print("Acc: %0.10f [%s]" % (pred, label))
'''
10 dimensions
Acc: 0.9857387182 [DecisionTreeClassifier]
Acc: 0.9863043215 [RandomForestClassifier]
Acc: 0.9860753868 [ExtraTreesClassifier]
Acc: 0.9861157871 [Ensemble] # 0.74% less accuracy with 25% of the dimensions.
75% less data gets us <1% less accuracy
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment