@alik604
Last active July 1, 2020 00:34
Boilerplate for a Jupyter notebook on data science or machine learning
%%capture
# %%capture hides the noisy pip output from the installs below
!pip install scikit-plot
!pip install catboost
!pip install mlxtend
!pip install yfinance
!pip install pyod
import pyod
import yfinance
import xgboost # xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier
import lightgbm # lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
import catboost # catboost.ai/docs/concepts/python-quickstart.html
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.decomposition import *
from sklearn.preprocessing import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import *
from sklearn.kernel_approximation import Nystroem
from mlxtend.classifier import EnsembleVoteClassifier
import copy
import matplotlib.pyplot as plt
import scikitplot as skplt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # echo every expression in a cell, not just the last one
train = pd.read_csv('https://raw.githubusercontent.com/Nir-J/ML-Projects/master/UNSW-Network_Packet_Classification/UNSW_NB15_training-set.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Nir-J/ML-Projects/master/UNSW-Network_Packet_Classification/UNSW_NB15_testing-set.csv')
combined_data = pd.concat([train, test]).drop(['id', 'label'], axis=1)  # merge the provided split so preprocessing sees every category; re-split below
# full notebook: https://github.com/alik604/The-Best-Ensemble/blob/master/The_Best_Ensemble_UNSW_Network_Packet_Classification_colab.ipynb
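## optional sanity check (illustrative, not required by the pipeline)
print(combined_data.shape)                         # rows = len(train) + len(test)
print(combined_data['attack_cat'].value_counts())  # class balance of the target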
## label-encode the string-valued columns ('proto', 'service', 'state') and the
## target 'attack_cat'; StandardScaler/PCA below would fail on raw strings
categorical = ['attack_cat', 'proto', 'service', 'state']
for cat in categorical:
    encoder = LabelEncoder()
    combined_data[cat] = encoder.fit_transform(combined_data[cat])
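## alternative sketch: one-hot encode instead of label encoding, so PCA does not
## treat the arbitrary integer codes as ordinal distances (replaces the loop above)
# combined_data = pd.get_dummies(combined_data, columns=['proto', 'service', 'state'])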
y = combined_data['attack_cat']
X = combined_data.drop(['attack_cat'], axis=1)  # drop returns a copy; pass inplace=True to mutate the original DataFrame instead
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95, random_state=42)  # note: only 5% of the rows go to training
# search downward for the smallest n_components that still explains >= 99% of the variance
wanted_explained_variance_ratio = 0.99
steps_down = 2
wanted_n_components = X_train.shape[1]
first_time = True
for i in range(X_train.shape[1] - 1, 1, -steps_down):
    total_var_ratio = round(np.sum(PCA(n_components=i).fit(X_train).explained_variance_ratio_), 5)
    print('i =', i, 'with a variance ratio of', total_var_ratio)
    if total_var_ratio < wanted_explained_variance_ratio and first_time:
        # first i that falls below the target; back up one step to the last value that met it
        wanted_n_components = i + steps_down
        first_time = False
print("We should set n_components to:", wanted_n_components)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit the scaler on training data only, then reuse it for test data
X_test = scaler.transform(X_test)
pca = PCA(n_components=wanted_n_components)  # lands around 26-28 components for 99% of the variance explained
_ = pca.fit(X_train)  # assign to _ so the "all" display mode doesn't echo the estimator
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
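## minimal end-to-end check of the preprocessed features, using estimators already
## imported above (illustrative; the hyperparameters here are arbitrary)
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
print('held-out accuracy:', clf.score(X_test, y_test))
skplt.metrics.plot_confusion_matrix(y_test, clf.predict(X_test), normalize=True)
plt.show()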