Last active
July 1, 2020 00:34
-
-
Save alik604/cd32de2f63884870314fca58cdfc057e to your computer and use it in GitHub Desktop.
Boilerplate for a Jupyter notebook on data science or machine learning
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%capture | |
!pip install scikit-plot | |
!pip install catboost | |
!pip install mlxtend | |
!pip install yfinance | |
!pip install pyod | |
import pyod | |
import yfinance | |
import xgboost # xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier | |
import lightgbm # lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html | |
import catboost # catboost.ai/docs/concepts/python-quickstart.html | |
import numpy as np | |
import pandas as pd | |
import sklearn | |
from sklearn.model_selection import train_test_split | |
from sklearn.decomposition import * | |
from sklearn.preprocessing import * | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier | |
from sklearn.svm import * | |
from sklearn.kernel_approximation import Nystroem | |
from mlxtend.classifier import EnsembleVoteClassifier | |
import copy | |
import matplotlib.pyplot as plt | |
import scikitplot as skplt | |
import tensorflow as tf | |
# Keras exposes Sequential from `tensorflow.keras.models` and the layer classes
# from `tensorflow.keras.layers`; the former `keras.sequential` / `keras.layer`
# module paths do not exist and fail at import time.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
# Notebook display setting: with "all", IPython echoes the value of every
# top-level expression statement in a cell rather than only the last one.
from IPython.core.interactiveshell import InteractiveShell | |
InteractiveShell.ast_node_interactivity = "all" | |
# Download the UNSW-NB15 training and testing CSVs directly from GitHub.
train, test = (
    pd.read_csv(url)
    for url in (
        'https://raw.githubusercontent.com/Nir-J/ML-Projects/master/UNSW-Network_Packet_Classification/UNSW_NB15_training-set.csv',
        'https://raw.githubusercontent.com/Nir-J/ML-Projects/master/UNSW-Network_Packet_Classification/UNSW_NB15_testing-set.csv',
    )
)

# Stack both files into one frame (it is re-split further down) and discard
# the 'id' and 'label' columns.
combined_data = pd.concat([train, test]).drop(columns=['id', 'label'])
# Full notebook: https://github.com/alik604/The-Best-Ensemble/blob/master/The_Best_Ensemble_UNSW_Network_Packet_Classification_colab.ipynb

# Label-encode the columns treated as categorical so that the whole frame is
# numeric. StandardScaler and PCA further down cannot be fitted on raw string
# values, so this step has to run before the features are split and reduced.
# NOTE(review): the column list is taken from the original (disabled) snippet;
# confirm it covers every non-numeric column of the dataset.
categorical = ['attack_cat', 'proto', 'service', 'state']
for cat in categorical:
    # A fresh encoder per column: each column gets its own integer mapping.
    encoder = LabelEncoder()
    combined_data[cat] = encoder.fit_transform(combined_data[cat])
# 'attack_cat' is the prediction target; every remaining column is a feature.
# drop() returns a new frame here, so combined_data itself is left untouched.
X = combined_data.drop(columns=['attack_cat'])
y = combined_data['attack_cat']

# Hold out 95% of the rows for testing (only 5% is used for fitting); the
# fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.95,
    random_state=42,
)
# Reduce the feature space with PCA, keeping just enough components to explain
# `wanted_explained_variance_ratio` of the variance of the standardized data.
wanted_explained_variance_ratio = 0.99
steps_down = 2  # stride used only for the progress print-out below

# Standardize BEFORE choosing the number of components: the final PCA is fitted
# on the scaled data, so the variance ratios must be measured on that same data
# (on unscaled data the ratios are dominated by the largest-valued columns).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a single full PCA and accumulate the per-component ratios, instead of
# refitting one PCA per candidate component count.
cumulative_var_ratio = np.cumsum(PCA(svd_solver='full').fit(X_train).explained_variance_ratio_)
max_components = len(cumulative_var_ratio)

for i in range(max_components - 1, 1, -steps_down):
    print('i =', i, 'with a variance ratio of', round(cumulative_var_ratio[i - 1], 5))

# Smallest component count whose cumulative ratio reaches the target. The
# result is clamped so it can never exceed the number of available components
# (e.g. when rounding keeps the final cumulative value just under the target).
wanted_n_components = int(np.searchsorted(cumulative_var_ratio, wanted_explained_variance_ratio)) + 1
wanted_n_components = min(wanted_n_components, max_components)
print("We should set n_components to: ", wanted_n_components)

# Fit the reduced PCA on the training split only, then project both splits.
pca = PCA(n_components=wanted_n_components)
_ = pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment