Created
May 28, 2020 15:13
-
-
Save pierrelouisbescond/ab6451014844ef38e4ef793e31849781 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Let's import standard data processing libraries
import pandas as pd | |
pd.options.display.max_columns = 15 | |
import numpy as np | |
# Sklearn libraries for Data Generation, Imputation and Modeling
from sklearn.datasets import make_regression | |
from sklearn.experimental import enable_iterative_imputer | |
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer | |
from sklearn.model_selection import train_test_split | |
from sklearn.linear_model import Lasso | |
from sklearn.neighbors import KNeighborsRegressor | |
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor | |
# Plotly library for data visualization
import plotly.graph_objects as go | |
# Regression use-case generation: build a synthetic regression dataset with
# n_samples rows and n_features columns, n_informative of which are requested
# as informative features from make_regression.
n_samples = 10000
n_features = 50
n_informative = 12
X, y = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
)

# We convert the X and y arrays into a DataFrame which will be useful later.
# Feature columns are named X1..X{n_features}; the target is stored as "Y".
col_names = ["X{}".format(i + 1) for i in range(n_features)]
df = pd.DataFrame(X, columns=col_names)
df["Y"] = y

# To match realistic figures, shift every column by its own random integer
# offset. np.random.randint excludes the upper bound, so each offset lies in
# [10, 99]. df.columns includes "Y", so the target is shifted as well as the
# features.
for col in df.columns:
    df[col] = np.random.randint(10, 100) + df[col]

# NOTE(review): display() is not imported in this file; presumably the script
# is run in an IPython/Jupyter session where it is provided as a builtin.
display(df.sample(10))
display(df.describe())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment