Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save pierrelouisbescond/ab6451014844ef38e4ef793e31849781 to your computer and use it in GitHub Desktop.
# Standard data-processing libraries
import pandas as pd
pd.options.display.max_columns = 15
import numpy as np
# Scikit-learn: data generation, imputation, and modeling
from sklearn.datasets import make_regression
# NOTE: this import must come before IterativeImputer — it enables the experimental API.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# Plotly library for data visualization
import plotly.graph_objects as go

# Synthetic regression use case: 10k samples, 50 features, 12 of them informative.
n_samples = 10000
n_features = 50
n_informative = 12
X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative)

# Convert the X and y arrays into a DataFrame (columns "X1".."X50", target "Y"),
# which will be useful later.
col_names = ["X{}".format(i + 1) for i in range(n_features)]
df = pd.DataFrame(X, columns=col_names)
df["Y"] = y

# To match realistic figures, shift each column by one random constant drawn
# from [10, 100) (randint's upper bound is exclusive; the same offset is applied
# to every row of a given column).
# NOTE(review): the loop also shifts the target column "Y", not only the
# features — confirm that is intended.
for col in df.columns:
    df[col] = np.random.randint(10, 100) + df[col]

# `display` is a Jupyter/IPython builtin; this script is meant to run in a notebook.
display(df.sample(10))
display(df.describe())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment