This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import plotly.express as px | |
from xgboost import XGBRegressor | |
from sklearn.preprocessing import MinMaxScaler | |
from sklearn.metrics import mean_squared_error | |
from sklearn.linear_model import LinearRegression | |
from sklearn.model_selection import train_test_split | |
import tsfresh | |
from tsfresh import select_features |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For reproducibility: seed the stdlib and NumPy random generators.
# The previous call, pl.seed_everything(42), used a `pl` module that the
# import cell of this file never brings into scope (NameError at run time).
import random

random.seed(42)
np.random.seed(42)

# Both files are read as space-delimited text without a header row.
train_df = pd.read_csv('/content/drive/MyDrive/Datasets/NASA_CMAPSS/train_FD001.txt', delimiter=' ', header=None)
test_df = pd.read_csv('/content/drive/MyDrive/Datasets/NASA_CMAPSS/test_FD001.txt', delimiter=' ', header=None)

# The first two columns are named explicitly; the remaining 26 get generic,
# 1-based sensor_<i> names.
SENSOR_COLUMN_NAMES = [f'sensor_{i}' for i in range(1, 27)]
df_columns = ['unit_number', 'time', *SENSOR_COLUMN_NAMES]
train_df.columns = df_columns
test_df.columns = df_columns

# Summary statistics of the training frame (only displayed when this is the
# last expression of an interactive cell).
train_df.describe()
We can make this file beautiful and searchable if this error is corrected: It looks like row 4 should actually have 20 columns, instead of 18 in line 3.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
,unit_number,time,sensor_1,sensor_2,sensor_5,sensor_6,sensor_7,sensor_10,sensor_11,sensor_12,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_20,sensor_23,sensor_24,RUL | |
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0 | |
mean,51.506567786340945,108.80786195530997,0.4994902214444654,0.5019590260611053,0.4430522730736785,0.42474642703118026,0.45043520742321874,0.5664591322518506,0.29795703141632884,0.1952478718490331,0.41140961013362015,0.5806972325890045,0.3178711647518692,0.2260951703849851,0.4511180526792186,0.4342211558657721,0.5242408222141806,0.5461272588576589,107.80786195530997 | |
std,29.227632908799837,68.88099017721818,0.12570766948363188,0.24421843713843072,0.15061845483753925,0.13366360409179906,0.15193458441160862,0.14252693360119403,0.1075537558953646,0.09908857365640435,0.1589805944283841,0.1572608512174252,0.10576311132144779,0.09844243975618262,0.14430564814146393,0.12906358538452264,0 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove the last two generic sensor columns from the training frame and keep
# SENSOR_COLUMN_NAMES in step with it.
# NOTE(review): presumably these two columns are empty artifacts of trailing
# delimiters in the raw file — confirm against the data.
train_df.drop(['sensor_25', 'sensor_26'], axis=1, inplace=True)
for _dropped_name in ('sensor_25', 'sensor_26'):
    SENSOR_COLUMN_NAMES.remove(_dropped_name)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One MinMaxScaler per sensor column, fitted on the training data only and
# stored by column name so the same scaling can be applied elsewhere.
MIN_MAX_SCALERS = {}
for col_name in SENSOR_COLUMN_NAMES:
    # The scaler expects a 2-D input, hence the single-column reshape.
    column_2d = train_df[col_name].values.reshape(-1, 1)
    scaler = MinMaxScaler()
    scaler.fit(column_2d)
    # squeeze() turns the (n, 1) result back into a 1-D column.
    train_df[col_name] = scaler.transform(column_2d).squeeze()
    MIN_MAX_SCALERS[col_name] = scaler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop a fixed set of sensor columns; errors='ignore' tolerates columns that
# are already absent.
train_df.drop(columns=[f'sensor_{i}' for i in [3, 4, 8, 9, 13, 19, 21, 22]], inplace=True, errors='ignore')

# RUL of a row = largest `time` recorded for its unit minus the row's own `time`.
# transform('max') broadcasts each unit's maximum back onto that unit's rows,
# so the column is assigned in place. This replaces the earlier
# apply/concat/reset_index/merge round trip, which relied on the
# auto-generated 'level_1' index name and would duplicate rows if a
# (unit_number, time) pair ever repeated.
train_df['RUL'] = train_df.groupby('unit_number')['time'].transform('max') - train_df['time']

# Kept so that code still reading the standalone RUL frame keeps working.
RUL = train_df[['unit_number', 'RUL', 'time']].copy()

# Features: every column whose name contains 'sensor_'.
X_train = train_df[[x for x in train_df.columns if 'sensor_' in x]].values
# Target: RUL capped at 125.
y_train = train_df['RUL'].values.clip(max=125)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Baseline model: an XGBRegressor with default hyper-parameters, fitted on the
# per-row sensor features against the capped RUL target.
xgbr = XGBRegressor()
xgbr.fit(X=X_train, y=y_train)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop the listed sensor columns from the test frame; errors='ignore'
# tolerates columns that are already absent.
test_df.drop(columns=[f'sensor_{i}' for i in [3, 4, 8, 9, 13, 19, 21, 22, 25, 26]], inplace=True, errors='ignore')

# Scale each remaining sensor column with the scaler stored for it in
# MIN_MAX_SCALERS (transform only, no refit).
test_sensor_columns = [name for name in test_df.columns if 'sensor_' in name]
for col_name in test_sensor_columns:
    raw_column = test_df[col_name].values.reshape(-1, 1)
    test_df[col_name] = MIN_MAX_SCALERS[col_name].transform(raw_column).squeeze()

# One feature row per unit: the row holding that unit's largest `time`.
last_row_per_unit = test_df.groupby('unit_number').apply(
    lambda unit_df: unit_df.iloc[unit_df['time'].argmax()]
)
X_test = last_row_per_unit[test_sensor_columns].values

# Target values read from a separate header-less file, capped at 125 like the
# training target.
y_test = pd.read_csv('/content/drive/MyDrive/Datasets/NASA_CMAPSS/RUL_FD001.txt', header=None).values.squeeze().clip(max=125)
def print_train_test_results(X_train, X_test, y_train, y_test, model):
    """Print the RMSE of ``model`` on the training set and on the test set.

    Args:
        X_train: Features passed to ``model.predict`` for the training score.
        X_test: Features passed to ``model.predict`` for the test score.
        y_train: True targets matching ``X_train``.
        y_test: True targets matching ``X_test``.
        model: Fitted estimator exposing ``predict``.
    """
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    # Take the square root explicitly instead of passing squared=False: that
    # keyword was deprecated and later removed from scikit-learn's
    # mean_squared_error, while np.sqrt(MSE) works on every version.
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
    print(f'RMSE on train set: {rmse_train}')
    # The test predictions were previously computed but never reported.
    print(f'RMSE on test set: {rmse_test}')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WINDOW_SIZE = 20


def get_windowed_dataframes(df, window_size=None):
    """Return every full-length sliding window of each unit's rows.

    Rows are ordered by ``unit_number`` and then ``time`` before windowing,
    and windows are built per unit, so no window mixes rows of two units.
    A unit with fewer rows than the window length contributes no windows.

    Args:
        df: DataFrame with at least ``unit_number`` and ``time`` columns.
        window_size: Number of rows per window. ``None`` (the default) uses
            the module-level ``WINDOW_SIZE`` as it is at call time.

    Returns:
        A list of DataFrames, each with exactly ``window_size`` rows, in
        ascending unit order and, within a unit, in order of window end.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    df_groups = df.sort_values(['unit_number', 'time']).groupby('unit_number')
    all_rollings = []
    for _, group_df in df_groups:
        # Iterating a Rolling object also yields the shorter leading windows;
        # only windows that reached the full length are kept.
        all_rollings.extend(
            wnd for wnd in group_df.rolling(window=window_size) if len(wnd) == window_size
        )
    return all_rollings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# "Naive" windowed model: each window is flattened into one long feature
# vector (first axis kept, all remaining axes merged) and fed to a default
# XGBRegressor.
X_train_rolling_flat = X_train_rolling.reshape(X_train_rolling.shape[0], -1)
X_test_rolling_flat = X_test_rolling.reshape(X_test_rolling.shape[0], -1)

xgbr_windows_naive = XGBRegressor()
xgbr_windows_naive.fit(X_train_rolling_flat, y_train_rolling)
print_train_test_results(
    X_train_rolling_flat,
    X_test_rolling_flat,
    y_train_rolling,
    y_test,
    xgbr_windows_naive,
)
OlderNewer