Skip to content

Instantly share code, notes, and snippets.

@erap129
erap129 / NASA_RUL_imports.py
Last active September 26, 2021 13:37
NASA RUL project code snippets
import numpy as np
import pandas as pd
import plotly.express as px
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import tsfresh
from tsfresh import select_features
@erap129
erap129 / data_loading.py
Last active September 28, 2021 17:51
NASA RUL project - data loading
# Fix every random seed up front so repeated runs are reproducible.
pl.seed_everything(42)

# Column layout applied to both raw files: two identifier columns followed
# by 26 columns named sensor_1 .. sensor_26.
SENSOR_COLUMN_NAMES = [f'sensor_{idx}' for idx in range(1, 27)]
df_columns = ['unit_number', 'time'] + SENSOR_COLUMN_NAMES

# The raw files are read as space-delimited text without a header row.
train_df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/NASA_CMAPSS/train_FD001.txt',
    delimiter=' ',
    header=None,
)
test_df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/NASA_CMAPSS/test_FD001.txt',
    delimiter=' ',
    header=None,
)

train_df.columns = df_columns
test_df.columns = df_columns

# Notebook-style summary of the training frame.
train_df.describe()
@erap129
erap129 / data_describe.csv
Created September 26, 2021 13:44
NASA RUL project - data description
We can make this file beautiful and searchable if this error is corrected: It looks like row 4 should actually have 20 columns, instead of 18 in line 3.
,unit_number,time,sensor_1,sensor_2,sensor_5,sensor_6,sensor_7,sensor_10,sensor_11,sensor_12,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_20,sensor_23,sensor_24,RUL
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506567786340945,108.80786195530997,0.4994902214444654,0.5019590260611053,0.4430522730736785,0.42474642703118026,0.45043520742321874,0.5664591322518506,0.29795703141632884,0.1952478718490331,0.41140961013362015,0.5806972325890045,0.3178711647518692,0.2260951703849851,0.4511180526792186,0.4342211558657721,0.5242408222141806,0.5461272588576589,107.80786195530997
std,29.227632908799837,68.88099017721818,0.12570766948363188,0.24421843713843072,0.15061845483753925,0.13366360409179906,0.15193458441160862,0.14252693360119403,0.1075537558953646,0.09908857365640435,0.1589805944283841,0.1572608512174252,0.10576311132144779,0.09844243975618262,0.14430564814146393,0.12906358538452264,0
@erap129
erap129 / dropping_columns.py
Created September 26, 2021 13:48
NASA RUL project - dropping columns
# Remove sensor_25 and sensor_26 from the training frame ...
train_df.drop(['sensor_25', 'sensor_26'], axis=1, inplace=True)
# ... and from the list of sensor names, mutating the list in place so
# every existing reference to SENSOR_COLUMN_NAMES sees the change.
del SENSOR_COLUMN_NAMES[SENSOR_COLUMN_NAMES.index('sensor_25')]
del SENSOR_COLUMN_NAMES[SENSOR_COLUMN_NAMES.index('sensor_26')]
@erap129
erap129 / creating_scalers.py
Created September 26, 2021 13:50
NASA RUL project - creating scalers
# One independent MinMaxScaler per sensor column, keyed by column name, so
# the exact scaling fitted on the training data can be re-applied later.
MIN_MAX_SCALERS = {name: MinMaxScaler() for name in SENSOR_COLUMN_NAMES}
for col_name, scaler in MIN_MAX_SCALERS.items():
    # The scaler expects a 2-D array: reshape the column to (n_rows, 1),
    # fit + transform it, then squeeze back to 1-D before writing it back.
    train_df[col_name] = scaler.fit_transform(
        train_df[col_name].values.reshape(-1, 1)
    ).squeeze()
@erap129
erap129 / data_preparation.py
Last active September 27, 2021 07:48
NASA RUL project - data preparation
# Drop the sensors that are not used as features. errors='ignore' makes the
# call a no-op for columns that are already absent.
train_df.drop(columns=[f'sensor_{i}' for i in [3, 4, 8, 9, 13, 19, 21, 22]], inplace=True, errors='ignore')

# RUL of a row = the largest `time` recorded for its unit minus the row's
# own `time`. transform('max') broadcasts the per-unit maximum back onto
# every row, so no per-group concat / reset_index / merge round-trip (and no
# dependence on an auto-generated 'level_1' column) is needed.
train_df['RUL'] = train_df.groupby('unit_number')['time'].transform('max') - train_df['time']

# Standalone RUL frame with the same column layout the earlier
# implementation exposed under this name.
RUL = train_df[['unit_number', 'RUL', 'time']].copy()

# Features: every remaining column whose name contains 'sensor_'.
X_train = train_df[[x for x in train_df.columns if 'sensor_' in x]].values
# Targets: RUL capped at 125.
y_train = train_df['RUL'].values.clip(max=125)
@erap129
erap129 / baseline_model_training.py
Created September 27, 2021 07:51
NASA RUL project - baseline model training
# Baseline: an XGBRegressor with default hyper-parameters fitted on the
# per-row sensor features. fit() returns the estimator itself.
xgbr = XGBRegressor().fit(X_train, y_train)
@erap129
erap129 / baseline_performance.py
Created September 27, 2021 07:54
NASA RUL project - baseline performance
# Drop the unused sensor columns from the test frame; errors='ignore'
# tolerates columns that are already missing.
test_df.drop(
    columns=[f'sensor_{i}' for i in [3, 4, 8, 9, 13, 19, 21, 22, 25, 26]],
    inplace=True,
    errors='ignore',
)

# Re-apply the scalers stored in MIN_MAX_SCALERS (transform only, no
# re-fitting) to every remaining sensor column.
_test_sensor_columns = [x for x in test_df.columns if 'sensor_' in x]
for col_name in _test_sensor_columns:
    test_df[col_name] = MIN_MAX_SCALERS[col_name].transform(
        test_df[col_name].values.reshape(-1, 1)
    ).squeeze()

# One feature row per unit: the row holding that unit's largest `time`.
_last_rows_per_unit = test_df.groupby('unit_number').apply(
    lambda group_df: group_df.iloc[group_df['time'].argmax()]
)
X_test = _last_rows_per_unit[_test_sensor_columns].values

# Ground-truth values read from the RUL file, capped at 125.
y_test = pd.read_csv(
    '/content/drive/MyDrive/Datasets/NASA_CMAPSS/RUL_FD001.txt',
    header=None,
).values.squeeze().clip(max=125)
def print_train_test_results(X_train, X_test, y_train, y_test, model):
    """Print the RMSE of ``model`` on the train set and on the test set.

    Args:
        X_train: Feature matrix passed to ``model.predict`` for the train set.
        X_test: Feature matrix passed to ``model.predict`` for the test set.
        y_train: True targets matching ``X_train``.
        y_test: True targets matching ``X_test``.
        model: Fitted estimator exposing ``predict(X)``.
    """
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`, which
    # recent scikit-learn releases deprecated and then removed.
    rmse_train = mean_squared_error(y_train, y_pred_train) ** 0.5
    rmse_test = mean_squared_error(y_test, y_pred_test) ** 0.5
    print(f'RMSE on train set: {rmse_train}')
    # The test predictions used to be computed but never reported.
    print(f'RMSE on test set: {rmse_test}')
@erap129
erap129 / data_windowing.py
Created September 27, 2021 11:57
NASA RUL project - windowing the data
WINDOW_SIZE = 20


def get_windowed_dataframes(df, window_size=None):
    """Split ``df`` into full-length sliding windows, one unit at a time.

    Rows are ordered by ``unit_number`` and ``time``; a window never spans
    two different units.

    Args:
        df: DataFrame with at least ``unit_number`` and ``time`` columns.
        window_size: Number of consecutive rows per window. ``None`` (the
            default) uses the module-level ``WINDOW_SIZE`` at call time.

    Returns:
        A list of DataFrames, each with exactly ``window_size`` rows, in
        unit order and then time order. Units with fewer than
        ``window_size`` rows contribute no windows.
    """
    if window_size is None:
        window_size = WINDOW_SIZE

    windows = []
    ordered = df.sort_values(['unit_number', 'time'])
    for _, unit_df in ordered.groupby('unit_number'):
        # Iterating a Rolling object yields one frame per row; windows
        # shorter than window_size are discarded.
        windows.extend(
            window
            for window in unit_df.rolling(window=window_size)
            if len(window) == window_size
        )
    return windows
@erap129
erap129 / naive_windows.py
Created September 27, 2021 11:59
NASA RUL project - naive window model
# Flatten every window to a single feature vector, keeping the first axis
# and collapsing all the others.
_train_windows_flat = X_train_rolling.reshape(X_train_rolling.shape[0], -1)
_test_windows_flat = X_test_rolling.reshape(X_test_rolling.shape[0], -1)

# Naive window model: default XGBRegressor on the flattened windows.
xgbr_windows_naive = XGBRegressor()
xgbr_windows_naive.fit(_train_windows_flat, y_train_rolling)

print_train_test_results(
    _train_windows_flat,
    _test_windows_flat,
    y_train_rolling,
    y_test,
    xgbr_windows_naive,
)