Created
January 11, 2020 18:09
-
-
Save WittmannF/7ac1af80ef586c2a6cb0aaca5e506824 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Imports
from ashrae_utils import reduce_mem_usage, CyclicLR, LRFinder | |
import numpy as np # linear algebra | |
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
import math | |
import tqdm | |
import gc | |
from sklearn.linear_model import RidgeCV | |
import seaborn as sns | |
## Parameters
BENCHMARK_SUBS = True  # Print each submission's score against the leak data before fitting
PRINT_CORR_HEATMAP = False  # Save a correlation heatmap of submissions vs. leak data
REPLACE_LEAK = True  # Replace leak data or not
DEBUG = False  # When True, export() does not write submission.csv
PRINT_WEIGHTS = True  # Print the fitted ridge coefficient of every submission
# Blending strategy. The original note listed: Mean, Median, keras, ridgecv,
# errorinversionnormalized -- but predict() in this script only handles 'ridgecv'.
TYPE_PREDICTION = 'ridgecv'
# Candidate regularization strengths passed to RidgeCV. This name is used by
# ridgecv_predict() but was never defined in the script (NameError at runtime).
# The values below are scikit-learn's documented default grid; tune as needed.
RIDGE_ALPHAS = (0.1, 1.0, 10.0)
# One CSV per base model. The column order here fixes the feature order of the
# blend, and the trailing comments are the scores noted by the original author.
submission_paths = [
    '/kaggle/input/half-half-drop-rows-stratify-weekday/submission.csv',  # 1.105 --> 0.9446
    '/kaggle/input/simple-data-cleanup-3-models/submission.csv',  # 1.072
    '/kaggle/input/ashrae-kfold-lightgbm-without-leak-1-08/submission.csv',
    '/kaggle/input/another-1-08-lb-no-leak/fe2_lgbm.csv',
    '/kaggle/input/ashrae-kfold-lightgbm-without-building-id/submission.csv',  # 1.098
    '/kaggle/input/ashrae-energy-prediction-using-stratified-kfold/fe2_lgbm.csv',  # 1.074
    '/kaggle/input/ashrae-lightgbm-without-leak/submission.csv',  # 1.082
    '/kaggle/input/ashrae-stratified-kfold-lightgbm/submission.csv',  # 1.075
    '/kaggle/input/ashrae-2-lightgbm-without-leak-data/submission.csv',
]
## Functions
def rmse(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``."""
    residual = y_true - y_pred
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)
def rmsle(y_true, y_pred):
    """Return the root-mean-square error of the ``log1p``-transformed inputs."""
    log_gap = np.log1p(y_true) - np.log1p(y_pred)
    mean_squared = np.mean(log_gap ** 2)
    return np.sqrt(mean_squared)
def read_submissions():
    """Load every CSV listed in ``submission_paths``.

    Each file must have exactly two columns, which are relabelled to
    ``row_id`` and ``meter_reading_<i>`` (``i`` is the position of the path
    in ``submission_paths``).

    Returns:
        A ``(sub, subs)`` tuple. ``sub`` is the last file read, with columns
        ``row_id`` and ``meter_reading``. ``subs`` holds one
        ``meter_reading_<i>`` column per file plus a ``row_id`` column taken
        from the last file read. Both are passed through
        ``reduce_mem_usage`` (presumably a dtype downcast; defined in
        ``ashrae_utils``, not visible here).
    """
    print('## Reading submissions')
    prediction_columns = []
    for idx, csv_path in enumerate(submission_paths):
        print(f'Reading {csv_path}')
        column_name = f'meter_reading_{idx}'
        sub = pd.read_csv(csv_path)
        sub.columns = ['row_id', column_name]
        prediction_columns.append(sub[column_name])

    # Side-by-side predictions; row_id comes from the last file that was read.
    subs = pd.concat(prediction_columns, axis=1)
    subs['row_id'] = sub.row_id
    subs = reduce_mem_usage(subs)

    # The last submission doubles as the template for the final output frame.
    sub = reduce_mem_usage(sub)
    sub.columns = ['row_id', 'meter_reading']
    return sub, subs
def read_leak():
    """Load the leaked test-set readings, with negative values clipped to 0.

    Returns:
        DataFrame indexed by the first CSV column, with a single
        ``meter_reading`` column.
    """
    leak = pd.read_csv(
        '/kaggle/input/leak-test-set/y_test.csv',
        names=['meter_reading'],
        index_col=0,
    )
    # Lower bound only: negative readings become 0, large ones are kept.
    leak['meter_reading'] = np.clip(leak['meter_reading'], 0, None)
    return leak
def leak_benchmark(sub):
    """Score ``sub`` against the leak data and optionally overwrite with it.

    Args:
        sub: DataFrame with a ``meter_reading`` column. Its index must
            contain every label of the leak data's index
            (assumes the index labels coincide with ``row_id`` -- TODO confirm).

    Returns:
        The same ``sub`` object. When ``REPLACE_LEAK`` is true, the rows
        covered by the leak data hold the leaked readings instead of the
        predictions.
    """
    print("## Comparing predictions against leak data")
    y_test = read_leak()
    leak_rows = y_test.index
    leak_values = y_test['meter_reading'].values

    rmsle_error = rmsle(leak_values, sub.loc[leak_rows, 'meter_reading'].values)
    print(f'RMSLE in the leak data is {rmsle_error}')

    if REPLACE_LEAK:
        print("## Replacing predictions with leak data")
        # One .loc call instead of chained indexing (sub[col][rows] = ...):
        # the chained form may write to a temporary copy and is a no-op under
        # pandas Copy-on-Write.
        sub.loc[leak_rows, 'meter_reading'] = leak_values
    return sub
def read_X_test():
    """Load the test features from the feather file, indexed by ``row_id``."""
    feather_path = '/kaggle/input/ashrae-feather-format-for-fast-loading/test.feather'
    return pd.read_feather(feather_path).set_index('row_id')
def prepare_X(X):
    """Return ``log1p(X)``, the feature transform used before blending."""
    return np.log1p(X)
def benchmark_subs(X, y):
    """Print the error of every column of ``X`` against ``y['meter_reading']``.

    The metric is a plain RMSE. It is labelled RMSLE in the output because
    the only caller in this file passes values that are already ``log1p``
    transformed.

    Args:
        X: DataFrame with one column per entry of ``submission_paths``,
            in the same order.
        y: DataFrame with a ``meter_reading`` column.
    """
    target = y['meter_reading'].values
    for position, column in enumerate(X.columns):
        # The parent directory of the CSV serves as the submission's name.
        source_name = submission_paths[position].split('/')[-2]
        print(f"Benchmarking {source_name}")
        score = rmse(X[column].values, target)
        print(f"RMSLE is {score}")
def ridgecv_predict(subs):
    """Blend the submissions with a ridge regression fitted on the leak data.

    The regression is fitted in ``log1p`` space on the rows covered by the
    leak data, then applied to every row of ``subs``.

    Args:
        subs: DataFrame whose first ``len(submission_paths)`` columns are the
            individual submissions' predictions. Rows are selected by
            position using the leak data's index, so row position must equal
            ``row_id`` (assumed from the ``iloc`` usage -- TODO confirm).

    Returns:
        1-D array of blended predictions on the original scale, one per row
        of ``subs``.
    """
    n_subs = len(submission_paths)

    # Training set: only the rows for which leaked ground truth exists.
    y = read_leak()
    X = prepare_X(subs.iloc[y.index, :n_subs])
    y = np.log1p(y)

    if BENCHMARK_SUBS:
        benchmark_subs(X, y)
    if PRINT_CORR_HEATMAP:
        heatmap_ax = sns.heatmap(pd.concat([X, y], axis=1).corr(), annot=True)
        # sns.heatmap returns a matplotlib Axes, which has no savefig();
        # the image has to be saved through the parent Figure.
        heatmap_ax.get_figure().savefig("corr_w_gt.png")

    reg = RidgeCV(alphas=RIDGE_ALPHAS).fit(X, y)

    if PRINT_WEIGHTS:
        # y is a single-column DataFrame, so coef_ is 2-D; row 0 holds the weights.
        weights = reg.coef_[0]
        print("## Ridge Coefficients")
        print(f'Sum of coefficients: {sum(weights)}')
        for ww, ss in zip(weights, submission_paths):
            print(f'{ss.split("/")[-2]} has weight {ww:.2f}')

    # Apply the fitted blend to every row.
    X = prepare_X(subs.iloc[:, :n_subs])
    y_pred = reg.predict(X).T[0]
    # Clip in log space so that the back-transformed reading is never negative.
    y_pred = np.clip(y_pred, 0, None)
    return np.expm1(y_pred)
def predict(subs, **kwargs):
    """Blend ``subs`` with the strategy selected by ``TYPE_PREDICTION``.

    Args:
        subs: DataFrame of individual submissions, forwarded to the blender.
        **kwargs: Accepted for interface compatibility; currently unused.

    Returns:
        The blended predictions returned by the selected strategy.

    Raises:
        ValueError: If ``TYPE_PREDICTION`` names a strategy that is not
            implemented here. Previously this returned ``None`` silently,
            which then filled the submission with nulls.
    """
    if TYPE_PREDICTION == 'ridgecv':
        return ridgecv_predict(subs)
    raise ValueError(
        f"Unsupported TYPE_PREDICTION {TYPE_PREDICTION!r}; "
        "only 'ridgecv' is implemented"
    )
def export(sub):
    """Write ``sub`` to ``submission.csv`` unless ``DEBUG`` is set."""
    if DEBUG:
        return
    print('## Saving to CSV')
    # '%g' writes floats in a compact form, which keeps the file small.
    sub.to_csv('submission.csv', index=False, float_format='%g')
## Main Function
if __name__ == '__main__':
    # Load the template frame and the side-by-side individual predictions.
    sub, subs = read_submissions()
    # Overwrite the template's predictions with the blended ones.
    sub['meter_reading'] = predict(subs)
    # Score against the leak data (optionally replacing those rows), then save.
    export(leak_benchmark(sub))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment