Created
April 23, 2020 19:20
-
-
Save jeethu/c2d7810e080dbfce04d0001e0e0efb78 to your computer and use it in GitHub Desktop.
numerai example_model with int8 features
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Example model for Numerai data using an XGBoost regressor.
To get started, install the required packages: pip install pandas numpy scikit-learn xgboost
""" | |
import csv | |
import pandas as pd | |
import numpy as np | |
from xgboost import XGBRegressor | |
# Tournament identifier; it is embedded in the target and prediction column
# names below and in the submission file name written by main().
TOURNAMENT_NAME = "kazutsugi"
# Column holding the value the model is trained to predict.
TARGET_NAME = f"target_{TOURNAMENT_NAME}"
# Column the model's predictions are stored under.
PREDICTION_NAME = f"prediction_{TOURNAMENT_NAME}"
# payout() subtracts BENCHMARK from a score and divides by BAND before
# clipping the result to [-1, 1].
BENCHMARK = 0
BAND = 0.2
# Submissions are scored by a rank-based (Spearman-style) correlation
def score(df):
    """Return the correlation between targets and ranked predictions.

    The predictions in ``df[PREDICTION_NAME]`` are converted to percentile
    ranks, and the Pearson correlation coefficient between
    ``df[TARGET_NAME]`` and those ranks is returned.

    Args:
        df: DataFrame containing the TARGET_NAME and PREDICTION_NAME columns.

    Returns:
        The off-diagonal entry of the 2x2 correlation matrix.
    """
    targets = df[TARGET_NAME]
    # method="first" breaks ties based on order in array
    ranked_predictions = df[PREDICTION_NAME].rank(pct=True, method="first")
    correlation_matrix = np.corrcoef(targets, ranked_predictions)
    return correlation_matrix[0, 1]
# The payout function
def payout(scores):
    """Convert correlation scores into payouts limited to [-1, 1].

    Args:
        scores: Object supporting arithmetic and ``.clip(lower=, upper=)``;
            main() passes the per-era correlations returned by a pandas
            groupby/apply.

    Returns:
        ``(scores - BENCHMARK) / BAND`` with every value clipped to the
        range [-1, 1].
    """
    scaled = (scores - BENCHMARK) / BAND
    return scaled.clip(lower=-1, upper=1)
# Converter applied to every raw feature cell while the file is parsed.
def _feature_to_uint8(raw):
    """Map a textual feature value such as "0.25" to ``uint8(value * 4)``."""
    return np.uint8(float(raw) * 4)


# Read the csv file into a pandas Dataframe
# Also, rescale features to uint8 and targets to fp16 (saves memory)
def read_csv(file_path):
    """Load a Numerai CSV into a memory-compact DataFrame indexed by "id".

    Columns whose name starts with "feature" are multiplied by 4 and stored
    as uint8. The TARGET_NAME column, when present, is read as float16.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame indexed by the "id" column.
    """
    # Peek at the header so the feature columns are known before parsing.
    with open(file_path, newline="") as f:
        column_names = next(csv.reader(f))
    feature_columns = [name for name in column_names if name.startswith("feature")]

    # The dtype must be keyed by the actual target column name; keying it by
    # the bare tournament name matches no column and leaves the target wide.
    dtypes = {TARGET_NAME: np.float16} if TARGET_NAME in column_names else {}
    converters = dict.fromkeys(feature_columns, _feature_to_uint8)
    frame = pd.read_csv(file_path, dtype=dtypes, converters=converters)

    if feature_columns:
        # pandas re-infers the dtype of converter output and may widen it to
        # a 64-bit integer, so cast explicitly to guarantee uint8 storage.
        frame = frame.astype(dict.fromkeys(feature_columns, np.uint8))
    return frame.set_index("id")
def main():
    """Train the example model, report per-era scores and write a submission.

    Reads "numerai_training_data.csv" and "numerai_tournament_data.csv" from
    the working directory, fits an XGBRegressor on the feature columns,
    prints correlation and payout statistics for the training rows and for
    the tournament rows whose data_type is "validation", and writes the
    tournament predictions to "<TOURNAMENT_NAME>_submission.csv".
    """
    print("# Loading data...")
    # The training data is used to train your model how to predict the targets.
    train_df = read_csv("numerai_training_data.csv")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_df = read_csv("numerai_tournament_data.csv")

    feature_names = [col for col in train_df.columns if col.startswith("feature")]
    print(f"Loaded {len(feature_names)} features")

    print("Training model")
    # For faster experimentation you can decrease n_estimators to 200, for better performance increase to 20,000
    xgb_params = dict(
        max_depth=5,
        learning_rate=0.01,
        n_estimators=2000,
        n_jobs=-1,
        colsample_bytree=0.1,
    )
    regressor = XGBRegressor(**xgb_params)
    regressor.fit(train_df[feature_names], train_df[TARGET_NAME])

    print("Generating predictions")
    train_df[PREDICTION_NAME] = regressor.predict(train_df[feature_names])
    tournament_df[PREDICTION_NAME] = regressor.predict(tournament_df[feature_names])

    # Check the per-era correlations on the training set
    train_correlations = train_df.groupby("era").apply(score)
    print(f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}")
    print(f"On training the average per-era payout is {payout(train_correlations).mean()}")

    # Check the per-era correlations on the validation set
    is_validation = tournament_df["data_type"] == "validation"
    validation_df = tournament_df[is_validation]
    validation_correlations = validation_df.groupby("era").apply(score)
    print(
        f"On validation the correlation has mean {validation_correlations.mean()} and std {validation_correlations.std()}")
    print(f"On validation the average per-era payout is {payout(validation_correlations).mean()}")

    submission_path = TOURNAMENT_NAME + "_submission.csv"
    tournament_df[PREDICTION_NAME].to_csv(submission_path)
    # Now you can upload these predictions on https://numer.ai
# Run the example only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment