Last active
January 27, 2019 11:48
-
-
Save yassineAlouini/917b0945735e2c82185b53b1d84d72c0 to your computer and use it in GitHub Desktop.
Smaller historical transactions DataFrame for the ELO competition.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This function could be made generic to almost any loaded CSV file with | |
# pandas. Can you see how to do it? | |
import pandas as pd | |
# Some constants

# Engine passed to pandas for both writing and reading Parquet files.
PARQUET_ENGINE = "pyarrow"
# Column parsed as a datetime when the CSV is loaded.
DATE_COL = "purchase_date"
# Columns handled with the pandas "category" dtype. Each name must appear
# only once so that no column is cast twice.
CATEGORICAL_COLS = [
    "card_id", "category_3", "merchant_id", "month_lag",
    "installments", "state_id", "subsector_id",
    "city_id", "merchant_category_id",
]
# Mapping handed to ``pd.read_csv(dtype=...)``.
CATEGORICAL_DTYPES = {col: "category" for col in CATEGORICAL_COLS}
# Value in the binary columns that is mapped to True.
POSITIVE_LABEL = "Y"
# Column that is cast to an unsigned 8-bit integer.
INTEGER_WITH_NAN_COL = "category_2"
# Columns converted to booleans by comparing against POSITIVE_LABEL.
BINARY_COLS = ["authorized_flag", "category_1"]
# Default CSV input and Parquet output locations.
INPUT_PATH = "../input/historical_transactions.csv"
OUTPUT_PATH = "historical_transactions.parquet"
def smaller_historical_transactions(input_path, output_path):
    """Load the transactions CSV with compact dtypes and save it as Parquet.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the Parquet file to write.

    Returns:
        The loaded ``DataFrame`` with the date column parsed, the categorical
        columns typed as ``category``, the binary columns as ``bool`` and the
        ``INTEGER_WITH_NAN_COL`` column as ``uint8``.
    """
    # Load the CSV file, parse the datetime column and the categorical ones.
    df = pd.read_csv(input_path, parse_dates=[DATE_COL],
                     dtype=CATEGORICAL_DTYPES)
    # Binarize some columns: True where the value equals POSITIVE_LABEL,
    # False otherwise (missing values compare unequal, hence False).
    for col in BINARY_COLS:
        df[col] = (df[col] == POSITIVE_LABEL).astype("bool")
    # Cast to uint8. A NaN -> unsigned integer cast is undefined, so missing
    # values are replaced by 0 explicitly before casting.
    df[INTEGER_WITH_NAN_COL] = df[INTEGER_WITH_NAN_COL].fillna(0).astype("uint8")
    # Save as parquet file. The destination must be passed explicitly,
    # otherwise nothing is written to output_path.
    df.to_parquet(output_path, engine=PARQUET_ENGINE)
    return df
def load_historical_transactions(path=None):
    """Return the historical transactions ``DataFrame``.

    Args:
        path: Path of a previously saved Parquet file. When ``None``, the
            CSV at ``INPUT_PATH`` is converted (and saved to ``OUTPUT_PATH``)
            via ``smaller_historical_transactions`` instead.

    Returns:
        The transactions ``DataFrame`` with the columns listed in
        ``CATEGORICAL_COLS`` typed as ``category``.
    """
    if path is None:
        return smaller_historical_transactions(INPUT_PATH, OUTPUT_PATH)

    df = pd.read_parquet(path, engine=PARQUET_ENGINE)
    # Categorical columns may not survive the Parquet round trip
    # (NOTE(review): depends on the engine/version -- confirm), so the
    # dtype is re-applied here; the cast is a no-op if it was preserved.
    for col in CATEGORICAL_COLS:
        df[col] = df[col].astype("category")
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment