Skip to content

Instantly share code, notes, and snippets.

@yassineAlouini
Last active January 27, 2019 11:48
Show Gist options
  • Save yassineAlouini/917b0945735e2c82185b53b1d84d72c0 to your computer and use it in GitHub Desktop.
Save yassineAlouini/917b0945735e2c82185b53b1d84d72c0 to your computer and use it in GitHub Desktop.
Smaller historical transactions DataFrame for the ELO competition.
# This function could be made generic to almost any loaded CSV file with
# pandas. Can you see how to do it?
import pandas as pd
# Module-level configuration for the historical transactions dataset.

# Engine handed to pandas for parquet reads and writes.
PARQUET_ENGINE = "pyarrow"
# Column parsed as a datetime when the CSV is loaded.
DATE_COL = "purchase_date"
# Columns stored with the pandas "category" dtype. Each name appears exactly
# once so that code looping over this list casts every column a single time.
CATEGORICAL_COLS = [
    "card_id",
    "category_3",
    "merchant_id",
    "month_lag",
    "installments",
    "state_id",
    "subsector_id",
    "city_id",
    "merchant_category_id",
]
# Mapping usable as the ``dtype`` argument of ``pd.read_csv``.
CATEGORICAL_DTYPES = {col: "category" for col in CATEGORICAL_COLS}
# Value that maps to True in the binary columns; anything else maps to False.
POSITIVE_LABEL = "Y"
# Column that is cast to an unsigned 8-bit integer.
INTEGER_WITH_NAN_COL = "category_2"
# Columns converted to booleans by comparing against POSITIVE_LABEL.
BINARY_COLS = ["authorized_flag", "category_1"]
# Default CSV input and parquet output locations.
INPUT_PATH = "../input/historical_transactions.csv"
OUTPUT_PATH = "historical_transactions.parquet"
def smaller_historical_transactions(input_path, output_path):
    """Load the transactions CSV with compact dtypes and save it as parquet.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the parquet file to write.

    Returns:
        The converted DataFrame (the same data that is written to
        ``output_path``).
    """
    # Parse the datetime column and read the categorical columns directly
    # with the "category" dtype.
    df = pd.read_csv(input_path, parse_dates=[DATE_COL],
                     dtype=CATEGORICAL_DTYPES)
    # Binarize the flag columns: True where the value equals the positive
    # label, False for anything else (missing values included).
    for col in BINARY_COLS:
        df[col] = df[col] == POSITIVE_LABEL
    # NaN has no uint8 representation and casting it directly is an undefined
    # float-to-int conversion, so missing values are mapped to 0 explicitly
    # before the cast.
    df[INTEGER_WITH_NAN_COL] = (
        df[INTEGER_WITH_NAN_COL].fillna(0).astype("uint8")
    )
    # Persist the compact frame at the requested location.
    df.to_parquet(output_path, engine=PARQUET_ENGINE)
    return df
def load_historical_transactions(path=None):
    """Return the historical transactions DataFrame.

    Args:
        path: Path of a parquet file to read. When ``None``, the frame is
            built from the CSV instead by calling
            ``smaller_historical_transactions(INPUT_PATH, OUTPUT_PATH)``.

    Returns:
        The transactions DataFrame. When read from parquet, the columns in
        ``CATEGORICAL_DTYPES`` are cast to the "category" dtype.
    """
    if path is None:
        return smaller_historical_transactions(INPUT_PATH, OUTPUT_PATH)
    df = pd.read_parquet(path, engine=PARQUET_ENGINE)
    # Re-apply the categorical dtype after loading; the original author
    # observed that it was not preserved by the parquet round-trip.
    return df.astype(CATEGORICAL_DTYPES)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment