yassineAlouini · January 27, 2019 11:48
diff --git a/smaller_historical_transactions.py b/smaller_historical_transactions.py
 # This function could be made generic to almost any CSV file loaded with
 # pandas. Can you see how to do it?

 import pandas as pd

 # Some constants
 # Engine handed to pandas for both writing and reading the parquet file.
 PARQUET_ENGINE = "pyarrow"
 # Column parsed as a datetime when the CSV is read.
 DATE_COL = "purchase_date"
 # Columns stored with the pandas "category" dtype (each name listed once).
 CATEGORICAL_COLS = [
     "card_id",
     "category_3",
     "merchant_id",
     "month_lag",
     "installments",
     "state_id",
     "subsector_id",
     "city_id",
     "merchant_category_id",
 ]
 # Column -> dtype mapping passed as ``dtype=`` to ``pd.read_csv``.
 CATEGORICAL_DTYPES = {col: "category" for col in CATEGORICAL_COLS}
 # Value of a binary column that maps to True; anything else maps to False.
 POSITIVE_LABEL = "Y"
 # Column cast to uint8; its name suggests it can hold missing values.
 INTEGER_WITH_NAN_COL = "category_2"
 # Columns turned into booleans by comparison with POSITIVE_LABEL.
 BINARY_COLS = ["authorized_flag", "category_1"]
 # Default CSV source and parquet destination used by
 # load_historical_transactions when it is called without a path.
 INPUT_PATH = "../input/historical_transactions.csv"
 OUTPUT_PATH = "historical_transactions.parquet"


 def smaller_historical_transactions(input_path, output_path):
     """Load the transactions CSV with compact dtypes and save it as parquet.

     Args:
         input_path: Location of the CSV file, in any form ``pd.read_csv``
             accepts.
         output_path: Destination of the parquet file that is written.

     Returns:
         The converted ``DataFrame`` (the same data that was written).
     """
     # Parse the date column and read the categorical columns directly with
     # the "category" dtype.
     df = pd.read_csv(
         input_path,
         parse_dates=[DATE_COL],
         dtype=CATEGORICAL_DTYPES,
     )
     # Binary columns become real booleans: True where the value equals
     # POSITIVE_LABEL, False otherwise (missing values included). A plain
     # pandas comparison avoids the removed ``pd.np`` alias.
     for col in BINARY_COLS:
         df[col] = (df[col] == POSITIVE_LABEL).astype("bool")
     # Casting NaN to an unsigned integer is undefined in NumPy, so missing
     # entries are stored explicitly as 0 before the uint8 cast.
     # NOTE(review): assumes 0 is not a genuine value of this column - confirm.
     df[INTEGER_WITH_NAN_COL] = (
         df[INTEGER_WITH_NAN_COL].fillna(0).astype("uint8")
     )
     # The path is passed positionally because the name of the first
     # parameter of ``to_parquet`` differs between pandas versions.
     df.to_parquet(output_path, engine=PARQUET_ENGINE)
     return df
    
 def load_historical_transactions(path=None):
     """Return the historical transactions as a ``DataFrame``.

     Args:
         path: Parquet file to read. When ``None``, the result of
             ``smaller_historical_transactions(INPUT_PATH, OUTPUT_PATH)``
             is returned instead.

     Returns:
         The transactions, with every column of ``CATEGORICAL_COLS`` using
         the "category" dtype.

     Raises:
         KeyError: If the parquet file lacks one of the categorical columns.
     """
     if path is None:
         return smaller_historical_transactions(INPUT_PATH, OUTPUT_PATH)
     df = pd.read_parquet(path, engine=PARQUET_ENGINE)
     # The original author observed that the categorical dtype was not
     # preserved by the parquet round trip, so it is applied again here.
     return df.astype(CATEGORICAL_DTYPES)
	# This function could be made generic to almost any CSV file loaded with
	# pandas. Can you see how to do it?

	import pandas as pd

	# Some constants
	# Engine handed to pandas for both writing and reading the parquet file.
	PARQUET_ENGINE = "pyarrow"
	# Column parsed as a datetime when the CSV is read.
	DATE_COL = "purchase_date"
	# Columns stored with the pandas "category" dtype (each name listed once).
	CATEGORICAL_COLS = [
		"card_id",
		"category_3",
		"merchant_id",
		"month_lag",
		"installments",
		"state_id",
		"subsector_id",
		"city_id",
		"merchant_category_id",
	]
	# Column -> dtype mapping passed as ``dtype=`` to ``pd.read_csv``.
	CATEGORICAL_DTYPES = {col: "category" for col in CATEGORICAL_COLS}
	# Value of a binary column that maps to True; anything else maps to False.
	POSITIVE_LABEL = "Y"
	# Column cast to uint8; its name suggests it can hold missing values.
	INTEGER_WITH_NAN_COL = "category_2"
	# Columns turned into booleans by comparison with POSITIVE_LABEL.
	BINARY_COLS = ["authorized_flag", "category_1"]
	# Default CSV source and parquet destination used by
	# load_historical_transactions when it is called without a path.
	INPUT_PATH = "../input/historical_transactions.csv"
	OUTPUT_PATH = "historical_transactions.parquet"


	def smaller_historical_transactions(input_path, output_path):
		"""Load the transactions CSV with compact dtypes and save it as parquet.

		Args:
			input_path: Location of the CSV file, in any form ``pd.read_csv``
				accepts.
			output_path: Destination of the parquet file that is written.

		Returns:
			The converted ``DataFrame`` (the same data that was written).
		"""
		# Parse the date column and read the categorical columns directly with
		# the "category" dtype.
		df = pd.read_csv(
			input_path,
			parse_dates=[DATE_COL],
			dtype=CATEGORICAL_DTYPES,
		)
		# Binary columns become real booleans: True where the value equals
		# POSITIVE_LABEL, False otherwise (missing values included). A plain
		# pandas comparison avoids the removed ``pd.np`` alias.
		for col in BINARY_COLS:
			df[col] = (df[col] == POSITIVE_LABEL).astype("bool")
		# Casting NaN to an unsigned integer is undefined in NumPy, so missing
		# entries are stored explicitly as 0 before the uint8 cast.
		# NOTE(review): assumes 0 is not a genuine value of this column - confirm.
		df[INTEGER_WITH_NAN_COL] = (
			df[INTEGER_WITH_NAN_COL].fillna(0).astype("uint8")
		)
		# The path is passed positionally because the name of the first
		# parameter of ``to_parquet`` differs between pandas versions.
		df.to_parquet(output_path, engine=PARQUET_ENGINE)
		return df

	def load_historical_transactions(path=None):
		"""Return the historical transactions as a ``DataFrame``.

		Args:
			path: Parquet file to read. When ``None``, the result of
				``smaller_historical_transactions(INPUT_PATH, OUTPUT_PATH)``
				is returned instead.

		Returns:
			The transactions, with every column of ``CATEGORICAL_COLS`` using
			the "category" dtype.

		Raises:
			KeyError: If the parquet file lacks one of the categorical columns.
		"""
		if path is None:
			return smaller_historical_transactions(INPUT_PATH, OUTPUT_PATH)
		df = pd.read_parquet(path, engine=PARQUET_ENGINE)
		# The original author observed that the categorical dtype was not
		# preserved by the parquet round trip, so it is applied again here.
		return df.astype(CATEGORICAL_DTYPES)