# --- Gist page artifacts (preserved as comments so the file parses) ---
# Skip to content
# Instantly share code, notes, and snippets.
import pandas as pd
import math
import cudf
import dask, dask_cudf
import xgboost as xgb
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
# Connect to the Dask cluster created at Dataproc startup time.
# Fix: the original built the LocalCUDACluster but never attached a Client,
# so subsequent dask/dask_cudf work would not be scheduled on it (and the
# `Client` import went unused). Client(cluster) registers it as the default
# scheduler for this process.
cluster = LocalCUDACluster()
client = Client(cluster)
import os
from time import time
import re
import glob
import warnings
# tools for data preproc/loading
import torch
import rmm
import nvtabular as nvt
from nvtabular.ops import Normalize, FillMissing, Categorify, LogOp, ZeroFill
# Pre-allocate ~80% of the currently-free GPU memory as an RMM pool to avoid
# repeated cudaMalloc calls during preprocessing.
# Fix: initial_pool_size must be an integer byte count (RMM requires a
# multiple of 256); `0.8 * free` is a float, so align it down explicitly.
rmm.reinitialize(
    pool_allocator=True,
    initial_pool_size=int(0.8 * rmm.get_info().free) // 256 * 256,
)
# define where to get our data from
INPUT_DATA_DIR = os.environ.get('INPUT_DATA_DIR', '/data')
# where we'll save our processed data to
OUTPUT_DATA_DIR = os.environ.get('OUTPUT_DATA_DIR', '/processed')
output_train_dir = os.path.join(OUTPUT_DATA_DIR, 'train/')
output_valid_dir = os.path.join(OUTPUT_DATA_DIR, 'valid/')
# define our dataset schema (Criteo click-log layout)
# 13 continuous integer features: I1..I13
CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
# 26 categorical hashed features: C1..C26.
# Fix: CATEGORICAL_COLUMNS and LABEL_COLUMNS are passed to nvt.Workflow
# later in this script but were never defined (NameError at runtime).
CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
# single binary click label column
LABEL_COLUMNS = ['label']
# Criteo ships one parquet file per day, named day_0.parquet .. day_23.parquet.
fname = 'day_{}.parquet'
NUM_TRAIN_DAYS = 23  # first 23 days -> training; the rest -> validation
# Count how many day_N.parquet files actually exist in the input directory.
# NOTE(review): re.match is prefix-anchored and '.' is unescaped, so names
# like 'day_1.parquet.bak' or 'day_1xparquet' would also be counted —
# confirm the directory only contains the expected files.
num_days = len([i for i in os.listdir(INPUT_DATA_DIR) if re.match(fname.format('[0-9]{1,2}'), i) is not None])
# Days [0, NUM_TRAIN_DAYS) are training inputs; remaining days are validation.
train_path = [os.path.join(INPUT_DATA_DIR, fname.format(day)) for day in range(NUM_TRAIN_DAYS)]
valid_path = [os.path.join(INPUT_DATA_DIR, fname.format(day)) for day in range(NUM_TRAIN_DAYS, num_days)]
# Build the NVTabular preprocessing workflow (legacy 0.x API): declare which
# columns are categorical, continuous, and the label.
# NOTE(review): CATEGORICAL_COLUMNS and LABEL_COLUMNS are not defined anywhere
# in this file as shown — they must exist before this line (for Criteo these
# are conventionally C1..C26 and ['label']; confirm against the full script).
proc = nvt.Workflow(
cat_names=CATEGORICAL_COLUMNS,
cont_names=CONTINUOUS_COLUMNS,
label_name=LABEL_COLUMNS)
# Continuous features: replace negatives/missing with zero, then log-transform.
proc.add_cont_feature([ZeroFill(), LogOp()])
# Standardize continuous columns using statistics gathered during apply().
proc.add_cont_preprocess(Normalize())
# Encode categoricals as integer ids; values occurring fewer than 15 times
# are folded into a shared low-frequency bucket.
proc.add_cat_preprocess(Categorify(use_frequency=True, freq_threshold=15))
# --- Gist metadata (preserved as comments so the file parses) ---
# @rnyak
# rnyak / NVT_dataset_object
# Last active July 23, 2020 12:57
# NVT_dataset_object
# Wrap the raw parquet file lists as NVTabular datasets; gpu_memory_frac
# bounds the fraction of GPU memory each processing chunk may occupy.
train_dataset = nvt.dataset(train_path, engine='parquet', gpu_memory_frac=0.3)
valid_dataset = nvt.dataset(valid_path, engine='parquet', gpu_memory_frac=0.3)
# Offline pass over the training days: record_stats=True gathers the
# Normalize/Categorify statistics, shuffles rows, and writes 35 processed
# parquet files into output_train_dir.
proc.apply(train_dataset, apply_offline=True, record_stats=True, shuffle=True, output_path=output_train_dir, num_out_files=35)
# Validation reuses the statistics recorded above (record_stats=False) and
# is written unshuffled so evaluation order is deterministic.
proc.apply(valid_dataset, apply_offline=True, record_stats=False, shuffle=False, output_path=output_valid_dir, num_out_files=35)
import os
import numpy as np
import pandas as pd
import glob
import shutil
import cudf
import cupy
import nvtabular as nvt
from nvtabular import ColumnSelector