Skip to content

Instantly share code, notes, and snippets.

@rnyak
rnyak / Sbr_XLNet_load.py
Created November 29, 2023 16:56
Load the saved model
import os
import tensorflow as tf
import merlin.models.tf as mm
from nvtabular.workflow import Workflow
from merlin.systems.dag.ops.tensorflow import PredictTensorflow
from merlin.systems.dag.ensemble import Ensemble
from merlin.systems.dag.ops.workflow import TransformWorkflow
@rnyak
rnyak / XLNet_MMs_train.py
Last active November 29, 2023 16:52
XLNet MMs training
import os
# Presumably selects CUDA's async pool allocator for TensorFlow GPU memory
# (cuda_malloc_async); NOTE(review): must take effect before TF initializes
# the GPU — confirm this runs before any TF import elsewhere.
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
import glob
import numpy as np
import pandas as pd
import gc
import calendar
import datetime
import cudf
import os
# Presumably selects CUDA's async pool allocator for TensorFlow GPU memory
# (cuda_malloc_async) — duplicate of the setting a few lines above.
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
import glob
import numpy as np
import pandas as pd
import gc
import calendar
import datetime
import cudf
@rnyak
rnyak / XLNet-MM.py
Last active September 5, 2023 13:40
import os
# Presumably selects CUDA's async pool allocator for TensorFlow GPU memory
# (cuda_malloc_async); NOTE(review): confirm this runs before TF touches the GPU.
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
import glob
import numpy as np
import pandas as pd
import gc
import calendar
import datetime
import cudf
import os
import nvtabular as nvt
from nvtabular.ops import *
from merlin.schema.tags import Tags
import merlin.models.tf as mm
from merlin.io.dataset import Dataset
import tensorflow as tf
# Root location of the input data; overridable through the DATA_FOLDER
# environment variable, falling back to ./data/ when unset.
DATA_FOLDER = os.getenv("DATA_FOLDER", "./data/")
# define output path for the processed parquet files
import os
import glob
import numpy as np
import pandas as pd
import cudf
import cupy as cp
import nvtabular as nvt
from nvtabular.ops import *
name: "0_transformworkflow"
input {
name: "item_id"
data_type: TYPE_INT32
dims: -1
dims: 1
}
input {
name: "category"
data_type: TYPE_INT32
import os
import glob
import numpy as np
import pandas as pd
import cudf
import cupy as cp
import nvtabular as nvt
from nvtabular.ops import *
from merlin.datasets.synthetic import generate_data
# Synthetic stand-in for the preprocessed Dressipi 2022 dataset:
# 10,000 rows split 80/20 into train and validation Datasets.
train, valid = generate_data("dressipi2022-preprocessed", num_rows=10000, set_sizes=(0.8, 0.2))
# Item feature column names, i.e. ['f_47', 'f_68'].
item_features_names = ['f_' + str(col) for col in [47, 68]]
# NVTabular op graph: encode the selected columns as categorical ids.
# NOTE(review): the nested list [['item_id', 'purchase_id']] reads as a
# joint/combo Categorify selector rather than a flat selector — confirm
# this is intentional and not a typo for ['item_id', 'purchase_id'].
cat_features = [['item_id', 'purchase_id']] + item_features_names >> nvt.ops.Categorify()
# Pass-through columns combined with the categorified graph (presumably
# relies on NVTabular's node `+` overloading — verify).
features = ['session_id', 'timestamp', 'date'] + cat_features
# Per-session groupby aggregation spec: first date, last item plus the
# full item-id list, and the first purchase id.
to_aggregate = {}
to_aggregate['date'] = ["first"]
to_aggregate['item_id'] = ["last", "list"]
to_aggregate['purchase_id'] = ["first"]
import os
import numpy as np
import pandas as pd
import nvtabular as nvt
NUM_ROWS = 1000

# Item ids drawn from a log-normal so a handful of items dominate
# (long-tailed popularity), then clamped into the valid id range.
_raw_item_draws = np.random.lognormal(3., 1., NUM_ROWS)
long_tailed_item_distribution = np.clip(_raw_item_draws.astype(np.int32), 1, 50000)

# generate random item interaction features
_session_ids = np.random.randint(70000, 80000, NUM_ROWS)
df = pd.DataFrame({'session_id': _session_ids})