rnyak · November 29, 2023 16:52
diff --git a/XLNet_MMs_train.py b/XLNet_MMs_train.py
 import os
 os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
 import glob
 import numpy as np
 import pandas as pd
 import gc
 import calendar
 import datetime

 import cudf
 import cupy
 import nvtabular as nvt
 from merlin.dag import ColumnSelector
 from merlin.schema import Schema, Tags

 DATA_FOLDER = os.environ.get("DATA_FOLDER", "/workspace/data")
 def generate_synthetic_data(
    start_date: datetime.date, end_date: datetime.date, rows_per_day: int = 10000
 ) -> pd.DataFrame:
    assert end_date > start_date, "end_date must be later than start_date"

    number_of_days = (end_date - start_date).days
    total_number_of_rows = number_of_days * rows_per_day

    # Generate a long-tail distribution of item interactions. This simulates that some items are
    # more popular than others.
    long_tailed_item_distribution = np.clip(
        np.random.lognormal(3.0, 10.0, total_number_of_rows).astype(np.int64), 1, 50000
    )

    # generate random item interaction features
    df = pd.DataFrame(
        {
            "session_id": np.random.randint(10000, 80000, total_number_of_rows),
            "item_id": long_tailed_item_distribution,
        },
    )

    # generate category mapping for each item-id
    df["category"] = pd.cut(df["item_id"], bins=334, labels=np.arange(1, 335)).astype(
        np.int64
    )

    max_session_length = 60 * 60  # 1 hour

    def add_timestamp_to_session(session: pd.DataFrame):
        random_start_date_and_time = calendar.timegm(
            (
                start_date
                # Add day offset from start_date
                + datetime.timedelta(days=np.random.randint(0, number_of_days))
                # Add time offset within the random day
                + datetime.timedelta(seconds=np.random.randint(0, 86_400))
            ).timetuple()
        )
        session["timestamp"] = random_start_date_and_time + np.clip(
            np.random.lognormal(3.0, 1.0, len(session)).astype(np.int64),
            0,
            max_session_length,
        )
        return session

    df = df.groupby("session_id").apply(add_timestamp_to_session).reset_index()

    return df
 
 START_DATE = os.environ.get("START_DATE", "2022/4/1")
 END_DATE = os.environ.get("END_DATE", "2022/4/5")
 interactions_df = generate_synthetic_data(datetime.datetime.strptime(START_DATE, '%Y/%m/%d'),
                                          datetime.datetime.strptime(END_DATE, '%Y/%m/%d'))
 interactions_df = cudf.from_pandas(interactions_df)

 print("Count with in-session repeated interactions: {}".format(len(interactions_df)))

 # Sorts the dataframe by session and timestamp, to remove consecutive repetitions
 interactions_df.timestamp = interactions_df.timestamp.astype(int)
 interactions_df = interactions_df.sort_values(['session_id', 'timestamp'])
 past_ids = interactions_df['item_id'].shift(1).fillna()
 session_past_ids = interactions_df['session_id'].shift(1).fillna()

 # Keeping only no consecutive repeated in session interactions
 interactions_df = interactions_df[~((interactions_df['session_id'] == session_past_ids) & (interactions_df['item_id'] == past_ids))]

 print("Count after removed in-session repeated interactions: {}".format(len(interactions_df)))

 items_first_ts_df = interactions_df.groupby('item_id').agg({'timestamp': 'min'}).reset_index().rename(columns={'timestamp': 'itemid_ts_first'})
 interactions_merged_df = interactions_df.merge(items_first_ts_df, on=['item_id'], how='left')


 # free gpu memory
 del interactions_df, session_past_ids, items_first_ts_df
 gc.collect()

 # create time features
 session_ts = ColumnSelector(['timestamp'])
 session_time = (
    session_ts >> 
    nvt.ops.LambdaOp(lambda col: cudf.to_datetime(col, unit='s')) >> 
    nvt.ops.Rename(name = 'event_time_dt')
 )
 dayofweek = (
    session_time >> 
    nvt.ops.LambdaOp(lambda col: col.dt.weekday) >> 
    nvt.ops.Rename(name ='dayofweek')
 )

 # Encodes categorical features as contiguous integers
 cat_feats = ColumnSelector(['category', 'item_id']) + dayofweek >> nvt.ops.Categorify()

 features = ColumnSelector(['session_id', 'timestamp']) + cat_feats + session_time 

 # Define Groupby Operator
 groupby_features = features >> nvt.ops.Groupby(
    groupby_cols=["session_id"], 
    sort_cols=["timestamp"],
    aggs={
        'item_id': ["list", "count"],
        'category': ["list"],  
        'timestamp': ["first"],
        'event_time_dt': ["first"],
        'dayofweek': ["first"],
        },
    name_sep="-")

 # Truncate sequence features to first interacted 20 items 
 SESSIONS_MAX_LENGTH = 20 

 item_feat = groupby_features['item_id-list'] >> nvt.ops.TagAsItemID()
 context_feat = groupby_features['dayofweek-first'] >> nvt.ops.AddMetadata(tags=[Tags.CONTEXT])

 groupby_features_list =  item_feat  + groupby_features['category-list'] >> nvt.ops.AddMetadata(tags=[Tags.SEQUENCE]) 
 groupby_features_truncated = groupby_features_list >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)

 # Select features for training 
 selected_features = groupby_features['session_id', 'item_id-count'] + groupby_features_truncated + context_feat

 # Filter out sessions with less than 2 interactions 
 MINIMUM_SESSION_LENGTH = 2
 filtered_sessions = selected_features >> nvt.ops.Filter(f=lambda df: df["item_id-count"] >= MINIMUM_SESSION_LENGTH) 

 dataset = nvt.Dataset(interactions_merged_df)
 workflow = nvt.Workflow(filtered_sessions['session_id', 'item_id-list', 'category-list', 'dayofweek-first'])
 sessions_gdf = workflow.fit_transform(dataset)

 sessions_gdf.to_parquet(os.path.join(DATA_FOLDER, "processed_nvt"))
 workflow.save(os.path.join(DATA_FOLDER, "workflow_etl"))

 import tensorflow as tf

 from merlin.schema.tags import Tags
 from merlin.io.dataset import Dataset
 import merlin.models.tf as mm

 train = Dataset(os.path.join(DATA_FOLDER, 'processed_nvt/', 'part_0.parquet'))
 train.schema= train.schema.select_by_name(['item_id-list', 'category-list', 'dayofweek-first'])
 seq_schema = train.schema.select_by_tag(Tags.SEQUENCE)
 context_schema = train.schema.select_by_tag(Tags.CONTEXT)

 target_schema = train.schema.select_by_tag(Tags.ITEM)
 target = target_schema.column_names[0]
 dmodel = int(os.environ.get("dmodel", '32'))

 input_block = mm.InputBlockV2(
    train.schema,    
    embeddings=mm.Embeddings(
        (seq_schema + context_schema).select_by_tag(Tags.CATEGORICAL), 
        sequence_combiner=None,
        dim=dmodel
        ),
    post=mm.BroadcastToSequence(context_schema, seq_schema),   # we use this if we have context feature (like user features). 
                                                               # If you dont have context feature do NOT use this
 )
 xlnet_block = mm.XLNetBlock(d_model=dmodel, n_head=2, n_layer=2)
 item_id_name = train.schema.select_by_tag(Tags.ITEM).first.properties['domain']['name']
 print(item_id_name)
 # if you want to apply sampled sofmax you should set `sampled_softmax=True`.

 def get_output_block(schema, input_block=None, weight_tying = True, sampled_softmax=False, logq_correction=True):
    if weight_tying:
        candidate_table = input_block["categorical"][item_id_name]
        to_call = candidate_table
    else:
        candidate = schema.select_by_tag(Tags.ITEM_ID)
        to_call = candidate

    if sampled_softmax:
        print("applying sampled softmax")
        outputs = mm.ContrastiveOutput(
            to_call=to_call,
            #logits_temperature=logits_temperature,
            negative_samplers=mm.PopularityBasedSamplerV2(
                max_num_samples=1000,  # you can change this value
                max_id=5384,     # this value comes from the schema file, check your item-id max value.
                min_id=2,         #this value also depends on your processed item-id data min value.
            ),
            logq_sampling_correction=logq_correction,
        )
    else:
        print("NO sampled softmax, scoring against all catalog")
        outputs = mm.CategoricalOutput(
            to_call=to_call,
        )
    return outputs
 
 d_model = dmodel
 weight_tying = True

 # get output block
 output_block = get_output_block(train.schema, input_block=input_block)

 # Define the session encoder
 if weight_tying:
    # project tranformer's output to same dimension as target
    projection = mm.MLPBlock(
        [128, output_block.to_call.table.dim],
        no_activation_last_layer=True,
    )
    session_encoder = mm.Encoder(
        input_block,
        mm.MLPBlock([128, dmodel], no_activation_last_layer=True),
        xlnet_block,
        projection,
    )

 else:
    session_encoder = mm.Encoder(
        input_block,
        mm.MLPBlock([d_model], no_activation_last_layer=True),
        xlnet_block,
    )

 model_transformer = mm.RetrievalModelV2(query=session_encoder, output=output_block)
 EPOCHS = int(os.environ.get("EPOCHS", '1'))
 BATCH_SIZE = 1024
 LEARNING_RATE = 0.005
 optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
 )

 # get loss
 loss = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True
 )

 model_transformer.compile(
    run_eagerly=False, 
    optimizer=optimizer, 
    loss=loss,
    metrics=mm.TopKMetricsAggregator.default_metrics(top_ks=[10])
 )
 model_transformer.fit(
    train, 
    batch_size=BATCH_SIZE, 
    epochs=EPOCHS, 
    pre=mm.SequenceMaskRandom(schema=seq_schema, target=target, masking_prob=0.3, transformer=xlnet_block),
    #drop_last =True
 )
 model_transformer.save('./saved_model')
	import os
	os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
	import glob
	import numpy as np
	import pandas as pd
	import gc
	import calendar
	import datetime

	import cudf
	import cupy
	import nvtabular as nvt
	from merlin.dag import ColumnSelector
	from merlin.schema import Schema, Tags

	DATA_FOLDER = os.environ.get("DATA_FOLDER", "/workspace/data")
	def generate_synthetic_data(
	start_date: datetime.date, end_date: datetime.date, rows_per_day: int = 10000
	) -> pd.DataFrame:
	assert end_date > start_date, "end_date must be later than start_date"

	number_of_days = (end_date - start_date).days
	total_number_of_rows = number_of_days * rows_per_day

	# Generate a long-tail distribution of item interactions. This simulates that some items are
	# more popular than others.
	long_tailed_item_distribution = np.clip(
	np.random.lognormal(3.0, 10.0, total_number_of_rows).astype(np.int64), 1, 50000
	)

	# generate random item interaction features
	df = pd.DataFrame(
	{
	"session_id": np.random.randint(10000, 80000, total_number_of_rows),
	"item_id": long_tailed_item_distribution,
	},
	)

	# generate category mapping for each item-id
	df["category"] = pd.cut(df["item_id"], bins=334, labels=np.arange(1, 335)).astype(
	np.int64
	)

	max_session_length = 60 * 60 # 1 hour

	def add_timestamp_to_session(session: pd.DataFrame):
	random_start_date_and_time = calendar.timegm(
	(
	start_date
	# Add day offset from start_date
	+ datetime.timedelta(days=np.random.randint(0, number_of_days))
	# Add time offset within the random day
	+ datetime.timedelta(seconds=np.random.randint(0, 86_400))
	).timetuple()
	)
	session["timestamp"] = random_start_date_and_time + np.clip(
	np.random.lognormal(3.0, 1.0, len(session)).astype(np.int64),
	0,
	max_session_length,
	)
	return session

	df = df.groupby("session_id").apply(add_timestamp_to_session).reset_index()

	return df

	START_DATE = os.environ.get("START_DATE", "2022/4/1")
	END_DATE = os.environ.get("END_DATE", "2022/4/5")
	interactions_df = generate_synthetic_data(datetime.datetime.strptime(START_DATE, '%Y/%m/%d'),
	datetime.datetime.strptime(END_DATE, '%Y/%m/%d'))
	interactions_df = cudf.from_pandas(interactions_df)

	print("Count with in-session repeated interactions: {}".format(len(interactions_df)))

	# Sorts the dataframe by session and timestamp, to remove consecutive repetitions
	interactions_df.timestamp = interactions_df.timestamp.astype(int)
	interactions_df = interactions_df.sort_values(['session_id', 'timestamp'])
	past_ids = interactions_df['item_id'].shift(1).fillna()
	session_past_ids = interactions_df['session_id'].shift(1).fillna()

	# Keeping only no consecutive repeated in session interactions
	interactions_df = interactions_df[~((interactions_df['session_id'] == session_past_ids) & (interactions_df['item_id'] == past_ids))]

	print("Count after removed in-session repeated interactions: {}".format(len(interactions_df)))

	items_first_ts_df = interactions_df.groupby('item_id').agg({'timestamp': 'min'}).reset_index().rename(columns={'timestamp': 'itemid_ts_first'})
	interactions_merged_df = interactions_df.merge(items_first_ts_df, on=['item_id'], how='left')


	# free gpu memory
	del interactions_df, session_past_ids, items_first_ts_df
	gc.collect()

	# create time features
	session_ts = ColumnSelector(['timestamp'])
	session_time = (
	session_ts >>
	nvt.ops.LambdaOp(lambda col: cudf.to_datetime(col, unit='s')) >>
	nvt.ops.Rename(name = 'event_time_dt')
	)
	dayofweek = (
	session_time >>
	nvt.ops.LambdaOp(lambda col: col.dt.weekday) >>
	nvt.ops.Rename(name ='dayofweek')
	)

	# Encodes categorical features as contiguous integers
	cat_feats = ColumnSelector(['category', 'item_id']) + dayofweek >> nvt.ops.Categorify()

	features = ColumnSelector(['session_id', 'timestamp']) + cat_feats + session_time

	# Define Groupby Operator
	groupby_features = features >> nvt.ops.Groupby(
	groupby_cols=["session_id"],
	sort_cols=["timestamp"],
	aggs={
	'item_id': ["list", "count"],
	'category': ["list"],
	'timestamp': ["first"],
	'event_time_dt': ["first"],
	'dayofweek': ["first"],
	},
	name_sep="-")

	# Truncate sequence features to first interacted 20 items
	SESSIONS_MAX_LENGTH = 20

	item_feat = groupby_features['item_id-list'] >> nvt.ops.TagAsItemID()
	context_feat = groupby_features['dayofweek-first'] >> nvt.ops.AddMetadata(tags=[Tags.CONTEXT])

	groupby_features_list = item_feat + groupby_features['category-list'] >> nvt.ops.AddMetadata(tags=[Tags.SEQUENCE])
	groupby_features_truncated = groupby_features_list >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)

	# Select features for training
	selected_features = groupby_features['session_id', 'item_id-count'] + groupby_features_truncated + context_feat

	# Filter out sessions with less than 2 interactions
	MINIMUM_SESSION_LENGTH = 2
	filtered_sessions = selected_features >> nvt.ops.Filter(f=lambda df: df["item_id-count"] >= MINIMUM_SESSION_LENGTH)

	dataset = nvt.Dataset(interactions_merged_df)
	workflow = nvt.Workflow(filtered_sessions['session_id', 'item_id-list', 'category-list', 'dayofweek-first'])
	sessions_gdf = workflow.fit_transform(dataset)

	sessions_gdf.to_parquet(os.path.join(DATA_FOLDER, "processed_nvt"))
	workflow.save(os.path.join(DATA_FOLDER, "workflow_etl"))

	import tensorflow as tf

	from merlin.schema.tags import Tags
	from merlin.io.dataset import Dataset
	import merlin.models.tf as mm

	train = Dataset(os.path.join(DATA_FOLDER, 'processed_nvt/', 'part_0.parquet'))
	train.schema= train.schema.select_by_name(['item_id-list', 'category-list', 'dayofweek-first'])
	seq_schema = train.schema.select_by_tag(Tags.SEQUENCE)
	context_schema = train.schema.select_by_tag(Tags.CONTEXT)

	target_schema = train.schema.select_by_tag(Tags.ITEM)
	target = target_schema.column_names[0]
	dmodel = int(os.environ.get("dmodel", '32'))

	input_block = mm.InputBlockV2(
	train.schema,
	embeddings=mm.Embeddings(
	(seq_schema + context_schema).select_by_tag(Tags.CATEGORICAL),
	sequence_combiner=None,
	dim=dmodel
	),
	post=mm.BroadcastToSequence(context_schema, seq_schema), # we use this if we have context feature (like user features).
	# If you dont have context feature do NOT use this
	)
	xlnet_block = mm.XLNetBlock(d_model=dmodel, n_head=2, n_layer=2)
	item_id_name = train.schema.select_by_tag(Tags.ITEM).first.properties['domain']['name']
	print(item_id_name)
	# if you want to apply sampled sofmax you should set `sampled_softmax=True`.

	def get_output_block(schema, input_block=None, weight_tying = True, sampled_softmax=False, logq_correction=True):
	if weight_tying:
	candidate_table = input_block["categorical"][item_id_name]
	to_call = candidate_table
	else:
	candidate = schema.select_by_tag(Tags.ITEM_ID)
	to_call = candidate

	if sampled_softmax:
	print("applying sampled softmax")
	outputs = mm.ContrastiveOutput(
	to_call=to_call,
	#logits_temperature=logits_temperature,
	negative_samplers=mm.PopularityBasedSamplerV2(
	max_num_samples=1000, # you can change this value
	max_id=5384, # this value comes from the schema file, check your item-id max value.
	min_id=2, #this value also depends on your processed item-id data min value.
	),
	logq_sampling_correction=logq_correction,
	)
	else:
	print("NO sampled softmax, scoring against all catalog")
	outputs = mm.CategoricalOutput(
	to_call=to_call,
	)
	return outputs

	d_model = dmodel
	weight_tying = True

	# get output block
	output_block = get_output_block(train.schema, input_block=input_block)

	# Define the session encoder
	if weight_tying:
	# project tranformer's output to same dimension as target
	projection = mm.MLPBlock(
	[128, output_block.to_call.table.dim],
	no_activation_last_layer=True,
	)
	session_encoder = mm.Encoder(
	input_block,
	mm.MLPBlock([128, dmodel], no_activation_last_layer=True),
	xlnet_block,
	projection,
	)

	else:
	session_encoder = mm.Encoder(
	input_block,
	mm.MLPBlock([d_model], no_activation_last_layer=True),
	xlnet_block,
	)

	model_transformer = mm.RetrievalModelV2(query=session_encoder, output=output_block)
	EPOCHS = int(os.environ.get("EPOCHS", '1'))
	BATCH_SIZE = 1024
	LEARNING_RATE = 0.005
	optimizer = tf.keras.optimizers.Adam(
	learning_rate=LEARNING_RATE,
	)

	# get loss
	loss = tf.keras.losses.CategoricalCrossentropy(
	from_logits=True
	)

	model_transformer.compile(
	run_eagerly=False,
	optimizer=optimizer,
	loss=loss,
	metrics=mm.TopKMetricsAggregator.default_metrics(top_ks=[10])
	)
	model_transformer.fit(
	train,
	batch_size=BATCH_SIZE,
	epochs=EPOCHS,
	pre=mm.SequenceMaskRandom(schema=seq_schema, target=target, masking_prob=0.3, transformer=xlnet_block),
	#drop_last =True
	)
	model_transformer.save('./saved_model')