powderluv · February 20, 2025 03:11
diff --git a/gistfile1.txt b/gistfile1.txt
 # Toy training set: every sample is a [sentence_a, sentence_b] pair with a 0/1 label.
 from sklearn.model_selection import train_test_split

 # The five distinct pairs and their labels; the dataset repeats them, in order, four times.
 _BASE_PAIRS = (
     ("Hello World!", "Good morning!"),
     ("It is raining", "It is cold"),
     ("Beautiful city beside mountain", "Quiet street in downtown area"),
     ("AI is the future", "AI is just a tool"),
     ("This application is great", "software is the problem"),
 )
 _BASE_LABELS = (1, 0, 0, 1, 0)
 _REPEATS = 4

 # Fresh inner lists per repetition so X is a plain list of independent 2-item lists.
 X = [list(pair) for _ in range(_REPEATS) for pair in _BASE_PAIRS]
 y = list(_BASE_LABELS) * _REPEATS

 # Hold out 10% of the samples for validation; the fixed seed keeps the split reproducible.
 X_t, X_v, y_t, y_v = train_test_split(X, y, test_size=0.1, random_state=42)

 from sentence_transformers import CrossEncoder, InputExample
 from torch.utils.data import DataLoader
 from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
 import math
 import os

 # os.environ["CUDA_VISIBLE_DEVICES"] = "0"

 def some_cross_encoder(X_train, y_train, X_val, y_val, epochs=5, batch_size=32, learning_rate=2e-5, model_folder_name="version"):
    """Fine-tune a single-output CrossEncoder on sentence pairs and save it.

    Args:
        X_train: Iterable of two-item sentence pairs used for training.
        y_train: Labels paired positionally (via ``zip``) with ``X_train``.
        X_val: Iterable of two-item sentence pairs used for evaluation.
        y_val: Labels paired positionally (via ``zip``) with ``X_val``.
        epochs: Number of training epochs passed to ``fit``.
        batch_size: Batch size of the training ``DataLoader``.
        learning_rate: Passed to the optimizer as ``lr``.
        model_folder_name: Folder the final model is saved to; ``fit`` is
            given ``model_folder_name + '_best'`` as its ``output_path``.

    Returns:
        The trained ``CrossEncoder`` instance.
    """
    # Local import: only ``torch.utils.data`` is imported at module level.
    import torch

    model = CrossEncoder('bert-base-multilingual-uncased', num_labels=1)

    # Use the GPU when one is visible, otherwise fall back to CPU instead of
    # failing on ``.to("cuda")`` on machines without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.model.to(device)

    train_examples = [InputExample(texts=[sent1, sent2], label=label) for (sent1, sent2), label in zip(X_train, y_train)]
    val_examples = [InputExample(texts=[sent1, sent2], label=label) for (sent1, sent2), label in zip(X_val, y_val)]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    evaluator = CEBinaryClassificationEvaluator.from_input_examples(val_examples, name="Evaluate dataset")

    # Warm up over 10% of the total number of training steps (batches * epochs).
    warmup_steps = math.ceil(len(train_dataloader) * epochs * 0.1)
    model.fit(
        train_dataloader=train_dataloader,
        evaluator=evaluator,
        epochs=epochs,
        warmup_steps=warmup_steps,
        optimizer_params={'lr': learning_rate},
        save_best_model=True,
        output_path=(model_folder_name + '_best'),
    )

    # Save the model as it stands after the last epoch, separately from the
    # ``_best`` output path handed to ``fit``.
    model.save(model_folder_name)

    return model

 # Run training on the split above with a batch size of 5; the returned model is not kept.
 some_cross_encoder(X_t, y_t, X_v, y_v, batch_size=5)
	# Toy training set: every sample is a [sentence_a, sentence_b] pair with a 0/1 label.
	from sklearn.model_selection import train_test_split

	# The five distinct pairs and their labels; the dataset repeats them, in order, four times.
	_BASE_PAIRS = (
		("Hello World!", "Good morning!"),
		("It is raining", "It is cold"),
		("Beautiful city beside mountain", "Quiet street in downtown area"),
		("AI is the future", "AI is just a tool"),
		("This application is great", "software is the problem"),
	)
	_BASE_LABELS = (1, 0, 0, 1, 0)
	_REPEATS = 4

	# Fresh inner lists per repetition so X is a plain list of independent 2-item lists.
	X = [list(pair) for _ in range(_REPEATS) for pair in _BASE_PAIRS]
	y = list(_BASE_LABELS) * _REPEATS

	# Hold out 10% of the samples for validation; the fixed seed keeps the split reproducible.
	X_t, X_v, y_t, y_v = train_test_split(X, y, test_size=0.1, random_state=42)

	from sentence_transformers import CrossEncoder, InputExample
	from torch.utils.data import DataLoader
	from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
	import math
	import os

	# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

	def some_cross_encoder(X_train, y_train, X_val, y_val, epochs=5, batch_size=32, learning_rate=2e-5, model_folder_name="version"):
		"""Fine-tune a single-output CrossEncoder on sentence pairs and save it.

		Args:
			X_train: Iterable of two-item sentence pairs used for training.
			y_train: Labels paired positionally (via ``zip``) with ``X_train``.
			X_val: Iterable of two-item sentence pairs used for evaluation.
			y_val: Labels paired positionally (via ``zip``) with ``X_val``.
			epochs: Number of training epochs passed to ``fit``.
			batch_size: Batch size of the training ``DataLoader``.
			learning_rate: Passed to the optimizer as ``lr``.
			model_folder_name: Folder the final model is saved to; ``fit`` is
				given ``model_folder_name + '_best'`` as its ``output_path``.

		Returns:
			The trained ``CrossEncoder`` instance.
		"""
		# Local import: only ``torch.utils.data`` is imported at module level.
		import torch

		model = CrossEncoder('bert-base-multilingual-uncased', num_labels=1)

		# Use the GPU when one is visible, otherwise fall back to CPU instead of
		# failing on ``.to("cuda")`` on machines without CUDA.
		device = "cuda" if torch.cuda.is_available() else "cpu"
		model.model.to(device)

		train_examples = [InputExample(texts=[sent1, sent2], label=label) for (sent1, sent2), label in zip(X_train, y_train)]
		val_examples = [InputExample(texts=[sent1, sent2], label=label) for (sent1, sent2), label in zip(X_val, y_val)]

		train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
		evaluator = CEBinaryClassificationEvaluator.from_input_examples(val_examples, name="Evaluate dataset")

		# Warm up over 10% of the total number of training steps (batches * epochs).
		warmup_steps = math.ceil(len(train_dataloader) * epochs * 0.1)
		model.fit(
			train_dataloader=train_dataloader,
			evaluator=evaluator,
			epochs=epochs,
			warmup_steps=warmup_steps,
			optimizer_params={'lr': learning_rate},
			save_best_model=True,
			output_path=(model_folder_name + '_best'),
		)

		# Save the model as it stands after the last epoch, separately from the
		# ``_best`` output path handed to ``fit``.
		model.save(model_folder_name)

		return model

	# Run training on the split above with a batch size of 5; the returned model is not kept.
	some_cross_encoder(X_t, y_t, X_v, y_v, batch_size=5)