Prashanth Rao prrao87

AI Engineer @kuzudb 🇨🇦. Building workflows using relational/graph/vector databases and LLMs.

prrao87 / train_fasttext.py

Last active September 5, 2019 00:59

	# Training hyperparameters, gathered as keyword arguments into a plain dict.
	hyper_params = dict(
	    lr=0.35,  # Learning rate
	    epoch=100,  # Number of training epochs
	    wordNgrams=3,  # Number of word n-grams considered during training
	    dim=155,  # Size of word vectors
	    ws=5,  # Context window size for CBOW or skip-gram
	    minn=2,  # Minimum length of a char n-gram
	    maxn=5,  # Maximum length of a char n-gram
	    bucket=2014846,  # Number of buckets
	)

prrao87 / quantize_fasttext.py

Created August 31, 2019 01:19

	# Shrink the trained model via quantization so it takes less space
	model.quantize(
	    input=train,
	    cutoff=100000,
	    retrain=True,
	    qnorm=True,
	)
	# Write the quantized model into the model directory as "sst-5.ftz"
	model.save_model(os.path.join(model_path, "sst-5.ftz"))

prrao87 / fasttext_predict.py

Created August 31, 2019 01:27

	class FastTextSentiment(Base):
	"""Predict fine-grained sentiment scores using FastText"""
	def __init__(self, model_file: str=None) -> None:
	super().__init__()
	import fasttext
	self.model = fasttext.load_model(model_file)

	def score(self, text: str) -> int:
	# Predict just the top label (hence 1 index below)
	labels, probabilities = self.model.predict(text, 1)

prrao87 / training_flair.py

Last active August 31, 2019 21:24

	from flair.embeddings import FlairEmbeddings, BertEmbeddings, WordEmbeddings, DocumentRNNEmbeddings

	# init Flair embeddings
	flair_forward_embedding = FlairEmbeddings('multi-forward')
	flair_backward_embedding = FlairEmbeddings('multi-backward')

	# init BERT base (cased)
	optional_embedding = BertEmbeddings('bert-base-cased')
	# OR init ELMo (original)
	# optional_embedding = ELMoEmbeddings('original')

prrao87 / flair_predict.py

Created August 31, 2019 21:39

	class FlairSentiment(Base):
	"""Predict fine-grained sentiment scores using Flair."""
	def __init__(self, model_file: str=None) -> None:
	super().__init__()
	from flair.models import TextClassifier
	self.model = TextClassifier.load(model_file)

	def score(self, text: str) -> int:
	from flair.data import Sentence
	doc = Sentence(text)

prrao87 / example_explainer.py

Created September 2, 2019 01:08

	class ExampleExplainer:
	"""Class to explain classification results.
	The `predict` method outputs a numpy array of floats, which
	is the classifier's prediction probability for each class.
	"""
	def __init__(self, path_to_model: str) -> None:
	# Load in a trained classifier model

	def predict(self, texts: List[str]) -> np.array([float, ...]):
	# Take in a list of strings (LIME text sample of variations)

prrao87 / lime_explain.py

Last active September 2, 2019 01:16

	def explainer(method: str, path_to_file: str, text: str, num_samples: int) -> LimeTextExplainer:
	"""Run LIME explainer on provided classifier"""
	model = explainer_class(method, path_to_file)
	predictor = model.predict

	# Create a LimeTextExplainer
	explainer = LimeTextExplainer(
	# Specify split option for string
	split_expression=lambda x: x.split(),
	# Our classifier uses bigrams, trigrams or contextual ordering to classify text

prrao87 / rule_based_pred.py

Last active September 3, 2019 18:17

	# First, offset the float score from the range [-1, 1] to a range [0, 1]
	offset = (score(text) + 1) / 2.
	# Convert offset float score in [0, 1] to an integer value in the range [1, 5].
	# np.digitize returns len(bins) when the value equals the last bin edge, so a
	# score of exactly 1.0 (5 * offset == 5) would otherwise produce class 6;
	# clip the result so the top score maps to class 5.
	binned = np.clip(np.digitize(5 * offset, np.array([1, 2, 3, 4, 5])) + 1, 1, 5)
	# Simulate probabilities of each class based on a normal distribution centred
	# on the binned class with standard deviation 0.5 (these are density values
	# evaluated at the five class labels, not renormalised to sum to 1)
	simulated_probs = scipy.stats.norm.pdf(np.array([1, 2, 3, 4, 5]), binned, scale=0.5)

prrao87 / fasttext_cli.sh

Last active September 3, 2019 00:15

FastText text classification training with hyperparameter autotune

	#!/usr/bin/bash
	# Run fastText in supervised mode on the SST training file, with autotune
	# validating against the SST dev file. -autotune-modelsize is set to 10M,
	# -output is "model_hyperopt", and -verbose is 3.
	fasttext supervised -input data/sst/sst_train.txt -output model_hyperopt \
	-autotune-validation data/sst/sst_dev.txt -autotune-modelsize 10M -verbose 3

prrao87 / read_sst5.py

Created September 8, 2019 01:02

	import pandas as pd

	TEXT_COL, LABEL_COL = 'text', 'truth'

	def read_sst5(data_dir, colnames=[LABEL_COL, TEXT_COL]):
	datasets = {}
	for t in ["train", "dev", "test"]:
	df = pd.read_csv(os.path.join(data_dir, f"sst_{t}.txt"), sep='\t', header=None, names=colnames)
	df[LABEL_COL] = df[LABEL_COL].str.replace('__label__', '')
	df[LABEL_COL] = df[LABEL_COL].astype(int) # Categorical data type for truth labels