Prashanth Rao prrao87

AI Engineer @kuzudb 🇨🇦. Building workflows using relational/graph/vector databases and LLMs.

prrao87 / transform_stance.py

Created January 12, 2019 23:40

classification task-head for transformer

	def transform_stance(X1):
	# Input transform for classification task-head
	n_batch = len(X1)
	xmb = np.zeros((n_batch, 1, n_ctx, 2), dtype=np.int32)
	mmb = np.zeros((n_batch, 1, n_ctx), dtype=np.float32)
	start = encoder['_start_']
	for i, x1 in enumerate(X1):
	x12 = [start] + x1[:max_len] + [clf_token]
	l12 = len(x12)
	xmb[i, 0, :l12, 0] = x12

prrao87 / clean_stance.py

Created January 13, 2019 00:03

clean input tweet data to only have ascii characters

	def _stance(path, topic=None):
	def clean_ascii(text):
	# function to remove non-ASCII chars from data
	return ''.join(i for i in text if ord(i) < 128)
	orig = pd.read_csv(path, delimiter='\t', header=0, encoding = "latin-1")
	orig['Tweet'] = orig['Tweet'].apply(clean_ascii)
	df = orig
	# Get only those tweets that pertain to a single topic in the training data
	if topic is not None:
	df = df.loc[df['Target'] == topic]

prrao87 / split_input_stance.py

Created January 13, 2019 00:04

split tweet data into training, validation and test sets for the transformer

	def stance(data_dir, topic=None):
	path = Path(data_dir)
	trainfile = 'semeval2016-task6-trainingdata.txt'
	testfile = 'SemEval2016-Task6-subtaskA-testdata.txt'

	X, Y = _stance(path/trainfile, topic=topic)
	teX, _ = _stance(path/testfile, topic=topic)
	tr_text, va_text, tr_sent, va_sent = train_test_split(X, Y, test_size=0.2, random_state=seed)
	trX = []
	trY = []

prrao87 / tree2tabular.py

Last active August 26, 2019 15:58

Convert SST-5 tree data to tabular form

	# Load data
	import pytreebank
	import sys
	import os

	out_path = os.path.join(sys.path[0], 'sst_{}.txt')
	dataset = pytreebank.load_sst('./raw_data')

	# Store train, dev and test in separate files
	for category in ['train', 'test', 'dev']:

prrao87 / base_utils.py

Created August 26, 2019 20:25

Base utilities class for all classifiers

	import pandas as pd
	from sklearn.metrics import f1_score, accuracy_score


	class Base:
	    """Base class that houses common utilities for reading in test data
	    and calculating model accuracy and F1 scores.
	    """

	    def __init__(self) -> None:
	        # No instance state is set up here.
	        pass

prrao87 / example_sentiment_class.py

Last active August 29, 2019 14:51

Example sentiment predictor class

	class ExampleSentiment(Base):
	"""Predict sentiment scores using X classifier"""
	def __init__(self, model_file: str=None) -> None:
	super().__init__() # Inherit methods from Base class

	def score(self, text: str) -> int:
	"""Return a sentiment score on sample text, an integer in the range [1, 2, 3, 4, 5]"""
	# Apply some sentiment scoring technique here

	def predict(self, train_file: None, test_file: str, lower_case: bool) -> pd.DataFrame:

prrao87 / textblob_predict.py

Created August 29, 2019 21:37

	class TextBlobSentiment(Base):
	    """Predict fine-grained sentiment classes using TextBlob."""

	    def __init__(self, model_file: str = None) -> None:
	        # model_file is accepted but not used by the code visible here.
	        super().__init__()

	    def score(self, text: str) -> float:
	        """Return the TextBlob sentiment polarity computed for ``text``."""
	        # Imported lazily so that textblob (pip install textblob) is only
	        # required when a score is actually requested.
	        from textblob import TextBlob
	        return TextBlob(text).sentiment.polarity

prrao87 / vader_predict.py

Last active August 30, 2019 20:10

	class VaderSentiment(Base):
	    """Predict fine-grained sentiment classes using Vader."""

	    def __init__(self, model_file: str = None) -> None:
	        # model_file is accepted but not used by the code visible here.
	        super().__init__()
	        # Imported inside the constructor so that nltk is only required
	        # when this classifier is instantiated.
	        from nltk.sentiment.vader import SentimentIntensityAnalyzer
	        self.vader = SentimentIntensityAnalyzer()

	    def score(self, text: str) -> float:
	        """Return the 'compound' entry of Vader's polarity scores for ``text``."""
	        return self.vader.polarity_scores(text)['compound']

prrao87 / logistic_predict.py

Last active August 30, 2019 20:56

	class LogisticRegressionSentiment(Base):
	"""Predict fine-grained sentiment scores using a sklearn Logistic Regression pipeline."""
	def __init__(self, model_file: str=None) -> None:
	super().__init__()
	from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
	from sklearn.linear_model import LogisticRegression
	from sklearn.pipeline import Pipeline
	self.pipeline = Pipeline(
	[
	('vect', CountVectorizer()),

prrao87 / svm_predict.py

Created August 30, 2019 20:55

	class SVMSentiment(Base):
	"""Predict fine-grained sentiment scores using a sklearn
	linear Support Vector Machine (SVM) pipeline."""
	def __init__(self, model_file: str=None) -> None:
	super().__init__()
	from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
	from sklearn.linear_model import SGDClassifier
	from sklearn.pipeline import Pipeline
	self.pipeline = Pipeline(
	[