Skip to content

Instantly share code, notes, and snippets.

View prrao87's full-sized avatar

Prashanth Rao prrao87

View GitHub Profile
# Training hyperparameters, keyed by option name.
hyper_params = {
    "lr": 0.35,          # learning rate
    "epoch": 100,        # how many epochs to train for
    "wordNgrams": 3,     # word n-gram count considered during training
    "dim": 155,          # word-vector size
    "ws": 5,             # context-window size (CBOW or skip-gram)
    "minn": 2,           # shortest character n-gram length
    "maxn": 5,           # longest character n-gram length
    "bucket": 2014846,   # bucket count
}
# Compress the model through quantization so that it takes up less space
model.quantize(
    input=train,
    cutoff=100000,
    retrain=True,
    qnorm=True,
)
# Store the quantized model inside model_path under the name "sst-5.ftz"
model.save_model(os.path.join(model_path, "sst-5.ftz"))
class FastTextSentiment(Base):
"""Predict fine-grained sentiment scores using FastText"""
def __init__(self, model_file: str=None) -> None:
super().__init__()
import fasttext
self.model = fasttext.load_model(model_file)
def score(self, text: str) -> int:
# Predict just the top label (hence 1 index below)
labels, probabilities = self.model.predict(text, 1)
from flair.embeddings import FlairEmbeddings, BertEmbeddings, WordEmbeddings, DocumentRNNEmbeddings
# Initialise the Flair embeddings: one built from the 'multi-forward' model
# and one built from the 'multi-backward' model
flair_forward_embedding = FlairEmbeddings('multi-forward')
flair_backward_embedding = FlairEmbeddings('multi-backward')
# Initialise the optional embedding from BERT base (cased)
optional_embedding = BertEmbeddings('bert-base-cased')
# OR init ELMo (original)
# NOTE(review): ELMoEmbeddings is not among the names imported above;
# import it before enabling the line below
# optional_embedding = ELMoEmbeddings('original')
class FlairSentiment(Base):
"""Predict fine-grained sentiment scores using Flair."""
def __init__(self, model_file: str=None) -> None:
super().__init__()
from flair.models import TextClassifier
self.model = TextClassifier.load(model_file)
def score(self, text: str) -> int:
from flair.data import Sentence
doc = Sentence(text)
class ExampleExplainer:
"""Class to explain classification results.
The `predict` method outputs a numpy array of floats, which
is the classifier's prediction probability for each class.
"""
def __init__(self, path_to_model: str) -> None:
# Load in a trained classifier model
def predict(self, texts: List[str]) -> np.array([float, ...]):
# Take in a list of strings (LIME text sample of variations)
def explainer(method: str, path_to_file: str, text: str, num_samples: int) -> LimeTextExplainer:
"""Run LIME explainer on provided classifier"""
model = explainer_class(method, path_to_file)
predictor = model.predict
# Create a LimeTextExplainer
explainer = LimeTextExplainer(
# Specify split option for string
split_expression=lambda x: x.split(),
# Our classifier uses bigrams, trigrams or contextual ordering to classify text
# First, offset the float score from the range [-1, 1] to a range [0, 1]
offset = (score(text) + 1) / 2.
# Convert the offset score in [0, 1] to an integer class in the range [1, 5].
# Only the four interior edges are used as bins: with an extra edge at 5, a
# maximal score (5 * offset == 5) falls past the last edge and would be
# binned as 6, outside the five classes.
binned = np.digitize(5 * offset, np.array([1, 2, 3, 4])) + 1
# Simulate probabilities of each class based on a normal distribution
# centred on the binned class (scale 0.5)
simulated_probs = scipy.stats.norm.pdf(np.array([1, 2, 3, 4, 5]), binned, scale=0.5)
@prrao87
prrao87 / fasttext_cli.sh
Last active September 3, 2019 00:15
FastText text classification training with hyperparameter autotune
#!/usr/bin/bash
# Train a supervised fastText classifier on data/sst/sst_train.txt, with
# output named "model_hyperopt". Hyperparameter autotuning is validated
# against data/sst/sst_dev.txt with a 10M model-size setting; verbosity is 3.
fasttext supervised -input data/sst/sst_train.txt -output model_hyperopt \
-autotune-validation data/sst/sst_dev.txt -autotune-modelsize 10M -verbose 3
import pandas as pd
TEXT_COL, LABEL_COL = 'text', 'truth'
def read_sst5(data_dir, colnames=[LABEL_COL, TEXT_COL]):
datasets = {}
for t in ["train", "dev", "test"]:
df = pd.read_csv(os.path.join(data_dir, f"sst_{t}.txt"), sep='\t', header=None, names=colnames)
df[LABEL_COL] = df[LABEL_COL].str.replace('__label__', '')
df[LABEL_COL] = df[LABEL_COL].astype(int) # Categorical data type for truth labels