This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training hyperparameters; the key names follow fastText's training options.
hyper_params = {
    "lr": 0.35,          # Learning rate
    "epoch": 100,        # Number of training epochs to train for
    "wordNgrams": 3,     # Number of word n-grams to consider during training
    "dim": 155,          # Size of word vectors
    "ws": 5,             # Size of the context window for CBOW or skip-gram
    "minn": 2,           # Min length of char ngram
    "maxn": 5,           # Max length of char ngram
    "bucket": 2014846,   # Number of buckets
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Quantize model to reduce space usage.
# `train` is passed as the input so the model can be retrained (retrain=True)
# after the vocabulary is pruned to `cutoff` entries; qnorm=True asks for the
# norms to be quantized as well (see the fastText quantize documentation).
model.quantize(input=train, qnorm=True, retrain=True, cutoff=100000)
# Write the quantized model under `model_path` as "sst-5.ftz".
model.save_model(os.path.join(model_path, "sst-5.ftz"))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class FastTextSentiment(Base): | |
"""Predict fine-grained sentiment scores using FastText""" | |
def __init__(self, model_file: str=None) -> None: | |
super().__init__() | |
import fasttext | |
self.model = fasttext.load_model(model_file) | |
def score(self, text: str) -> int: | |
# Predict just the top label (hence 1 index below) | |
labels, probabilities = self.model.predict(text, 1) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from flair.embeddings import FlairEmbeddings, BertEmbeddings, WordEmbeddings, DocumentRNNEmbeddings

# init Flair embeddings (one forward and one backward language model)
flair_forward_embedding = FlairEmbeddings('multi-forward')
flair_backward_embedding = FlairEmbeddings('multi-backward')

# init BERT base (cased)
optional_embedding = BertEmbeddings('bert-base-cased')
# OR init ELMo (original)
# NOTE: ELMoEmbeddings is not among the names imported above; it would have to
# be imported as well before enabling the line below.
# optional_embedding = ELMoEmbeddings('original')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class FlairSentiment(Base): | |
"""Predict fine-grained sentiment scores using Flair.""" | |
def __init__(self, model_file: str=None) -> None: | |
super().__init__() | |
from flair.models import TextClassifier | |
self.model = TextClassifier.load(model_file) | |
def score(self, text: str) -> int: | |
from flair.data import Sentence | |
doc = Sentence(text) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ExampleExplainer: | |
"""Class to explain classification results. | |
The `predict` method outputs a numpy array of floats, which | |
is the classifier's prediction probability for each class. | |
""" | |
def __init__(self, path_to_model: str) -> None: | |
# Load in a trained classifier model | |
def predict(self, texts: List[str]) -> np.array([float, ...]): | |
# Take in a list of strings (LIME text sample of variations) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def explainer(method: str, path_to_file: str, text: str, num_samples: int) -> LimeTextExplainer: | |
"""Run LIME explainer on provided classifier""" | |
model = explainer_class(method, path_to_file) | |
predictor = model.predict | |
# Create a LimeTextExplainer | |
explainer = LimeTextExplainer( | |
# Specify split option for string | |
split_expression=lambda x: x.split(), | |
# Our classifier uses bigrams, trigrams or contextual ordering to classify text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First, offset the float score from the range [-1, 1] to a range [0, 1]
offset = (score(text) + 1) / 2.
# Convert offset float score in [0, 1] to an integer value in the range [1, 5].
# Only the four interior edges are used as bin boundaries: with 5 included as
# an edge, a score of exactly +1 (5 * offset == 5) would fall into a sixth bin
# and yield the out-of-range label 6.
binned = np.digitize(5 * offset, np.array([1, 2, 3, 4])) + 1
# Simulate probabilities of each class based on a normal distribution
# centred on the predicted class (standard deviation 0.5); the values are
# raw density values and are not normalised here.
simulated_probs = scipy.stats.norm.pdf(np.array([1, 2, 3, 4, 5]), binned, scale=0.5)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/bash
# Train a supervised fastText model on the SST training split and let
# fastText's autotune search hyperparameters against the dev split, with the
# final model size constrained to 10M. Output files use the prefix
# "model_hyperopt".
fasttext supervised -input data/sst/sst_train.txt -output model_hyperopt \
    -autotune-validation data/sst/sst_dev.txt -autotune-modelsize 10M -verbose 3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
TEXT_COL, LABEL_COL = 'text', 'truth' | |
def read_sst5(data_dir, colnames=[LABEL_COL, TEXT_COL]): | |
datasets = {} | |
for t in ["train", "dev", "test"]: | |
df = pd.read_csv(os.path.join(data_dir, f"sst_{t}.txt"), sep='\t', header=None, names=colnames) | |
df[LABEL_COL] = df[LABEL_COL].str.replace('__label__', '') | |
df[LABEL_COL] = df[LABEL_COL].astype(int) # Categorical data type for truth labels |