This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def transform_stance(X1): | |
# Input transform for classification task-head | |
n_batch = len(X1) | |
xmb = np.zeros((n_batch, 1, n_ctx, 2), dtype=np.int32) | |
mmb = np.zeros((n_batch, 1, n_ctx), dtype=np.float32) | |
start = encoder['_start_'] | |
for i, x1 in enumerate(X1): | |
x12 = [start] + x1[:max_len] + [clf_token] | |
l12 = len(x12) | |
xmb[i, 0, :l12, 0] = x12 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _stance(path, topic=None): | |
def clean_ascii(text):
    """Return *text* with every non-ASCII character removed.

    A character is kept only when its code point is below 128; the
    relative order of the remaining characters is unchanged.
    """
    ascii_only = [ch for ch in text if ord(ch) <= 127]
    return ''.join(ascii_only)
orig = pd.read_csv(path, delimiter='\t', header=0, encoding = "latin-1") | |
orig['Tweet'] = orig['Tweet'].apply(clean_ascii) | |
df = orig | |
# Get only those tweets that pertain to a single topic in the training data | |
if topic is not None: | |
df = df.loc[df['Target'] == topic] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def stance(data_dir, topic=None): | |
path = Path(data_dir) | |
trainfile = 'semeval2016-task6-trainingdata.txt' | |
testfile = 'SemEval2016-Task6-subtaskA-testdata.txt' | |
X, Y = _stance(path/trainfile, topic=topic) | |
teX, _ = _stance(path/testfile, topic=topic) | |
tr_text, va_text, tr_sent, va_sent = train_test_split(X, Y, test_size=0.2, random_state=seed) | |
trX = [] | |
trY = [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load data | |
import pytreebank | |
import sys | |
import os | |
out_path = os.path.join(sys.path[0], 'sst_{}.txt') | |
dataset = pytreebank.load_sst('./raw_data') | |
# Store train, dev and test in separate files | |
for category in ['train', 'test', 'dev']: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.metrics import f1_score, accuracy_score | |
class Base: | |
"""Base class that houses common utilities for reading in test data | |
and calculating model accuracy and F1 scores. | |
""" | |
def __init__(self) -> None: | |
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ExampleSentiment(Base): | |
"""Predict sentiment scores using using X classifier""" | |
def __init__(self, model_file: str=None) -> None:
    """Set up the classifier by running the parent class initializer.

    ``model_file`` is accepted but is not referenced inside this
    initializer.
    """
    # Delegate to the parent class's __init__.
    super().__init__()
def score(self, text: str) -> int:
    """Score the sentiment of ``text`` as an integer class from 1 to 5.

    This is a template placeholder: no scoring logic is implemented
    here, so the method as written returns ``None``.
    """
    # A concrete classifier would compute and return its score here.
    return None
def predict(self, train_file: None, test_file: str, lower_case: bool) -> pd.DataFrame: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TextBlobSentiment(Base): | |
"""Predict fine-grained sentiment classes using TextBlob.""" | |
def __init__(self, model_file: str=None) -> None:
    """Set up the classifier by running the parent class initializer.

    ``model_file`` is accepted but is not referenced inside this
    initializer.
    """
    super().__init__()
def score(self, text: str) -> float:
    """Return the polarity that TextBlob reports for ``text``."""
    # pip install textblob
    # The import runs inside the method, so textblob is only loaded
    # once a score is actually requested.
    import textblob

    blob = textblob.TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class VaderSentiment(Base): | |
"""Predict fine-grained sentiment classes using Vader.""" | |
def __init__(self, model_file: str=None) -> None:
    """Run the parent initializer and create the Vader analyzer.

    ``model_file`` is accepted but is not referenced inside this
    initializer. The analyzer instance is stored on ``self.vader``.
    """
    super().__init__()
    # The nltk import happens at construction time, not at module load.
    from nltk.sentiment.vader import SentimentIntensityAnalyzer as Analyzer

    analyzer = Analyzer()
    self.vader = analyzer
def score(self, text: str) -> float:
    """Return the ``'compound'`` entry of Vader's polarity scores for ``text``."""
    polarity = self.vader.polarity_scores(text)
    compound = polarity['compound']
    return compound
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class LogisticRegressionSentiment(Base): | |
"""Predict fine-grained sentiment scores using a sklearn Logistic Regression pipeline.""" | |
def __init__(self, model_file: str=None) -> None: | |
super().__init__() | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.pipeline import Pipeline | |
self.pipeline = Pipeline( | |
[ | |
('vect', CountVectorizer()), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class SVMSentiment(Base): | |
"""Predict fine-grained sentiment scores using a sklearn | |
linear Support Vector Machine (SVM) pipeline.""" | |
def __init__(self, model_file: str=None) -> None: | |
super().__init__() | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | |
from sklearn.linear_model import SGDClassifier | |
from sklearn.pipeline import Pipeline | |
self.pipeline = Pipeline( | |
[ |
Older | Newer