Skip to content

Instantly share code, notes, and snippets.

@mikepqr
Created January 16, 2019 07:09
Show Gist options
  • Save mikepqr/7d9008e70569edf02243f91cd19872b4 to your computer and use it in GitHub Desktop.
Save mikepqr/7d9008e70569edf02243f91cd19872b4 to your computer and use it in GitHub Desktop.
# !wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
# !unzip trainingandtestdata.zip
# !pip3 install joblib scikit-learn pandas
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# Load the training CSV. Column names are supplied explicitly via `names`,
# and the file is decoded as latin-1.
training_csv_file = 'training.1600000.processed.noemoticon.csv'
names = ('polarity', 'id', 'date', 'query', 'author', 'text')
df = pd.read_csv(training_csv_file, encoding='latin1', names=names)
# At this point you might want to take a look at the data,
# e.g. df.head()

# Recode polarity 0 -> -1 and 4 -> 1; any other value is left as-is.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the column selection: the in-place form acts on
# an intermediate object, is deprecated in recent pandas, and does not
# modify `df` at all when Copy-on-Write is enabled.
df['polarity'] = df['polarity'].replace({0: -1, 4: 1})

# Work on a 20% subsample. The seed is fixed so that every run draws the
# same rows: the fitted model is cached on disk, and an unseeded sample
# would let a later run validate the cached model on rows it was trained on.
df = df.sample(frac=0.2, random_state=42)
text = df['text']
target = df['polarity'].values

# Hold out 20% of the subsample for validation.
text_train, text_validation, target_train, target_validation = train_test_split(
    text, target, test_size=0.2, random_state=42
)
# Pipeline stages: unigram+bigram counts (max_features=10000), then the
# 1000 best features by chi-squared score, then logistic regression.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)
feature_selector = SelectKBest(chi2, k=1000)
classifier = LogisticRegression()

model_file = 'model.pkl'
if os.path.exists(model_file):
    # Reuse the previously fitted pipeline instead of retraining.
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load a model.pkl that this script produced.
    sentiment_pipeline = joblib.load(model_file)
else:
    # Pipeline documents `steps` as a list of (name, estimator) tuples.
    sentiment_pipeline = Pipeline([
        ('v', vectorizer),
        ('f', feature_selector),
        ('c', classifier),
    ])
    sentiment_pipeline.fit(text_train, target_train)
    # Cache the fitted pipeline so later runs skip training.
    joblib.dump(sentiment_pipeline, model_file)

# Mean accuracy on the held-out validation split. Printed explicitly so the
# value is visible when this runs as a script rather than a notebook cell.
print(sentiment_pipeline.score(text_validation, target_validation))
# Unit tests
# Quick sanity check: run a handful of short phrases through the fitted
# pipeline and print the predicted labels for manual inspection.
sample_texts = [
    'bad',
    'good',
    "didnt like",
    "today was a good day",
    "i hate this product",
]
sample_predictions = sentiment_pipeline.predict(sample_texts)
print(sample_predictions)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment