language: "en"
pipeline:
- name: "nlp_spacy"
model: "en"
- name: "tokenizer_spacy"
- name: "ner_crf"
- name: "intent_featurizer_spacy"
- name: "intent_classifier_sklearn"
gevent==1.2.2
klein==17.10.0
hyperlink==17.3.1
boto3==1.5.20
typing==3.6.2
future==0.16.0
six==1.11.0
jsonschema==2.6.0
matplotlib==2.1.0
requests==2.18.4
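These pins appear to correspond to an older rasa_nlu dependency stack; saved as requirements.txt, they install with pip install -r requirements.txt.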
# Fit and predict
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Evaluate
print('Accuracy score:', accuracy_score(y_test, y_pred))
print("-"*80)
print('Confusion matrix\n')
conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1,0]))
confusion = pd.DataFrame(conmat, index=['Actual +', 'Actual -'],
                         columns=['Predicted +', 'Predicted -'])
print(confusion)
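Accuracy and the confusion matrix can be rounded out with per-class precision and recall. A small sketch, not in the original snippet, reusing the same y_test and y_pred (the label names are assumptions):

from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the two sentiment labels
print(classification_report(y_test, y_pred, labels=[1, 0],
                            target_names=['positive', 'negative']))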
# Create the pipeline for the tweets
text = Pipeline([('process_tweets', process_tweets()),
                 ('vct', TfidfVectorizer(ngram_range=(1, 2)))])
# Create the pipeline for the other variables and add selection to choose features
dummies = Pipeline([('dummies_transformation', dummies_transformation(
    columns=['weekday', 'calendar_day', 'hour', 'is_weekend', 'link']))])
# Merge pipelines using FeatureUnion
features = FeatureUnion([('text', text),
                         ('dummies', dummies)])
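The merged features still need an estimator to form the pipeline fitted in the evaluation snippet earlier. A minimal sketch, with LogisticRegression as an assumed (not confirmed) choice of classifier:

from sklearn.linear_model import LogisticRegression

# Full pipeline: merged features followed by a classifier
pipeline = Pipeline([('features', features),
                     ('clf', LogisticRegression())])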
# Clean tweets
class process_tweets(BaseEstimator, TransformerMixin):
    '''Extracts the Tweet text and applies the cleaning transformation'''
    def __init__(self):
        pass
    def preprocess_text(self, text):
        # Return the normal form of the Unicode string, encode it as ASCII and decode back as UTF-8
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        return text
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # Apply the cleaning to each tweet; the column name 'text' is an assumption
        return X['text'].apply(self.preprocess_text)
# Drop columns we don't need
data = data.drop(['date', 'tweet_size', 'mention'], axis=1)
# Subset data set for faster training
# Choose all positive and all negative samples
positive = data[data['sentiment'] == 1]
negative = data[data['sentiment'] == 0]
# Choose 5% of positives and 5% of negatives
positive = positive.sample(frac=0.05)
negative = negative.sample(frac=0.05)
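A hedged sketch of how the subsample could feed the train/test variables used in the evaluation snippet above; the target column name and the 70/30 split ratio are assumptions:

# Recombine and shuffle the two 5% samples
data = pd.concat([positive, negative]).sample(frac=1.0)
X = data.drop('sentiment', axis=1)
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    stratify=y)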
def import_data_set(url):
    '''
    This function imports the Tweets data set and decomposes
    the date field into:
    - Weekday
    - Month
    - Calendar day
    - Hour
    - Is weekend?
    '''
    data = pd.read_json(url, lines=True)
    # Hypothetical reconstruction of the decomposition above; assumes a datetime column named 'date'
    data['weekday'] = data['date'].dt.weekday
    data['month'] = data['date'].dt.month
    data['calendar_day'] = data['date'].dt.day
    data['hour'] = data['date'].dt.hour
    data['is_weekend'] = (data['date'].dt.weekday >= 5).astype(int)
    return data
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import unicodedata
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix
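The collection code below assumes an authenticated tweepy API object. A minimal sketch for tweepy 3.x, with placeholder credentials:

import json
import tweepy

# Placeholder credentials: substitute your own Twitter app keys
auth = tweepy.AppAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
api = tweepy.API(auth, wait_on_rate_limit=True)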
query = '#Haddad OR #Haddad13 OR #HaddadSim OR ' \
        '#EleNao OR #EleNunca OR #Bolsonaro OR ' \
        '#Bolsonaro17 OR #SomosTodosBolsonaro OR #Bolsonaro2019 OR ' \
        '"haddad" OR "bolsonaro"'
# Get those tweets
get_save_tweets('tweets.json', api, query)
def get_save_tweets(filepath, api, query, max_tweets=1000000, lang='pt'):
    tweetCount = 0
    # Open file and save tweets
    with open(filepath, 'w') as f:
        # Send the query
        for tweet in tweepy.Cursor(api.search, q=query, lang=lang).items(max_tweets):
            # Write one JSON-encoded tweet per line
            f.write(json.dumps(tweet._json) + '\n')
            tweetCount += 1
    print('Downloaded {} tweets'.format(tweetCount))