language: "en"
pipeline:
- name: "nlp_spacy"
model: "en"
- name: "tokenizer_spacy"
- name: "ner_crf"
- name: "intent_featurizer_spacy"
- name: "intent_classifier_sklearn"
gevent==1.2.2
klein==17.10.0
hyperlink==17.3.1
boto3==1.5.20
typing==3.6.2
future==0.16.0
six==1.11.0
jsonschema==2.6.0
matplotlib==2.1.0
requests==2.18.4
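These pins appear to correspond to an older rasa_nlu dependency stack; saved as requirements.txt, they install with pip install -r requirements.txt.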
# Fit and predict
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Evaluate
print('Accuracy score:', accuracy_score(y_test, y_pred))
print("-"*80)
print('Confusion matrix\n')
conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1,0]))
confusion = pd.DataFrame(conmat, index=['Actual +', 'Actual -'],
                         columns=['Predicted +', 'Predicted -'])
print(confusion)
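Accuracy and the confusion matrix can be rounded out with per-class precision and recall. A small sketch, not in the original snippet, reusing the same y_test and y_pred (the label names are assumptions):

from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the two sentiment labels
print(classification_report(y_test, y_pred, labels=[1, 0],
                            target_names=['positive', 'negative']))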
# Create the pipeline for the tweets
text = Pipeline([('process_tweets', process_tweets()),
                 ('vct', TfidfVectorizer(ngram_range=(1, 2)))])
# Create the pipeline for the other variables and add selection to choose features
dummies = Pipeline([('dummies_transformation', dummies_transformation(
    columns=['weekday', 'calendar_day', 'hour', 'is_weekend', 'link']))])
# Merge pipelines using FeatureUnion
features = FeatureUnion([('text', text),
                         ('dummies', dummies)])
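The merged features still need an estimator to form the pipeline fitted in the evaluation snippet earlier. A minimal sketch, with LogisticRegression as an assumed (not confirmed) choice of classifier:

from sklearn.linear_model import LogisticRegression

# Full pipeline: merged features followed by a classifier
pipeline = Pipeline([('features', features),
                     ('clf', LogisticRegression())])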
# Clean tweets
class process_tweets(BaseEstimator, TransformerMixin):
    '''Extracts the Tweet text and applies the cleaning transformation'''
    def __init__(self):
        pass
    def preprocess_text(self, text):
        # Return the normal form of the Unicode string, encode it as ASCII and decode back as UTF-8
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        return text
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # Apply the cleaning to each tweet; the column name 'text' is an assumption
        return X['text'].apply(self.preprocess_text)
# Drop columns we don't need
data = data.drop(['date', 'tweet_size', 'mention'], axis=1)
# Subset data set for faster training
# Choose all positive and all negative samples
positive = data[data['sentiment'] == 1]
negative = data[data['sentiment'] == 0]
# Choose 5% of positives and 5% of negatives
positive = positive.sample(frac=0.05)
negative = negative.sample(frac=0.05)
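A hedged sketch of how the subsample could feed the train/test variables used in the evaluation snippet above; the target column name and the 70/30 split ratio are assumptions:

# Recombine and shuffle the two 5% samples
data = pd.concat([positive, negative]).sample(frac=1.0)
X = data.drop('sentiment', axis=1)
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    stratify=y)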
def import_data_set(url):
    '''
    This function imports the Tweets data set and decomposes
    the date field into:
    - Weekday
    - Month
    - Calendar day
    - Hour
    - Is weekend?
    '''
    data = pd.read_json(url, lines=True)
    # Hypothetical reconstruction of the decomposition above; assumes a datetime column named 'date'
    data['weekday'] = data['date'].dt.weekday
    data['month'] = data['date'].dt.month
    data['calendar_day'] = data['date'].dt.day
    data['hour'] = data['date'].dt.hour
    data['is_weekend'] = (data['date'].dt.weekday >= 5).astype(int)
    return data
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import unicodedata
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix
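The collection code below assumes an authenticated tweepy API object. A minimal sketch for tweepy 3.x, with placeholder credentials:

import json
import tweepy

# Placeholder credentials: substitute your own Twitter app keys
auth = tweepy.AppAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
api = tweepy.API(auth, wait_on_rate_limit=True)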
query = '#Haddad OR #Haddad13 OR #HaddadSim OR ' \
        '#EleNao OR #EleNunca OR #Bolsonaro OR ' \
        '#Bolsonaro17 OR #SomosTodosBolsonaro OR #Bolsonaro2019 OR ' \
        '"haddad" OR "bolsonaro"'
# Get those tweets
get_save_tweets('tweets.json', api, query)
def get_save_tweets(filepath, api, query, max_tweets=1000000, lang='pt'):
    tweetCount = 0
    # Open file and save tweets
    with open(filepath, 'w') as f:
        # Send the query
        for tweet in tweepy.Cursor(api.search, q=query, lang=lang).items(max_tweets):
            # Write one JSON-encoded tweet per line
            f.write(json.dumps(tweet._json) + '\n')
            tweetCount += 1
    print('Downloaded {} tweets'.format(tweetCount))