Last active
April 19, 2016 21:17
-
-
Save geoom/c81970ef0171ecbb1a1b to your computer and use it in GitHub Desktop.
ANN scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TextualAnalizer(object):
    """Extracts simple lexical statistics (stop words, punctuation marks,
    emoticons, word counts, capitalization) from a short text such as a tweet.

    NOTE: the spelling "Analizer" (sic) is kept because other modules import
    the class under this name.
    """

    STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any',
                  'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
                  'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did',
                  'didn\'t', 'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each',
                  'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have', 'haven\'t',
                  'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself',
                  'him', 'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if',
                  'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'let\'s', 'me', 'more',
                  'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only',
                  'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'shan\'t',
                  'she', 'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such', 'than',
                  'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'there\'s',
                  'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through',
                  'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll',
                  'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s', 'when', 'when\'s', 'where',
                  'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t',
                  'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours',
                  'yourself', 'yourselves']
    PUNCTUATION_MARKS = ['.', ',', '?', ':', ';', '-', '...']
    EXCLAMATION_MARK = '!'
    NOISE_MARKS = ['/', '&']
    # NOTE(review): ':D' can never match because the text is lowercased in
    # __init__; kept as-is to preserve the original matching behavior.
    POSITIVE_EMOTICONS = [':)', ':D']
    NEUTRAL_EMOTICONS = [':|']
    NEGATIVE_EMOTICONS = [':(', ':\'(']

    def __init__(self, text):
        # Keep the raw text for case-sensitive statistics (capitalization);
        # every other statistic works on the lowercased copy.
        self._original_text = text
        self.text = text.lower()

    def _discard_terms(self, *terms_lists):
        """Strip every occurrence of every term in the given lists from
        self.text, in place.  (A bare string argument is iterated char by
        char, which is exactly what EXCLAMATION_MARK relies on.)"""
        for term_list in terms_lists:
            for term in term_list:
                self.text = self.text.replace(term, '')

    def _get_ocurrences_number(self, term_list):
        """Count how many DISTINCT terms from term_list appear somewhere in
        the text (substring match; each term counted at most once)."""
        return sum(1 for term in term_list if term in self.text)

    def get_stop_words_number(self):
        """Number of distinct stop words present in the text."""
        return self._get_ocurrences_number(self.STOP_WORDS)

    def get_words_number(self, exclude_stop_words=False):
        """Number of words in the text, optionally excluding stop words.

        BUG FIX: the original passed ``self.text`` as the ``only_uniques``
        positional argument of get_words_list; being truthy, it silently
        deduplicated the word list before counting.
        """
        return len(self.get_words_list(exclude_stop_words=exclude_stop_words))

    def get_words_list(self, only_uniques=False, exclude_stop_words=False):
        """Tokenize the (cleaned) text; drops hashtags, mentions and URLs.

        Side effect: permanently strips punctuation/emoticon/noise marks
        from self.text via _discard_terms.
        """
        self._discard_terms(self.PUNCTUATION_MARKS, self.EXCLAMATION_MARK,
                            self.POSITIVE_EMOTICONS, self.NEUTRAL_EMOTICONS,
                            self.NEGATIVE_EMOTICONS, self.NOISE_MARKS)
        # List comprehension instead of filter(): Python 3's filter() returns
        # an iterator, which broke the list.remove() calls below.
        all_words = [word for word in self.text.strip().split(' ')
                     if word != '' and '#' not in word
                     and 'http' not in word and '@' not in word]
        if exclude_stop_words:
            # BUG FIX: the original removed only the FIRST occurrence of each
            # stop word; exclude every occurrence, as the flag name promises.
            all_words = [word for word in all_words
                         if word not in self.STOP_WORDS]
        return list(set(all_words)) if only_uniques else all_words

    def get_punctuation_marks_number(self):
        """Number of distinct punctuation marks present in the text."""
        return self._get_ocurrences_number(self.PUNCTUATION_MARKS)

    def get_exclamation_marks_number(self):
        """Total count of '!' characters (len(filter(...)) broke on py3)."""
        return self.text.count(self.EXCLAMATION_MARK)

    def get_capitalized_words_number(self):
        """Number of words starting with an uppercase letter.

        BUG FIX: the original iterated character-by-character over the
        already-lowercased text, so it always returned 0; use the raw text.
        """
        return sum(1 for word in self._original_text.split()
                   if word and word[0].isupper())

    def get_positive_emoticons_number(self):
        """Number of distinct positive emoticons present in the text."""
        return self._get_ocurrences_number(self.POSITIVE_EMOTICONS)

    def get_neutral_emoticons_number(self):
        """Number of distinct neutral emoticons present in the text."""
        return self._get_ocurrences_number(self.NEUTRAL_EMOTICONS)

    def get_negative_emoticons_number(self):
        """Number of distinct negative emoticons present in the text."""
        return self._get_ocurrences_number(self.NEGATIVE_EMOTICONS)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib
import urllib.request
from xml.dom import minidom

from twitter import *

import settings
class TwitterHandler(object):
    """Thin wrapper around the third-party Twitter search client."""

    def __init__(self, query):
        self.query = query
        # Credentials come from the project-level settings module.
        auth = OAuth(settings.ACCESS_KEY, settings.ACCESS_SECRET,
                     settings.CONSUMER_KEY, settings.CONSUMER_SECRET)
        self.twitter = Twitter(auth=auth)

    def get_product_tweets(self):
        """Return the status objects of up to 5 tweets matching the query."""
        response = self.twitter.search.tweets(q=self.query, count=5)
        return response["statuses"]
class FileHandler(object):
    """Appends %-formatted rows to the output file configured in settings."""

    def __init__(self, row_format_in_file):
        # %-style format string with named placeholders, e.g.
        # "%(tweets_number)s, %(average_positive_words_number)s\n".
        self.row_format_in_file = row_format_in_file
        self.output_file = settings.OUTPUT_FILENAME

    def save(self, stored_data):
        """Format stored_data (a mapping) and append it as one row.

        BUG FIX: the original used the Python 2-only ``file()`` builtin and
        closed the handle manually; use open() with a context manager so the
        file is closed even if the format operation raises.
        """
        with open(self.output_file, "a") as output_file:
            output_file.write(self.row_format_in_file % stored_data)

    def clean(self):
        """Truncate the output file to empty."""
        with open(self.output_file, "w") as output_file:
            output_file.write('')
class DALHandler(object):
    """Queries the remote Dictionary of Affect in Language (DAL) service and
    classifies each queried word's emotional polarity."""

    NEGATIVE_WORD, NEUTRAL_WORD, POSITIVE_WORD = (-1, 0, 1)

    def __init__(self, word_list):
        """Build the DAL query URL for word_list (a list of words).

        BUG FIX: word_affect_list and polarity_list were CLASS attributes,
        so every DALHandler instance shared (and kept appending to) the same
        two lists; they are now per-instance.
        """
        self.word_affect_list = []
        self.polarity_list = []
        query = '+'.join(word_list)
        self.url = 'http://compling.org/cgi-bin/DAL_sentence_xml.cgi?sentence=%s' % query

    def _get_remote_document(self):
        """Fetch self.url and return the parsed XML DOM.

        BUG FIX: urllib.urlopen is Python 2-only; use urllib.request.
        """
        remote_doc = urllib.request.urlopen(self.url).read()
        return minidom.parseString(remote_doc)

    @staticmethod
    def get_polarity(valence):
        """Map a DAL valence (string or float, roughly 1..3) to -1/0/1."""
        normalization_factor = 3.0  # DAL valence tops out around 3
        result = float(valence) / normalization_factor
        if result < 0.5:
            return DALHandler.NEGATIVE_WORD
        if result > 0.8:
            return DALHandler.POSITIVE_WORD
        return DALHandler.NEUTRAL_WORD

    def make_word_affect_list(self):
        """Populate word_affect_list with (token, polarity) pairs and
        polarity_list with the polarities.  Polarity is None when DAL
        reports no valence for a word.  Performs a network request."""
        doc = self._get_remote_document()
        for word in doc.getElementsByTagName("word"):
            token_tag = word.getElementsByTagName('token')[0]
            emotion_measure_tag = word.getElementsByTagName('measure')[0]
            valence = emotion_measure_tag.getAttribute("valence")
            polarity = DALHandler.get_polarity(valence) if len(valence) > 0 else None
            self.word_affect_list.append(
                (token_tag.firstChild.data, polarity))
            self.polarity_list.append(polarity)

    def get_positive_word_number(self):
        """Count of words classified as positive."""
        return self.polarity_list.count(DALHandler.POSITIVE_WORD)

    def get_negative_word_number(self):
        """Count of words classified as negative."""
        return self.polarity_list.count(DALHandler.NEGATIVE_WORD)

    def get_neutral_word_number(self):
        """Count of words classified as neutral."""
        return self.polarity_list.count(DALHandler.NEUTRAL_WORD)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from handler import FileHandler, TwitterHandler, DALHandler | |
from analizer import TextualAnalizer | |
import settings | |
class MinedProductTweet(object):
    """Per-tweet feature record: lexical counters plus DAL word polarities."""

    def __init__(self, tweet_text):
        self.tweet_text = tweet_text
        # Every feature counter starts at zero until make_data() runs.
        self.positive_words_number = self.neutral_words_number = 0
        self.negative_words_number = self.stop_words_number = 0
        self.words_number = self.punctuation_marks_number = 0
        self.exclamation_marks_number = self.capitalized_words_number = 0
        self.positive_emoticons_number = self.neutral_emoticons_number = 0
        self.negative_emoticons_number = 0

    def make_data(self):
        """Compute the lexical features for this tweet, then query the DAL
        service for word-polarity counts (network call)."""
        text_stats = TextualAnalizer(self.tweet_text)
        # Call order matters: the analizer mutates its internal text while
        # tokenizing, so the simple counters are read first.
        self.stop_words_number = text_stats.get_stop_words_number()
        self.words_number = text_stats.get_words_number()
        self.punctuation_marks_number = text_stats.get_punctuation_marks_number()
        self.exclamation_marks_number = text_stats.get_exclamation_marks_number()
        self.capitalized_words_number = text_stats.get_capitalized_words_number()
        self.positive_emoticons_number = text_stats.get_positive_emoticons_number()
        self.neutral_emoticons_number = text_stats.get_neutral_emoticons_number()
        self.negative_emoticons_number = text_stats.get_negative_emoticons_number()
        dal = DALHandler(text_stats.get_words_list(exclude_stop_words=True))
        dal.make_word_affect_list()
        self.positive_words_number = dal.get_positive_word_number()
        self.neutral_words_number = dal.get_neutral_word_number()
        self.negative_words_number = dal.get_negative_word_number()
class MinedProduct(object):
    """Aggregates per-tweet features into averages for one product hashtag."""

    def __init__(self, hashtag):
        self.hashtag = hashtag
        # BUG FIX: product_tweet_list was a class attribute, so every
        # MinedProduct instance shared one ever-growing list.
        self.product_tweet_list = []
        self.tweets_number = 0
        self.retweet_percentage = 0
        self.price_from_amazon_seller = 0
        self.sell_raking = 0  # NOTE(review): probably "sell_ranking"; name kept for compatibility
        self.rating_by_clients = 0
        self.elapsed_time_since_release = 0
        self.average_positive_words_number = 0
        self.average_neutral_words_number = 0
        self.average_negative_words_number = 0
        self.average_stop_words_number = 0
        self.average_words_number = 0
        self.average_punctuation_marks_number = 0
        self.average_exclamation_marks_number = 0
        self.average_capitalized_marks_number = 0
        self.average_positive_emoticons_number = 0
        self.average_neutral_emoticons_number = 0
        self.average_negative_emoticons_number = 0
        self.acceptability = 0

    def make_data(self):
        """Fetch tweets for the hashtag, mine each one, and compute the
        average positive/neutral/negative word counts (network calls)."""
        handler = TwitterHandler(self.hashtag)
        tweet_results = handler.get_product_tweets()
        self.tweets_number = len(tweet_results)
        for tweet in tweet_results:
            # BUG FIX: the original .encode('utf-8') produced bytes, which
            # the text analizer cannot lowercase/tokenize as str on Python 3.
            product_tweet = MinedProductTweet(tweet['text'])
            product_tweet.make_data()
            self.product_tweet_list.append(product_tweet)
        # BUG FIX: with zero tweets the original unpacked zip(*[]) into
        # three names, raising ValueError; keep the zero defaults instead.
        if not self.product_tweet_list:
            return
        positives, neutrals, negatives = zip(
            *[(pt.positive_words_number, pt.neutral_words_number,
               pt.negative_words_number)
              for pt in self.product_tweet_list])
        self.average_positive_words_number = sum(positives) / len(positives)
        self.average_neutral_words_number = sum(neutrals) / len(neutrals)
        self.average_negative_words_number = sum(negatives) / len(negatives)

    def calculate_acceptability(self):
        # TODO: acceptability scoring not implemented yet.
        pass

    def save(self):
        """Append this product's summary row to the output file."""
        row_format_in_file = "%(tweets_number)s, %(average_positive_words_number)s, " \
                             "%(average_neutral_words_number)s, %(average_negative_words_number)s\n"
        handler = FileHandler(row_format_in_file)
        handler.save(self.__dict__)
class Miner(object):
    """Drives the full mining pipeline over every configured hashtag."""

    # Evaluated once at class-definition time from the settings module.
    product_hashtag_list = settings.ALL_PRODUCT_HASTAGS

    def perform_mining(self):
        """Mine and persist each configured product, one at a time."""
        for hashtag in self.product_hashtag_list:
            mined_product = MinedProduct(hashtag)
            mined_product.make_data()
            # mined_product.calculate_acceptability()
            mined_product.save()
# BUG FIX: the original ran the miner unconditionally at module level, so
# merely importing this module triggered network calls and file writes.
if __name__ == "__main__":
    miner = Miner()
    miner.perform_mining()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Twitter OAuth credentials -- placeholders; replace with real app values.
CONSUMER_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# File where FileHandler appends one summary row per mined product.
OUTPUT_FILENAME = "products.txt"
# Product hashtags the Miner iterates over.
# NOTE(review): "HASTAGS" looks like a typo for "HASHTAGS", but the name is
# referenced elsewhere (Miner.product_hashtag_list) -- keep as-is.
ALL_PRODUCT_HASTAGS = ['#gopro']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment