ada
// Build a multipart payload from the upload form and POST it to the
// /search_trends endpoint; multiwordsSuccess handles the JSON response.
var form_data = new FormData($('#upload-file')[0]);
$.ajax({
    type: 'POST',
    url: '/search_trends',
    data: form_data,
    success: multiwordsSuccess,
    contentType: false,  // let the browser set the multipart boundary
    processData: false   // send the FormData object as-is
});
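The snippet assumes an HTML form with id upload-file (containing at least a file input named file) and a multiwordsSuccess callback defined elsewhere. For testing outside the browser, a minimal Python equivalent using requests might look like the sketch below; the file name and the local URL are hypothetical, assuming the Flask app that follows is running on its default port.

import requests

# Hypothetical test client for the AJAX call above: POST a plain-text file of
# tweets (one per line) to /search_trends as multipart form data.
with open('tweets.txt', 'rb') as f:          # hypothetical input file
    resp = requests.post('http://localhost:5000/search_trends',
                         files={'file': f})  # field name the app expects
print(resp.json())  # {multiword phrase: score}, ordered by score

The Flask application that serves these endpoints follows.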
import tweepy
import re
import collections
import subprocess
import json

from flask import Flask, render_template, request
from langdetect import detect

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = './'


def get_twitter_api():
    """Build an authenticated Tweepy client from credentials in keys.txt."""
    with open('keys.txt') as stream:
        keys = [line.strip() for line in stream]
    [consumer_key, consumer_secret, access_token, access_token_secret] = keys
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    return tweepy.API(auth)
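# keys.txt is assumed (from the unpacking above) to hold exactly four lines,
# in order: consumer key, consumer secret, access token, access token secret.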
@app.route('/')
def home():
    return render_template('home.html')


@app.route('/trends')
def get_trends():
    """Return the names of the topics currently trending in the US."""
    api = get_twitter_api()
    US_WOEID = '23424977'  # Yahoo! Where On Earth ID for the United States
    response = api.trends_place(id=US_WOEID)
    trends_names = [x['name'] for x in response[0]['trends']]
    return json.dumps(trends_names)
@app.route('/search_trends', methods=['GET', 'POST'])
def search_trends():
    # Either take tweets from an uploaded file, or fetch fresh ones for the
    # queries posted as JSON in the 'data' form field.
    if request.files.get('file'):
        tweets = [x.decode().strip()
                  for x in request.files['file'].stream.readlines()]
        tweets_file = 'tweets_upload.txt'
    else:
        tweets_file = 'tweets_trends.txt'
        api = get_twitter_api()
        trends_queries = json.loads(request.form['data'])
        tweets = []
        max_tweets = 20  # per query
        try:
            for i, query in enumerate(trends_queries):
                print("Getting trend {0}/{1}".format(i + 1, len(trends_queries)))
                tweets += [status.text for status in
                           tweepy.Cursor(api.search, lang='en',
                                         q=query).items(max_tweets)]
        except Exception:
            tweets = None

    if tweets:
        with open(tweets_file, 'w') as stream:
            stream.write("\n".join(tweets))

    # Run the CMU ARK Twitter POS tagger over the tweets file.
    p = subprocess.Popen(["../ark-tweet-nlp-0.3.2/runTagger.sh",
                          "--no-confidence", "--input-format", "text",
                          "--output-format", "pretsv", "--quiet", tweets_file],
                         stdout=subprocess.PIPE)
    (output, err) = p.communicate()
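    # With --no-confidence, each line of the tagger's pretsv output appears to
    # carry three tab-separated fields per tweet -- the space-separated tokens,
    # the matching POS tags, and the original text -- so the flattened output
    # is walked three fields at a time, keeping only tokens and tags.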
    token_list_temp = re.sub("\n", "\t", output.decode()).split("\t")
    token_list = []
    i = 0
    while i < len(token_list_temp):
        token_list.append(token_list_temp[i:i + 2])
        i += 3
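    # POS tags follow the ARK TweetNLP tagset: N common noun, ^ proper noun,
    # A adjective, V verb, R adverb, D determiner, P preposition, O pronoun,
    # T verb particle. Each pattern below is a tag sequence that qualifies a
    # 2-, 3- or 4-gram as a candidate multiword expression.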
    multiword_patterns = [
        ["^", "^"], ["N", "^"], ["^", "N"], ["N", "N"], ["A", "N"],
        ["A", "^"], ["V", "N"], ["V", "^"], ["V", "T"], ["R", "V"],
        ["V", "T", "T"], ["V", "T", "P"], ["V", "D", "N"], ["V", "D", "^"],
        ["N", "O", "N"], ["^", "O", "N"], ["N", "O", "^"], ["^", "O", "^"],
        ["D", "D", "N"], ["D", "D", "^"], ["N", "N", "N"], ["N", "N", "^"],
        ["N", "^", "N"], ["^", "N", "N"], ["N", "^", "^"], ["^", "N", "^"],
        ["^", "^", "N"], ["^", "^", "^"], ["A", "N", "N"], ["A", "N", "^"],
        ["A", "^", "N"], ["A", "^", "^"], ["N", "A", "N"], ["^", "A", "^"],
        ["N", "A", "^"], ["^", "A", "N"], ["A", "A", "N"], ["A", "A", "^"],
        ["N", "P", "N"], ["^", "P", "N"], ["N", "P", "^"], ["^", "P", "^"],
        ["N", "P", "A", "N"], ["^", "P", "A", "N"], ["N", "P", "A", "^"],
        ["^", "P", "A", "^"], ["N", "P", "D", "N"], ["^", "P", "D", "N"],
        ["N", "P", "D", "^"], ["^", "P", "D", "^"], ["N", "P", "N", "N"],
        ["^", "P", "N", "N"], ["N", "P", "^", "N"], ["N", "P", "N", "^"],
        ["N", "P", "^", "^"], ["^", "P", "N", "^"], ["^", "P", "^", "N"],
        ["^", "P", "^", "^"], ["N", "N", "P", "N"], ["N", "N", "P", "^"],
        ["N", "^", "P", "N"], ["^", "N", "P", "N"], ["^", "^", "P", "N"],
        ["^", "N", "P", "^"], ["N", "^", "P", "^"], ["^", "^", "P", "^"]]
    dict_multiword = collections.defaultdict(int)
    dict_word = collections.defaultdict(int)
    words_total = 0  # running total of content words seen
    last_tweet = ''
    for group in token_list:
        if len(group) == 2:
            [tweet, tag] = group
            # Skip consecutive duplicate tweets.
            if last_tweet != tweet:
                words = tweet.split()
                tags = tag.split()
                # Sliding window over the last three (word, tag) pairs.
                word1, word2, word3 = '', '', ''
                tag1, tag2, tag3 = '', '', ''
                for i in range(len(words)):
                    word = words[i].lower()
                    tag = tags[i]
                    # Count content words individually.
                    if tag in ["N", "A", "V", "R", "P", "O"]:
                        dict_word[word] += 1
                        words_total += 1
                    # Count every 2-, 3- and 4-gram whose tag sequence matches.
                    if [tag1, tag] in multiword_patterns:
                        multiword = word1 + " " + word
                        dict_multiword[multiword] += 1
                    if [tag2, tag1, tag] in multiword_patterns:
                        multiword = word2 + " " + word1 + " " + word
                        dict_multiword[multiword] += 1
                    if [tag3, tag2, tag1, tag] in multiword_patterns:
                        multiword = word3 + " " + word2 + " " + word1 + " " + word
                        dict_multiword[multiword] += 1
                    word3, tag3 = word2, tag2
                    word2, tag2 = word1, tag1
                    word1, tag1 = word, tag
            last_tweet = tweet
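    # Score each candidate phrase: start from its own count and subtract, for
    # each component word, how often that word appears outside the phrase
    # (dict_word[word] - val). E.g. a phrase seen 5 times whose two words occur
    # 6 and 7 times overall scores 5 - (6 - 5) - (7 - 5) = 2, so only phrases
    # whose words rarely appear on their own stay positive.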
    dict_multiword_score = collections.defaultdict(int)
    for key, val in dict_multiword.items():
        words = key.split()
        score = val
        for word in words:
            if word in dict_word:
                score -= dict_word[word] - val
        if score > 0:
            # Keep only phrases that langdetect recognises as English.
            lang = ''
            try:
                lang = detect(key)
            except Exception:
                pass
            if lang == 'en':
                dict_multiword_score[key] = score

    # Order the surviving phrases by score, highest first.
    ordered_dict_multiword_score = collections.OrderedDict(
        sorted(dict_multiword_score.items(), key=lambda t: t[1], reverse=True))
    return json.dumps(ordered_dict_multiword_score)


if __name__ == '__main__':
    app.debug = True
    app.run()
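A minimal sketch of the intended round trip, assuming the app is running locally on Flask's default port and valid Twitter credentials sit in keys.txt; this client is illustrative, not part of the gist.

import json
import requests

BASE = 'http://localhost:5000'  # assumed local Flask default

# Fetch the current US trend names...
trends = json.loads(requests.get(BASE + '/trends').text)

# ...then have the server search a few of them and extract scored phrases.
resp = requests.post(BASE + '/search_trends',
                     data={'data': json.dumps(trends[:3])})
for phrase, score in resp.json().items():
    print(score, phrase)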