manuelbua · August 22, 2014 12:56
diff --git a/analyze.py b/analyze.py
 '''
 A script for analyzing twitter stats on Ferguson
 '''
 import json
 import re

 import tweepy

 def get_api():
    '''
    Build a tweepy OAuth handler from the credentials in the ``config`` file.

    The first four lines of ``config`` are read, in order, as the API key,
    the API secret, the access token and the access token secret.
    Surrounding whitespace is stripped from each line.

    Returns the ``tweepy.OAuthHandler`` with its access token set.
    '''
    with open('config') as config_file:
        credentials = [config_file.readline().strip() for _ in range(4)]
    api_key, api_secret, access_token, access_token_secret = credentials
    handler = tweepy.OAuthHandler(api_key, api_secret)
    handler.set_access_token(access_token, access_token_secret)
    return handler


 class CustomStreamListener(tweepy.StreamListener):
    '''
    Stream listener that counts how often each word occurs in the
    statuses it receives.

    Attributes:
        count: number of statuses handled so far.
        common: set of words, read one per line from the ``common``
            file, that are never counted.
        all_words: dict mapping each counted word to its number of
            occurrences.
        pattern: compiled regex matching every character that is not a
            word character, an apostrophe or ``#``.
    '''
    def __init__(self, *args, **kwargs):
        '''
        Forward all arguments to the parent listener, reset the counters
        and load the excluded words from the ``common`` file.
        '''
        super(CustomStreamListener, self).__init__(*args, **kwargs)
        self.count = 0
        with open('common') as f:
            self.common = set(line.strip() for line in f)
        self.all_words = {}
        # Raw string: the backslash in \w is meant for the regex engine.
        self.pattern = re.compile(r"[^\w'#]")

    def on_status(self, status):
        '''
        Tally the words of one incoming status.

        Prints a progress line, increments ``count``, then splits the
        lower-cased ``status.text`` on whitespace. Tokens containing
        'http' or '@' are skipped whole. In every other token each
        character matched by ``pattern`` becomes a space, and the
        resulting fragments are counted in ``all_words`` when they are
        longer than two characters, not purely digits and not listed in
        ``common``.
        '''
        print('Got a tweet')
        self.count += 1
        for token in status.text.lower().split():
            # Links and mentions have to be rejected before punctuation is
            # stripped: the pattern replaces '@', ':', '/' and '.', after
            # which user names and URL fragments look like ordinary words.
            if 'http' in token or '@' in token:
                continue
            for word in self.pattern.sub(' ', token).split():
                # split() never yields empty or whitespace-only strings,
                # so only length, digits and the stop list need checking.
                if len(word) > 2 and not word.isdigit() and \
                        word not in self.common:
                    self.all_words[word] = self.all_words.get(word, 0) + 1


 if __name__ == '__main__':
    # Stream tweets matching 'Ferguson' until interrupted with Ctrl-C, then
    # print the number of tweets seen and save the word counts as JSON.
    listener = CustomStreamListener()
    try:
        auth = get_api()
        streaming_api = tweepy.Stream(auth, listener)
        streaming_api.filter(track=['Ferguson'])
    except KeyboardInterrupt:
        summary = ('----TOTAL TWEETS----', listener.count,
                   '--------------------')
        for line in summary:
            print(line)
        with open('word_data.json', 'w') as f:
            # Trailing newline matches what a print to the file would emit.
            f.write(json.dumps(listener.all_words, indent=4) + '\n')
diff --git a/common b/common
 the
 be
 to
 of
 and
 a
 in
 that
 have
 I
 it
 for
 not
 on
 with
 he
 as
 you
 do
 at
 this
 but
 his
 by
 from
 they
 we
 say
 her
 she
 or
 an
 will
 my
 one
 all
 would
 there
 their
 what
 so
 up
 out
 if
 about
 who
 get
 which
 go
 me
 when
 make
 can
 like
 time
 no
 just
 him
 know
 take
 person
 into
 year
 your
 good
 some
 could
 them
 see
 other
 than
 then
 now
 look
 only
 come
 its
 over
 think
 also
 back
 after
 use
 two
 how
 our
 work
 first
 well
 way
 even
 new
 want
 because
 any
 these
 give
 day
 most
 us
 i'll
 i'm
 until
 ha
 haha
 hahaha
 hahahaha
 hahahahaha
 hi
 rt
 re
 omg
 omgg
 omggg
 omgggg
 omggggg
 oh
 ohh
 ohhh
 was
 wtf
 said
 done
 else
 else's
 le
 such
 via
 que
 let
 still
 real
diff --git a/convert.py b/convert.py
 '''
 Convert the JSON data into a
 large block of text for parsing
 '''
 import json

 # Load the word -> count mapping. The context manager closes the file
 # even when the JSON fails to parse.
 with open('word_data.json') as f:
    data = json.load(f)
 # Emit every word once per counted occurrence, each copy followed by a
 # space. The pieces are joined in a single pass instead of growing the
 # string with += inside a nested loop.
 pieces = []
 for word, count in data.items():
    # A count of zero or less contributes nothing (str * n is '' then).
    pieces.append((word + ' ') * count)
 final_str = ''.join(pieces)
 with open('word_block.txt', 'w') as f:
    f.write(final_str)
	'''
	A script for analyzing twitter stats on Ferguson
	'''
	import json
	import re

	import tweepy

	def get_api():
	    '''
	    Create a tweepy OAuth handler from the ``config`` file.

	    Reads the API key, API secret, access token and access token
	    secret from the first four lines of ``config`` (whitespace
	    stripped) and returns the ``tweepy.OAuthHandler`` with the
	    access token set.
	    '''
	    with open('config') as f:
	        api_key = f.readline().strip()
	        api_secret = f.readline().strip()
	        access_token = f.readline().strip()
	        access_token_secret = f.readline().strip()
	    auth = tweepy.OAuthHandler(api_key, api_secret)
	    auth.set_access_token(access_token, access_token_secret)
	    return auth


	class CustomStreamListener(tweepy.StreamListener):
	    '''
	    Stream listener that counts how often each word occurs in the
	    statuses it receives.

	    Attributes:
	        count: number of statuses handled so far.
	        common: set of words, read one per line from the ``common``
	            file, that are never counted.
	        all_words: dict mapping each counted word to its number of
	            occurrences.
	        pattern: compiled regex matching every character that is not
	            a word character, an apostrophe or ``#``.
	    '''
	    def __init__(self, *args, **kwargs):
	        '''
	        Forward all arguments to the parent listener, reset the
	        counters and load the excluded words from ``common``.
	        '''
	        # Variadic signature so the listener can be built with no
	        # arguments, which is how the script entry point creates it.
	        super(CustomStreamListener, self).__init__(*args, **kwargs)
	        self.count = 0
	        with open('common') as f:
	            self.common = set(line.strip() for line in f)
	        self.all_words = {}
	        # Raw string: the backslash in \w is meant for the regex engine.
	        self.pattern = re.compile(r"[^\w'#]")

	    def on_status(self, status):
	        '''
	        Tally the words of one incoming status.

	        Prints a progress line, increments ``count``, then splits the
	        lower-cased ``status.text`` on whitespace. Tokens containing
	        'http' or '@' are skipped whole. In every other token each
	        character matched by ``pattern`` becomes a space, and the
	        resulting fragments are counted in ``all_words`` when they
	        are longer than two characters, not purely digits and not
	        listed in ``common``.
	        '''
	        print('Got a tweet')
	        self.count += 1
	        for token in status.text.lower().split():
	            # Links and mentions have to be rejected before the
	            # punctuation is stripped: the pattern replaces '@', ':',
	            # '/' and '.', after which user names and URL fragments
	            # look like ordinary words.
	            if 'http' in token or '@' in token:
	                continue
	            for word in self.pattern.sub(' ', token).split():
	                # split() never yields empty or whitespace-only
	                # strings, so those cases need no separate test.
	                if len(word) > 2 and not word.isdigit() and \
	                        word not in self.common:
	                    self.all_words[word] = self.all_words.get(word, 0) + 1


	if __name__ == '__main__':
	    # Stream tweets matching 'Ferguson' until interrupted with Ctrl-C,
	    # then print the number of tweets seen and save the word counts.
	    l = CustomStreamListener()
	    try:
	        auth = get_api()
	        streaming_api = tweepy.Stream(auth, l)
	        streaming_api.filter(track=['Ferguson'])
	    except KeyboardInterrupt:
	        print('----TOTAL TWEETS----')
	        print(l.count)
	        print('--------------------')
	        json_data = json.dumps(l.all_words, indent=4)
	        with open('word_data.json', 'w') as f:
	            # Trailing newline matches what a print to the file emits.
	            f.write(json_data + '\n')
	the
	be
	to
	of
	and
	a
	in
	that
	have
	I
	it
	for
	not
	on
	with
	he
	as
	you
	do
	at
	this
	but
	his
	by
	from
	they
	we
	say
	her
	she
	or
	an
	will
	my
	one
	all
	would
	there
	their
	what
	so
	up
	out
	if
	about
	who
	get
	which
	go
	me
	when
	make
	can
	like
	time
	no
	just
	him
	know
	take
	person
	into
	year
	your
	good
	some
	could
	them
	see
	other
	than
	then
	now
	look
	only
	come
	its
	over
	think
	also
	back
	after
	use
	two
	how
	our
	work
	first
	well
	way
	even
	new
	want
	because
	any
	these
	give
	day
	most
	us
	i'll
	i'm
	until
	ha
	haha
	hahaha
	hahahaha
	hahahahaha
	hi
	rt
	re
	omg
	omgg
	omggg
	omgggg
	omggggg
	oh
	ohh
	ohhh
	was
	wtf
	said
	done
	else
	else's
	le
	such
	via
	que
	let
	still
	real
	'''
	Convert the JSON data into a
	large block of text for parsing
	'''
	import json

	# Load the word -> count mapping. The context manager closes the file
	# even when the JSON fails to parse.
	with open('word_data.json') as f:
	    data = json.load(f)
	# Emit every word once per counted occurrence, each copy followed by a
	# space. The pieces are joined in a single pass instead of growing the
	# string with += inside a nested loop.
	pieces = []
	for word, count in data.items():
	    # A count of zero or less contributes nothing (str * n is '' then).
	    pieces.append((word + ' ') * count)
	final_str = ''.join(pieces)
	with open('word_block.txt', 'w') as f:
	    f.write(final_str)