Created
February 28, 2013 22:11
-
-
Save jbhardwaj/5060569 to your computer and use it in GitHub Desktop.
Pulls data from Twitter and writes to a .csv data file for use with Witter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import division | |
| import re, string, os, math, time, sys, random | |
| from operator import itemgetter | |
| import tweetstream | |
def strTimeProp(start, end, format, prop):
    """Return the time lying a given proportion of the way through an interval.

    start and end are strings formatted according to ``format``
    (strftime-style) and delimit the interval [start, end].

    prop is the fraction of the interval to advance past start:
    0 yields start, 1 yields end, 0.5 yields the midpoint.

    The result is rendered with the same ``format``.  Both conversions go
    through the machine's local time zone (time.mktime / time.localtime),
    so any field that ``format`` does not mention is dropped from the
    returned string.
    """
    # Convert both endpoints to seconds since the epoch (local time).
    stime = time.mktime(time.strptime(start, format))
    etime = time.mktime(time.strptime(end, format))

    # Linear interpolation between the two timestamps.
    ptime = stime + prop * (etime - stime)

    return time.strftime(format, time.localtime(ptime))
def randomDate(start, end, prop):
    """Return a date between start and end, formatted as '%m/%d/%Y'.

    start and end are date strings in '%m/%d/%Y' form.  prop is the
    fraction of the interval to advance past start; this function draws no
    random numbers itself, so pass e.g. random.random() for a random date.
    """
    return strTimeProp(start, end, '%m/%d/%Y', prop)
def randomDateTime(start, end, prop):
    """Return a date-time between start and end as '%m/%d/%Y %I:%M %p'.

    start and end are strings in '%m/%d/%Y %I:%M %p' form (12-hour clock
    with AM/PM).  prop is the fraction of the interval to advance past
    start; this function draws no random numbers itself, so pass e.g.
    random.random() for a random date-time.
    """
    return strTimeProp(start, end, '%m/%d/%Y %I:%M %p', prop)
# Matches every character that is not an ASCII letter, a digit or a space.
# NOTE(review): this pattern is not referenced anywhere in the visible script.
r = re.compile("[^a-zA-Z0-9 ]", re.UNICODE)

# Enter twitter credentials here
stream = tweetstream.SampleStream("USERNAME", "PASSWORD")

# Generates a user set of n users with unique random ids between 1 and 10000
# and join dates between 1/1/2008 and 1/1/2012.  One row per user is written
# to names.csv as "name, date, uid,".
n = 10
uids = []
with open('names.csv', 'w') as f:
    for tweet in stream:
        try:
            # Round-trip through ASCII so that names containing non-ASCII
            # characters raise and are skipped, as are stream items that
            # carry no user name at all (KeyError).
            name = tweet['user']['name'].encode('ascii').decode('ascii')
        except (KeyError, UnicodeError):
            continue

        # A comma or line break inside the name would corrupt the
        # comma-separated row, so flatten them out.
        name = ''.join(name.splitlines()).replace(',', ' ')

        # Re-draw until the id has not been handed out yet.
        uid = random.randint(1, 10000)
        while uid in uids:
            uid = random.randint(1, 10000)

        date = randomDate("1/1/2008", "1/1/2012", random.random())
        row = name + ", " + date + ", " + str(uid) + ","
        f.write(row + "\n")
        print(row)
        uids.append(uid)

        # Stop once exactly n users have been written.
        if len(uids) >= n:
            break

print("IDs: " + str(uids))
# Generates a random number (between 1 and 20) of tweets per user.  One row
# per tweet is written to weets.csv as "wid, uid, message, datetime", where
# wid is a unique random id between 1 and 1000000.
wids = []
with open('weets.csv', 'w') as f:
    for uid in uids:
        n = random.randint(1, 20)  # number of tweets for this user
        i = 0                      # tweets written so far for this user
        for tweet in stream:
            try:
                # Round-trip through ASCII so that texts containing
                # non-ASCII characters raise and are skipped, as are stream
                # items that carry no text at all (KeyError).
                text = tweet['text'].encode('ascii').decode('ascii')
            except (KeyError, UnicodeError):
                continue

            # Re-draw until the id has not been handed out yet.
            wid = random.randint(1, 1000000)
            while wid in wids:
                wid = random.randint(1, 1000000)

            stamp = randomDateTime("1/1/2008 1:00 AM", "1/1/2012 1:00 AM", random.random())

            # Collapse line breaks and replace commas so the text stays
            # inside a single comma-separated field.
            message = ''.join(text.splitlines()).replace(',', ' ')

            row = str(wid) + ", " + str(uid) + ", " + message + ", " + stamp
            f.write(row + "\n")
            i += 1
            print("[" + str(i) + "/" + str(n) + "] " + row + ", ")
            wids.append(wid)

            # Stop once exactly n tweets have been written for this user.
            if i >= n:
                break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment