Created
May 18, 2011 07:16
-
-
Save pvieytes/978125 to your computer and use it in GitHub Desktop.
Parsing Twitter's user timeline with Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Tweet | |
import simplejson | |
import urllib2 | |
def read_tweets(user, num_tweets):
    """Fetch a user's timeline and return it as a list of Tweet objects.

    Requests ``statuses/user_timeline.json`` with ``include_rts=true`` for
    the given screen name and count, decodes the JSON body and builds one
    Tweet per entry, in response order.

    ``user`` and ``num_tweets`` are interpolated into the query string
    as-is, so ``user`` must already be URL-safe.

    Errors raised by ``urllib2.urlopen`` or ``simplejson.loads`` propagate
    to the caller, as does a ``KeyError`` for any mandatory field missing
    from an entry.
    """
    from contextlib import closing

    # ``import Tweet`` binds the *module*, which is not callable; pick the
    # class out of it, or use the name directly when it already is the class.
    tweet_cls = getattr(Tweet, 'Tweet', Tweet)

    # Adjacent literals instead of a backslash continuation inside the
    # string, so no indentation whitespace can leak into the URL.
    url = ("http://api.twitter.com/1/statuses/user_timeline.json?"
           "screen_name=%s&count=%s&include_rts=true" % (user, num_tweets))

    # Close the HTTP response even if reading it fails.
    with closing(urllib2.urlopen(url)) as response:
        content = response.read()

    tweets = []
    for js_tweet in simplejson.loads(content):
        tweet = tweet_cls()
        tweet.id = js_tweet['id']
        tweet.username = js_tweet['user']['screen_name']

        # An entry is a retweet only when it carries the original author's
        # screen name; only a missing/malformed field means "not a retweet".
        try:
            avatar_owner = js_tweet['retweeted_status']['user']
            tweet.retweet_user = avatar_owner['screen_name']
            tweet.retweeted = True
        except (KeyError, TypeError):
            avatar_owner = js_tweet['user']
            tweet.retweeted = False

        tweet.set_date(js_tweet['created_at'])

        # tweet.id, tweet.username must exist
        tweet.set_tweet_url()

        # convert plain text to html text
        tweet.set_text(js_tweet['text'])

        # tweet.username / tweet.retweet_user and tweet.retweeted must exist
        tweet.set_profile_url()

        # Retweets show the avatar of the original author.
        tweet.user_avatar_url = avatar_owner['profile_image_url']

        tweets.append(tweet)
    return tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from datetime import datetime | |
import re | |
class Tweet(object):
    """Store the tweet info.

    Every field starts as ``None`` and is filled in by the caller, either
    by direct assignment or through the ``set_*`` helpers below.
    """

    id = None               # tweet identifier, used by set_tweet_url()
    username = None         # screen name of the timeline owner
    url = None              # not written by any method of this class
    user_avatar_url = None  # assigned by the caller
    tweet_url = None        # written by set_tweet_url()
    profile_url = None      # written by set_profile_url()
    html_text = None        # written by set_text()
    retweeted = None        # truthy when the tweet is a retweet
    retweet_user = None     # screen name of the original author of a retweet
    date = None             # written by set_date()

    # e.g. "Tue Apr 26 08:57:55 +0000 2011"
    _CREATED_AT_FORMAT = "%a %b %d %H:%M:%S +0000 %Y"

    _PROFILE_URL = "http://www.twitter.com/%s"
    # "%23" is the URL-encoded "#"; the tag name is appended to this prefix.
    _HASHTAG_URL_PREFIX = "http://www.twitter.com/search?q=%23"

    # One pattern for everything that becomes a link. URLs come first so
    # that an "@" or "#" inside a URL is consumed as part of the URL. At
    # least one character is required after "@" / "#" so a bare symbol is
    # left untouched.
    _LINKABLE_RE = re.compile(
        r"(?P<url>https?://[^ ]+)"
        r"|(?P<user>@[0-9a-zA-Z+_]+)"
        r"|(?P<tag>#[0-9a-zA-Z+_]+)"
    )

    def set_date(self, date_str):
        """Convert string to datetime.

        ``date_str`` must look like ``Tue Apr 26 08:57:55 +0000 2011``; the
        ``+0000`` offset is matched literally. The result stored in
        ``self.date`` is a naive datetime holding exactly the parsed
        wall-clock values.

        Raises ValueError when ``date_str`` does not match the format.
        """
        # Parse straight into a datetime: going through time.mktime() and
        # datetime.fromtimestamp() would reinterpret the value in the
        # machine's local time zone and can shift it around DST changes.
        self.date = datetime.strptime(date_str, self._CREATED_AT_FORMAT)

    @staticmethod
    def _anchor(href, label):
        """Return an ``<a>`` element; quotes in ``href`` are escaped."""
        return '<a href="%s">%s</a>' % (href.replace('"', '&quot;'), label)

    def _linkify(self, match):
        """Build the replacement markup for one ``_LINKABLE_RE`` match."""
        token = match.group(0)
        if match.group('url'):
            return self._anchor(token, token)
        if match.group('user'):
            return self._anchor(self._PROFILE_URL % token[1:], token)
        return self._anchor(self._HASHTAG_URL_PREFIX + token[1:], token)

    def set_text(self, plain_text):
        """convert plain text into html text with http, user and hashtag links

        http/https URLs link to themselves, ``@name`` links to that user's
        profile page and ``#tag`` links to a search for the tag. The result
        is stored in ``self.html_text``. Text outside the links is copied
        unchanged (it is not HTML-escaped here).
        """
        # A single substitution pass: each token is wrapped exactly once,
        # so repeated mentions or tags are never wrapped twice and markup
        # produced for one token is never re-scanned for another.
        self.html_text = self._LINKABLE_RE.sub(self._linkify, plain_text)

    def set_profile_url(self):
        """Create the url profile.

        Points at the original author for a retweet, otherwise at the
        timeline owner.
        """
        if self.retweeted:
            self.profile_url = self._PROFILE_URL % self.retweet_user
        else:
            self.profile_url = self._PROFILE_URL % self.username

    def set_tweet_url(self):
        """Create the url of the tweet from ``username`` and ``id``."""
        self.tweet_url = "http://www.twitter.com/%s/status/%s" % (self.username, self.id)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Will the +0000 in the date format of the created_at field in the tweet JSON always remain the same? And is this format consistent across all of Twitter's APIs? I have heard a lot of chatter about the Search API putting the +0000 at the end of the string, rather than in the middle as in the format you've handled here, which is the same as the Streaming API's.