Created
April 1, 2011 10:23
-
-
Save huseyinyilmaz/897979 to your computer and use it in GitHub Desktop.
Simple twitter crawler that collects twitter user and user follower relationships.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#########################################################################################
# This script pulls twitter user information and follower relationships
# using the twitter REST API and stores them in an sqlite database.
# Before using it you have to run the name.sh file to create the sqlite3 database
# that this script will use.
# USAGE:
# first create your database
# $ ./name.sh
# then add a user to your database as a starting point
# $ python name.py yilmaz_huseyin # replace the twitter user name with any name you want.
# after that run the following command repeatedly
# $ python name.py
# this last command looks at the database, downloads the followers
# of the stored users, then adds the resulting users to the same database.
# The Twitter REST API has a limit of 150 requests/hour, so after you run this command
# a second time you have to wait about an hour. Each run saves about
# 10,000-20,000 users to the database.
#########################################################################################
import itertools
import logging
import sqlite3
import sys
import urllib
import urllib2

import simplejson
########## | |
# FIELDS # | |
########## | |
# API json fields for user
user_fields = [
    'id', 'id_str', 'name', 'lang', 'time_zone', 'screen_name',
    'description', 'created_at', 'followers_count', 'following',
    'friends_count', 'statuses_count', 'url',
]

# User table fields: every API field plus the local followers_processed
# flag. Derived from user_fields so the two lists cannot drift apart.
table_fields = user_fields + ['followers_processed']

################
# user queries #
################
# Column names are taken from the fixed lists above, never from external
# input; row values are always bound through "?" placeholders.
insert_user_query = """INSERT INTO user
                       (%s)
                       values
                       (%s);""" % (
    ','.join(table_fields),
    ','.join(itertools.repeat('?', len(table_fields))),
)

insert_relationship_follower_query = """INSERT INTO userfollowerdtl
                                        (user_id,follower_id)
                                        values
                                        (?,?);"""

# NOTE: the misspelt name ("ralationship") is kept on purpose because other
# functions in this file refer to it.
count_ralationship_query = "SELECT count(*) from userfollowerdtl dtl where dtl.user_id=? and dtl.follower_id=?"

count_user_by_id_query = "SELECT count(*) from user where user.id = ?;"

# Selects every stored user whose followers have not been downloaded yet.
process_followers_query = "SELECT %s FROM user WHERE user.followers_processed = 0" % ','.join(table_fields)

update_followers_processed_query = "UPDATE user SET followers_processed = ? WHERE id = ?"
########## | |
# logger # | |
########## | |
# Module logger: records are written to stdout, tagged with timestamp,
# logger name, level, calling function and line number.
log_formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - - %(funcName)s - line(%(lineno)d) :: %(message)s"
)
log_handler = logging.StreamHandler(sys.stdout)
log_handler.setFormatter(log_formatter)

logger = logging.getLogger(__name__)
logger.addHandler(log_handler)
# Only warnings and errors are emitted; use logging.DEBUG here to also see
# the debug/info messages.
logger.setLevel(logging.WARNING)
######### | |
# utils # | |
######### | |
class UserExistException(Exception):
    """Raised when the user being inserted already has a row in the user table."""
class RelationshipExistException(Exception):
    """Raised when the user/follower pair being inserted is already stored."""
class CommitandExitException(Exception):
    """Raised to stop the follower scan early while keeping the work done so far.

    The main block catches and ignores it, so the surrounding
    ``with con:`` block still exits normally.
    """
# Reference list instance; its type is what the followers payload is compared against.
empty_list = []
############# | |
# functions # | |
############# | |
def _is_valid(user): | |
return user and user.has_key('id') | |
def set_followers_processed(cursor, obj, value):
    """Write *value* into the followers_processed column of the row for obj['id'].

    Args:
        cursor: database cursor used to run the update.
        obj: mapping that holds the user's 'id'.
        value: new value for the followers_processed flag.
    """
    params = (value, obj['id'])
    cursor.execute(update_followers_processed_query, params)
def _add_user_object(cursor, obj):
    """Insert the user described by *obj* into the user table.

    Args:
        cursor: database cursor used for the lookup and the insert.
        obj: mapping holding every key listed in user_fields.

    Raises:
        KeyError: if *obj* lacks one of the user_fields keys.
        UserExistException: if a row with obj['id'] is already stored.
    """
    # One value per API field, followed by followers_processed = False.
    row_values = tuple([obj[field] for field in user_fields] + [False])
    logger.debug('values are : %s'%repr(row_values))

    # count(*) always yields exactly one single-column row.
    existing = cursor.execute(count_user_by_id_query, (obj['id'],)).fetchone()[0]
    if existing:
        logger.warning('user "%s" exist.'%obj['screen_name'])
        raise UserExistException()

    cursor.execute(insert_user_query, row_values)
    logger.info('user was inserted with values: %s'%repr(row_values))
def add_user(cursor, user_name):
    """Download the profile of *user_name* and store it in the user table.

    Args:
        cursor: database cursor handed on to _add_user_object.
        user_name: twitter screen name to look up.

    Raises:
        Exception: if the server answers with an HTTP error (the message is
            the response body) or the returned object is not a valid user.
        UserExistException: propagated from _add_user_object when the user
            is already stored.
    """
    # user_name comes from outside the program, so percent-encode it before
    # placing it in the query string; plain screen names pass unchanged.
    url = "http://api.twitter.com/1/users/show.json?screen_name=%s" % urllib.quote(user_name, safe='')
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        raise Exception(e.read())

    # Always release the connection, even if reading fails.
    try:
        json_text = response.read()
    finally:
        response.close()
    logger.debug('user data retreived from remote server: %s'%json_text )

    user = simplejson.loads(json_text)
    if not _is_valid(user):
        logger.error('We could not retreive user from twitter returned value : %s'%repr(user))
        raise Exception('User could not be retreived')
    _add_user_object(cursor, user)
def _add_relationship_follower(cursor, user, follower):
    """Record that *follower* follows *user* in the userfollowerdtl table.

    Args:
        cursor: database cursor used for the lookup and the insert.
        user: mapping with the followed user's 'id' and 'screen_name'.
        follower: mapping with the follower's 'id' and 'screen_name'.

    Raises:
        RelationshipExistException: if the (user, follower) pair is
            already stored.
    """
    pair = (user['id'], follower['id'])

    # count(*) always yields exactly one single-column row.
    existing = cursor.execute(count_ralationship_query, pair).fetchone()[0]
    if existing:
        logger.warning('This relationship is already exist in the database')
        raise RelationshipExistException()

    cursor.execute(insert_relationship_follower_query, pair)
    logger.info('%s - %s relationship was inserted'%(user['screen_name'],follower['screen_name']))
def _add_followers(cursor, obj):
    """Download the followers of *obj* and store them with their relationships.

    Each valid follower is inserted as a user (unless already stored) and
    linked to *obj*; afterwards *obj* is flagged as processed.

    Args:
        cursor: database cursor used for all reads and writes.
        obj: mapping with at least the user's 'id' and 'screen_name'.

    Raises:
        CommitandExitException: on HTTP 400, which this code treats as the
            API request limit being reached.
        Exception: if the response body is not a JSON list.
    """
    screen_name = obj['screen_name']
    try:
        response = urllib2.urlopen("http://api.twitter.com/1/statuses/followers.json?screen_name=%s"%screen_name)
    except urllib2.HTTPError as e:
        if e.code == 401:
            # The follower list is not readable for this user: flag the
            # user as processed so it is not selected again.
            logger.error('Not authorized to read users followers list [%s] (%s)'%(screen_name,e.read()))
            set_followers_processed(cursor, obj, True)
            return
        if e.code == 400:
            logger.error('Twitter api user limit was reached. Try again in an hour. %s'%e.read())
            raise CommitandExitException()
        # Any other HTTP error: leave the user unprocessed and move on.
        logger.error('Exception while trying to read users followers list [%s] (%s)'%(screen_name,e.read()))
        return

    # Always release the connection, even if reading fails.
    try:
        json_text = response.read()
    finally:
        response.close()
    logger.debug('followers data retreived from remote server: %s'%json_text )

    followers = simplejson.loads(json_text)
    if not isinstance(followers, list):
        logger.error('We could not retreive followers from twitter. Returned value : %s'%repr(followers))
        raise Exception('Followers could not be retreived')

    for user in followers:
        if not _is_valid(user):
            logger.error('user objects are in wrong format: %s'%repr(user))
            continue
        try:
            _add_user_object(cursor, user)
        except UserExistException:
            # Already stored; the relationship below may still be new.
            pass
        try:
            _add_relationship_follower(cursor, obj, user)
        except RelationshipExistException:
            # Already stored; nothing to do.
            pass
    set_followers_processed(cursor, obj, True)
def scan_for_followers(cursor):
    """Fetch followers for every stored user not yet flagged as processed.

    Args:
        cursor: database cursor; used for the initial select and then passed
            to _add_followers for each pending user.
    """
    # The pending rows are read completely before the loop starts, because
    # the same cursor is reused by _add_followers inside the loop.
    pending_rows = cursor.execute(process_followers_query).fetchall()
    for row in pending_rows:
        # Rebuild a field-name -> value mapping from the selected columns.
        _add_followers(cursor, dict(zip(table_fields, row)))
if __name__ == "__main__":
    # Usage:
    #   python name.py <screen_name>   -> add one user as a starting point
    #   python name.py                 -> download followers of stored users
    args = sys.argv[1:]
    if len(args) > 1:
        print('use without arguments with only a twitter screen_name.')
        # Invalid usage must not fall through to the follower scan.
        sys.exit(1)

    con = sqlite3.connect('name.db')
    try:
        # ``with con`` commits when the block exits normally and rolls back
        # when an exception escapes; it does not close the connection.
        with con:
            c = con.cursor()
            if len(args) == 1:
                try:
                    add_user(c, args[0])
                except UserExistException:
                    # The user is already stored; nothing to add.
                    pass
            else:
                try:
                    scan_for_followers(c)
                except CommitandExitException:
                    # Scan stopped early on purpose; keep what was collected.
                    pass
                except Exception as e:
                    # Log and leave the block normally so the rows gathered
                    # before the failure are still committed.
                    logger.error('An Error was occured. saving and closing the program: %s'%repr(e))
    finally:
        con.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Creates the sqlite3 database file name.db with the two tables used by the
# crawler script: "user" and "userfollowerdtl".

# user: one row per twitter user.
# "PRIMARY KEY" must be two words: written as PRIMARY_KEY, SQLite reads it
# as part of the column's type name and creates no primary key at all.
sqlite3 name.db "CREATE TABLE user (id INTEGER PRIMARY KEY NOT NULL,\
                                    id_str VARCHAR(500) NOT NULL,\
                                    created_at DATE,\
                                    description TEXT,\
                                    followers_count INTEGER,\
                                    following INTEGER,\
                                    friends_count INTEGER,\
                                    screen_name VARCHAR(255),\
                                    statuses_count INTEGER,\
                                    url VARCHAR(500),\
                                    name VARCHAR(500),\
                                    lang VARCHAR(500),\
                                    time_zone VARCHAR(500),\
                                    followers_processed BOOLEAN NOT NULL\
                                    );"

# userfollowerdtl: one row per (user, follower) pair; each pair is unique.
sqlite3 name.db "CREATE TABLE userfollowerdtl (id INTEGER PRIMARY KEY AUTOINCREMENT,\
                                               user_id INTEGER NOT NULL REFERENCES user(id),\
                                               follower_id INTEGER NOT NULL REFERENCES user(id),\
                                               UNIQUE(user_id,follower_id)\
                                               );"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment