Skip to content

Instantly share code, notes, and snippets.

@huseyinyilmaz
Created April 1, 2011 10:23
Show Gist options
  • Save huseyinyilmaz/897979 to your computer and use it in GitHub Desktop.
Save huseyinyilmaz/897979 to your computer and use it in GitHub Desktop.
Simple twitter crawler that collects twitter user and user follower relationships.
#########################################################################################
# This script pulls twitter user information and follower relationships #
# using the twitter REST API and stores them in an sqlite database. #
# Before using it you have to run the name.sh file to create the sqlite3 database #
# that this script will use. #
# USAGE: #
# first create your database #
# $ ./name.sh #
# then add a user to your database as start point #
# $ python name.py yilmaz_huseyin # change twitter user name with any name you want. #
# after that run following command repeatedly #
# $ python name.py #
# this last command looks at the database and downloads the #
# followers of users, then adds the resulting users to the same database. #
# The twitter REST API has a limit of 150 requests/hour, so after you run this command a second time, #
# you have to wait about an hour. Every time the command is run, about #
# 10,000-20,000 users will be saved to the database. #
#########################################################################################
import urllib2
import simplejson
import sqlite3
import itertools
import sys
import logging
##########
# FIELDS #
##########
# API json fields for user
user_fields = [
    'id',
    'id_str',
    'name',
    'lang',
    'time_zone',
    'screen_name',
    'description',
    'created_at',
    'followers_count',
    'following',
    'friends_count',
    'statuses_count',
    'url',
]
# User table fields: every API field plus the local followers_processed flag
table_fields = user_fields + ['followers_processed']
################
# user queries #
################
# One "?" placeholder is generated per table column.
insert_user_query = "INSERT INTO user\n(%s)\nvalues\n(%s);" % (
    ",".join(table_fields),
    ",".join(["?"] * len(table_fields)),
)
insert_relationship_follower_query = (
    "INSERT INTO userfollowerdtl\n"
    "(user_id,follower_id)\n"
    "values\n"
    "(?,?);"
)
count_ralationship_query = (
    "SELECT count(*) from userfollowerdtl dtl "
    "where dtl.user_id=? and dtl.follower_id=?"
)
count_user_by_id_query = "SELECT count(*) from user where user.id = ?;"
# Selects the users whose followers_processed flag is still 0.
process_followers_query = (
    "SELECT %s FROM user WHERE user.followers_processed = 0"
    % ",".join(table_fields)
)
update_followers_processed_query = (
    "UPDATE user SET followers_processed = ? WHERE id = ?"
)
##########
# logger #
##########
# Module logger: records go to stdout and only WARNING and above are emitted.
# Change the level to logging.DEBUG to also see the debug/info messages.
log_formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - - %(funcName)s - line(%(lineno)d) :: %(message)s"
)
log_handler = logging.StreamHandler(sys.stdout)
log_handler.setFormatter(log_formatter)

logger = logging.getLogger(__name__)
logger.addHandler(log_handler)
logger.setLevel(logging.WARNING)
#########
# utils #
#########
class UserExistException(Exception):
    """Raised when the user being inserted is already stored in the user table."""
class RelationshipExistException(Exception):
    """Raised when a user/follower pair is already stored in userfollowerdtl."""
class CommitandExitException(Exception):
    """Raised to stop the follower scan early while keeping the work done so far."""
# Reference list instance; its type is compared against decoded API payloads.
empty_list = []
#############
# functions #
#############
def _is_valid(user):
    """Return True when *user* is a decoded user object carrying an 'id' key.

    Anything that is not a dict (None, a list, a string, ...) is reported as
    invalid instead of raising, so callers can use the result directly in an
    ``if`` regardless of what the remote server returned.
    """
    # ``in`` replaces the deprecated dict.has_key(); the isinstance guard keeps
    # non-dict JSON payloads from raising AttributeError.
    return isinstance(user, dict) and 'id' in user
def set_followers_processed(cursor, obj, value):
    """Store *value* in the followers_processed column of the user row obj['id'].

    cursor -- database cursor used to run the UPDATE statement
    obj    -- mapping describing the user; only its 'id' entry is read
    value  -- new value of the followers_processed flag
    """
    params = (value, obj['id'])
    cursor.execute(update_followers_processed_query, params)
def _add_user_object(cursor, obj):
    """Insert one decoded API user object into the user table.

    cursor -- database cursor used for the lookup and the insert
    obj    -- dict describing the user; must contain 'id'

    Raises UserExistException when a row with the same id is already stored.
    """
    # Fields missing from the API object are stored as NULL instead of raising
    # KeyError, so one incomplete user object does not abort the whole run.
    # The trailing False is the initial followers_processed flag.
    values = tuple([obj.get(key) for key in user_fields] + [False])
    logger.debug('values are : %s', repr(values))
    # count(*) always yields exactly one row holding one number.
    count = cursor.execute(count_user_by_id_query, (obj['id'],)).fetchone()[0]
    if count:
        logger.warning('user "%s" exist.', obj.get('screen_name'))
        raise UserExistException()
    cursor.execute(insert_user_query, values)
    logger.info('user was inserted with values: %s', repr(values))
def add_user(cursor, user_name):
    """Download one user by screen name and store it in the user table.

    cursor    -- database cursor handed on to _add_user_object
    user_name -- twitter screen name to look up

    Raises UserExistException (from _add_user_object) when the user is already
    stored, and a plain Exception when the server answers with an HTTP error
    or with something that is not a user object.
    """
    # Local imports: these names are not provided by the file's import block.
    from contextlib import closing
    from urllib import quote

    # The screen name comes from the command line, so escape it before it is
    # placed in the query string.
    url = "http://api.twitter.com/1/users/show.json?screen_name=%s" % quote(user_name, safe='')
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        raise Exception(e.read())
    # Close the HTTP response as soon as the body has been read.
    with closing(response):
        json_text = response.read()
    logger.debug('user data retreived from remote server: %s', json_text)
    user = simplejson.loads(json_text)
    if not _is_valid(user):
        logger.error('We could not retreive user from twitter returned value : %s', repr(user))
        raise Exception('User could not be retreived')
    _add_user_object(cursor, user)
def _add_relationship_follower(cursor, user, follower):
    """Record that *follower* follows *user* in the userfollowerdtl table.

    cursor   -- database cursor used for the lookup and the insert
    user     -- dict of the followed user ('id' and 'screen_name' are read)
    follower -- dict of the following user ('id' and 'screen_name' are read)

    Raises RelationshipExistException when the pair is already stored.
    """
    pair = (user['id'], follower['id'])
    # count(*) always yields exactly one row holding one number.
    (already_stored,) = cursor.execute(count_ralationship_query, pair).fetchone()
    if already_stored:
        logger.warning('This relationship is already exist in the database')
        raise RelationshipExistException()
    cursor.execute(insert_relationship_follower_query, pair)
    logger.info('%s - %s relationship was inserted', user['screen_name'], follower['screen_name'])
def _add_followers(cursor, obj):
    """Download the followers of one stored user and record them.

    Every valid follower object is inserted as a user (unless already stored)
    and linked to *obj* as its follower; afterwards *obj* is flagged as
    processed.

    cursor -- database cursor used for all inserts and updates
    obj    -- dict of the user whose followers are fetched ('id' and
              'screen_name' are read)

    Raises CommitandExitException on HTTP 400 and a plain Exception when the
    decoded response is not a list.
    """
    # Local import: this name is not provided by the file's import block.
    from contextlib import closing

    screen_name = obj['screen_name']
    url = "http://api.twitter.com/1/statuses/followers.json?screen_name=%s" % screen_name
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        body = e.read()
        if e.code == 401:
            # The list is not readable for us: flag the user as processed so
            # later scans do not select it again.
            logger.error('Not authorized to read users followers list [%s] (%s)', screen_name, body)
            set_followers_processed(cursor, obj, True)
        elif e.code == 400:
            logger.error('Twitter api user limit was reached. Try again in an hour. %s', body)
            raise CommitandExitException()
        else:
            # The user stays unprocessed, so the next scan selects it again.
            logger.error('Exception while trying to read users followers list [%s] (%s)', screen_name, body)
        return
    # Close the HTTP response as soon as the body has been read.
    with closing(response):
        json_text = response.read()
    logger.debug('followers data retreived from remote server: %s', json_text)
    followers = simplejson.loads(json_text)
    if not isinstance(followers, list):
        logger.error('We could not retreive followers from twitter. Returned value : %s', repr(followers))
        raise Exception('Followers could not be retrieved')
    for user in followers:
        if not _is_valid(user):
            logger.error('user objects are in wrong format: %s', repr(user))
            continue
        try:
            _add_user_object(cursor, user)
        except UserExistException:
            # Already stored; the relationship below is still recorded.
            pass
        try:
            _add_relationship_follower(cursor, obj, user)
        except RelationshipExistException:
            # The pair is already stored; nothing to add.
            pass
    set_followers_processed(cursor, obj, True)
def scan_for_followers(cursor):
    """Fetch followers for every user whose followers_processed flag is 0.

    cursor -- database cursor used for the query and handed to _add_followers
    """
    # Read all rows up front: the same cursor is passed on to _add_followers.
    pending = cursor.execute(process_followers_query).fetchall()
    for record in pending:
        _add_followers(cursor, dict(zip(table_fields, record)))
def main(argv=None):
    """Run the crawler once and return a process exit status.

    argv -- command line arguments without the program name; defaults to
            sys.argv[1:]

    With one argument the named twitter user is added to the database; with
    no argument the followers of all unprocessed users are downloaded.
    More than one argument is a usage error: nothing is run and 2 is returned.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) > 1:
        # Stop here instead of falling through to a follower scan.
        sys.stderr.write('use without arguments or with only a twitter screen_name.\n')
        return 2
    con = sqlite3.connect('name.db')
    try:
        # The connection context manager commits when the body finishes
        # normally and rolls back when an exception leaves it.
        with con:
            c = con.cursor()
            if args:
                try:
                    add_user(c, args[0])
                except UserExistException:
                    # The start user is already stored; nothing to do.
                    pass
            else:
                try:
                    scan_for_followers(c)
                except CommitandExitException:
                    # Scan stopped early; leave normally so the work is committed.
                    pass
                except Exception as e:
                    # Swallowed on purpose so the rows gathered so far are committed.
                    logger.error('An Error was occured. saving and closing the program: %s', repr(e))
    finally:
        # "with con" only ends the transaction; close the connection explicitly.
        con.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())
#!/bin/bash
# Creates the sqlite3 database file name.db with the two tables the crawler uses.
#
# user: one row per twitter user. followers_processed is the flag the crawler
# sets once it has handled that user's follower list.
# NOTE: the constraint must be spelled "PRIMARY KEY" (two words). Written as
# PRIMARY_KEY, SQLite reads it as part of the column's type name and the table
# gets no primary key at all.
sqlite3 name.db "CREATE TABLE user (id INTEGER PRIMARY KEY NOT NULL,\
id_str VARCHAR(500) NOT NULL,\
created_at DATE,\
description TEXT,\
followers_count INTEGER,\
following INTEGER,\
friends_count INTEGER,\
screen_name VARCHAR(255),\
statuses_count INTEGER,\
url VARCHAR(500),\
name VARCHAR(500),\
lang VARCHAR(500),\
time_zone VARCHAR(500),\
followers_processed BOOLEAN NOT NULL\
);"
# userfollowerdtl: one row per (user, follower) pair; the UNIQUE constraint
# rejects a pair that is already stored.
sqlite3 name.db "CREATE TABLE userfollowerdtl (id INTEGER PRIMARY KEY AUTOINCREMENT,\
user_id INTEGER NOT NULL REFERENCES user(id),\
follower_id INTEGER NOT NULL REFERENCES user(id),\
UNIQUE(user_id,follower_id)\
);"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment