Skip to content

Instantly share code, notes, and snippets.

@erogol
Created December 25, 2013 22:34
Show Gist options
  • Save erogol/8127619 to your computer and use it in GitHub Desktop.
Save erogol/8127619 to your computer and use it in GitHub Desktop.
Scrape a Twitter wall, given the search page address or by simulating a browser via mechanize
'''
Written by:
Eren Golge - [email protected]
'''
import json
import pdb
import urllib2
import mechanize
import cookielib
import re
from BeautifulSoup import BeautifulSoup
import csv
import os
def set_mechanize():
    '''
    Build a mechanize browser configured to emulate a regular browser
    Outputs:
    the mechanize.Browser instance, with a cookie jar attached, robots.txt
    handling switched off and a Firefox User-agent header set
    '''
    browser = mechanize.Browser()

    # keep cookies between requests
    browser.set_cookiejar(cookielib.LWPCookieJar())

    # protocol handling (used to emulate a browser)
    browser.set_handle_equiv(True)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)

    # all mechanize debug switches are turned off
    browser.set_debug_http(False)
    browser.set_debug_responses(False)
    browser.set_debug_redirects(False)

    # refresh handling; max_time = 1 is handed to mechanize unchanged
    refresh_processor = mechanize._http.HTTPRefreshProcessor()
    browser.set_handle_refresh(refresh_processor, max_time=1)

    user_agent = ('Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                  'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
    browser.addheaders = [('User-agent', user_agent)]
    return browser
def login_twitter(br, name=None, password=None):
    '''
    Login to twitter and return the wall content
    Inputs:
    br - mechanize instance configured initially
    name - user name or e-mail of the account; when omitted it is read
    from the TWITTER_USERNAME environment variable
    password - password of the account; when omitted it is read from the
    TWITTER_PASSWORD environment variable
    Outputs:
    wall - the raw html content returned after the login form is submitted
    Raises:
    ValueError - when the user name or the password is not available
    '''
    # credentials must not live in the source code: take them from the
    # caller or, failing that, from the environment
    if name is None:
        name = os.environ.get('TWITTER_USERNAME')
    if password is None:
        password = os.environ.get('TWITTER_PASSWORD')
    if not name or not password:
        raise ValueError('twitter credentials are missing: pass name/password '
                         'or set TWITTER_USERNAME and TWITTER_PASSWORD')

    # https, so that the credentials are not posted in clear text
    theurl = 'https://twitter.com/login'
    br.open(theurl)
    # the login form is taken to be the first form of the page
    br.select_form(nr=0)
    br["session[username_or_email]"] = name
    # NOTE(review): field name 'password' is kept from the original code;
    # confirm it against the actual login form
    br['password'] = password
    wall = br.submit().read()
    return wall
def load_page(url_addr):
    '''
    Download a page and return its raw content
    Inputs:
    url_addr - address of the page to be fetched
    Outputs:
    the response body as returned by read()
    '''
    response = urllib2.urlopen(url_addr)
    try:
        return response.read()
    finally:
        # release the connection even if read() fails
        response.close()
def scrap_wall(wall):
    '''
    Given the twitter wall content, it scraps the images
    and the twitter texts
    Inputs:
    wall - raw html of the twitter wall
    Outputs:
    data_list - A list of dictionaries where each dict
    includes meta information for each image twit:
    'imgs' (large image url), 'twits' (twit text) and 'links' (href of the
    twit permalink), all utf8 encoded. Only the first image link of a twit
    is stored, although every image is counted in the printed total.
    A twit whose text or permalink container cannot be found is reported
    on stdout and skipped instead of raising IndexError.
    '''
    # compiled once, the same pattern is used for every twit container
    large_img_pattern = re.compile("jpg:large$")

    data_list = []
    total_img_num = 0
    wall_soup = BeautifulSoup(wall)
    for div in wall_soup.findAll('div', {'class': 'content'}):
        # a single query is enough: the original ran the very same search
        # twice, so its "img links are changed" branch could never trigger
        img_links = div.findAll('a', {'data-resolved-url-large': large_img_pattern})
        if not img_links:
            # twit without an embedded image, nothing to collect
            continue
        total_img_num += len(img_links)
        img_link = img_links[0].get('data-resolved-url-large')

        twit_texts = div.findAll('p', {'class': 'js-tweet-text tweet-text'})
        if not twit_texts:
            print('ERROR: twitter text container is changed !!!')
            continue
        twit_text = twit_texts[0].getText()
        print(twit_text)

        details = div.findAll('a', {'class': 'permalink-link js-permalink js-nav'})
        if not details:
            print('ERROR: "details" container is changed !!!')
            continue
        twit_url = details[0]['href']

        data_list.append({'imgs': img_link.encode('utf8'),
                          'twits': twit_text.encode('utf8'),
                          'links': twit_url.encode('utf8')})

    # two spaces before the number reproduce the output of the original
    # "print 'Total number of images --- ', total_img_num" statement
    print('Total number of images ---  %d' % total_img_num)
    print(data_list)
    return data_list
def write_csv(data_list, file_name, folder_name = ''):
    '''
    Save the given list of dicts as a csv file with a header row
    Inputs:
    data_list - list of dicts; the keys of the first dict are used as the
    column names
    file_name - name of the csv file
    folder_name - folder of the csv file, created when it does not exist;
    when empty the file is written relative to the current directory
    Outputs:
    nothing is returned; a message is printed and no file is written
    when data_list is empty
    '''
    import sys

    if not data_list:
        print("NO DATA TO WRITE!!!")
        return

    if folder_name and not os.path.exists(folder_name):
        print('New foder created as ' + folder_name)
        os.makedirs(folder_name)

    # os.path.join keeps the path relative when folder_name is empty
    # (plain concatenation with '/' pointed at the filesystem root)
    full_path = os.path.join(folder_name, file_name)
    keys = list(data_list[0].keys())

    # the csv module wants a binary file on Python 2 and a text file
    # opened with newline='' on Python 3
    if sys.version_info[0] < 3:
        csv_file = open(full_path, 'wb')
    else:
        csv_file = open(full_path, 'w', newline='')

    # the with block closes the file even if a row fails to be written
    with csv_file:
        dict_writer = csv.DictWriter(csv_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(data_list)
    print('DATA has been saved to ' + full_path)
# auxiliary functions
def test_read(file_name = 'test.html'):
f = open(file_name, 'r')
page = f.read()
return page
def test_write(result, file_name = 'test.html'):
    '''
    Dump the given content to a file
    Inputs:
    result - content to be written; the file is opened in binary mode
    file_name - path of the file to be written, overwritten if it exists
    '''
    # the with block closes the file even when write() raises
    with open(file_name, 'wb') as f:
        f.write(result)
if __name__ == '__main__':
    # kept for reference, a search page can be saved to disk with:
    #test_write(load_page('https://twitter.com/search?q=%22Fenerbah%C3%A7eSu%C3%A7suzdur%20DavaYenidenG%C3%B6r%C3%BCls%C3%BCn%22&src=tren'),'test2.html')

    # read the saved page, scrap it and store the result as csv
    saved_page = test_read('tests.html')
    scraped_data = scrap_wall(saved_page)
    write_csv(scraped_data, 'test.csv', 'SAVED_DATA')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment