Created
December 25, 2013 22:34
-
-
Save erogol/8127619 to your computer and use it in GitHub Desktop.
Scrape a Twitter wall, given the search page address, or by simulating a browser via mechanize
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Written by:
Eren Golge - [email protected]
'''
import json | |
import pdb | |
import urllib2 | |
import mechanize | |
import cookielib | |
import re | |
from BeautifulSoup import BeautifulSoup | |
import csv | |
import os | |
def set_mechanize():
    '''
    Build and return a mechanize.Browser configured to look like a real
    Firefox client: cookie jar attached, redirects/referers/refresh
    handled, robots.txt ignored, debug channels silenced.
    '''
    browser = mechanize.Browser()

    # attach an in-memory cookie jar so the login session persists
    jar = cookielib.LWPCookieJar()
    browser.set_cookiejar(jar)

    # emulate ordinary browser navigation behaviour
    browser.set_handle_equiv(True)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)

    # keep the wire-level debug output quiet
    browser.set_debug_http(False)
    browser.set_debug_responses(False)
    browser.set_debug_redirects(False)

    # follow meta-refresh redirects, but wait at most one second
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time = 1)

    # present a plausible desktop Firefox user agent
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    return browser
def login_twitter(br, name = 'erogol', password = 'ern88gol'):
    '''
    Login to twitter and return the wall content.
    Inputs:
        br       - mechanize instance configured initially
        name     - twitter username or e-mail (default kept only for
                   backward compatibility with existing callers)
        password - account password
    Outputs:
        wall - the raw html content of the home twitter wall
    NOTE(review): credentials should never live in source control --
    pass them in as arguments or read them from the environment.
    '''
    theurl = 'http://twitter.com/login'
    br.open(theurl)
    # the login form is the first form on the page
    br.select_form(nr=0)
    br["session[username_or_email]"] = name
    br['password'] = password
    wall = br.submit().read()
    return wall
def load_page(url_addr):
    '''Fetch *url_addr* over HTTP and return the raw response body.'''
    response = urllib2.urlopen(url_addr)
    return response.read()
def scrap_wall(wall): | |
''' | |
Given the twitter wall content, it scraps the images | |
and the twitter texts | |
Inputs: | |
wall - raw html of the twitter wall | |
Outputs: | |
data_list - A list of dictionaries where each dict | |
includes meta information for each image twit | |
''' | |
data_list = [] | |
total_img_num = 0 | |
wall_soup = BeautifulSoup(wall) | |
divs = wall_soup.findAll('div', {'class':'content'}) | |
for div in divs: | |
#imgs = div.findAll('img', {'alt':'Embedded image permalink'}) | |
imgs = div.findAll('a', {'data-resolved-url-large': re.compile("jpg:large$") }) | |
if len(imgs) > 0: | |
total_img_num = total_img_num + len(imgs) | |
#img_links = div.findAll('a', {'class':'media media-thumbnail twitter-timeline-link media-forward is-preview'}) | |
img_links = div.findAll('a', {'data-resolved-url-large': re.compile("jpg:large$")}) | |
if len(img_links) == 0: | |
print 'ERROR: img links are changed on twitter wall!!!' | |
img_link = img_links[0].get('data-resolved-url-large') | |
twit_texts = div.findAll('p', {'class' : 'js-tweet-text tweet-text'}) | |
if len(twit_texts) < 1: | |
print 'ERROR: twitter text container is changed !!!' | |
twit_text = twit_texts[0].getText() | |
print twit_text | |
details = div.findAll('a', {'class' : 'permalink-link js-permalink js-nav'}) | |
if len(details) < 1 : | |
print 'ERROR: "details" container is changed !!!' | |
twit_url = details[0]['href'] | |
data_list.append({'imgs' : img_link.encode('utf8'), 'twits' : twit_text.encode('utf8'), 'links' : twit_url.encode('utf8')}) | |
print 'Total number of images --- ', total_img_num | |
print data_list | |
return data_list | |
def write_csv(data_list, file_name, folder_name = ''): | |
''' | |
Save the given list of dicts | |
''' | |
if len(data_list) == 0: | |
print "NO DATA TO WRITE!!!" | |
return | |
full_path = folder_name + '/' + file_name | |
if len(folder_name) > 0: | |
if not os.path.exists(folder_name): | |
print 'New foder created as ' + folder_name | |
os.makedirs(folder_name) | |
keys = data_list[0].keys() | |
dict_writer = csv.DictWriter(open(full_path,'wb'), keys) | |
dict_writer.writer.writerow(keys) | |
dict_writer.writerows(data_list) | |
print 'DATA has been saved to ' + full_path | |
# auxiliary functions | |
def test_read(file_name = 'test.html'): | |
f = open(file_name, 'r') | |
page = f.read() | |
return page | |
def test_write(result, file_name = 'test.html'): | |
f = open(file_name,'wb') | |
f.write(result) | |
f.close() | |
if __name__ == '__main__':
    # scrape a locally cached wall dump and export the image twits to csv
    scraped = scrap_wall(test_read('tests.html'))
    write_csv(scraped, 'test.csv', 'SAVED_DATA')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment