Created
October 3, 2011 16:39
-
-
Save jinwei233/1259549 to your computer and use it in GitHub Desktop.
用 Python 模拟浏览器行为
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys
import time

import mechanize
from mechanize import Browser
# Login page and the credentials typed into its form.
# NOTE(review): these look like placeholder values -- do not commit real
# credentials here.
LOGIN_URL = 'http://www.example.com/login'
USERNAME = 'DavidMertz'
PASSWORD = 'TheSpanishInquisition'

# Each search URL is built as SEARCH_URL + FIXED_QUERY + one VARIABLE_QUERY
# entry, so FIXED_QUERY deliberately ends with '&'.
SEARCH_URL = 'http://www.example.com/search?'
FIXED_QUERY = 'food=spam&' 'utensil=spork&' 'date=the_future&'
# NOTE(review): the actor names contain spaces and are not URL-encoded here;
# confirm the HTTP layer escapes them before relying on these queries.
VARIABLE_QUERY = ['actor=%s' % actor for actor in
                  ('Graham Chapman',
                   'John Cleese',
                   'Terry Gilliam',
                   'Eric Idle',
                   'Terry Jones',
                   'Michael Palin')]
def fetch():
    """Log in, run each configured search, and save the linked pages to disk.

    Opens LOGIN_URL, fills the form named "login" with USERNAME and PASSWORD
    and submits it.  Then, for every entry of VARIABLE_QUERY, opens
    SEARCH_URL + FIXED_QUERY + entry and follows each link on the result page
    whose URL contains both 'good_path' and 'credential'.  The body of every
    followed page is written to a file named result_NNNN (a counter that
    starts at 0001) in the current working directory.

    Progress and errors are reported on stderr.  The function sleeps one
    second after each followed link and returns None.

    Written in Python 2 syntax (``print >>`` statements), matching the rest
    of the file.
    """
    result_no = 0                       # Number the output files
    br = Browser()                      # Create a browser
    br.open(LOGIN_URL)                  # Open the login page
    br.select_form(name="login")        # Find the login form
    br['username'] = USERNAME           # Set the form values
    br['password'] = PASSWORD
    br.submit()                         # Submit the form

    # Automatic redirect sometimes fails, follow manually when needed
    if 'Redirecting' in br.title():
        br.follow_link(text_regex='click here')

    # Loop through the searches, keeping fixed query parameters
    for actor in VARIABLE_QUERY:
        # I like to watch what's happening in the console
        print >> sys.stderr, '***', actor
        # Let's do the actual query now
        br.open(SEARCH_URL + FIXED_QUERY + actor)
        # The query actually gives us links to the content pages we like,
        # but there are some other links on the page that we ignore
        nice_links = [lnk for lnk in br.links()
                      if 'good_path' in lnk.url
                      and 'credential' in lnk.url]
        if not nice_links:              # Maybe the relevant results are empty
            # NOTE(review): this stops ALL remaining searches, not just the
            # current one -- confirm `continue` was not intended.
            break
        for link in nice_links:
            try:
                response = br.follow_link(link)
                # More console reporting on title of followed link page
                print >> sys.stderr, br.title()
                # Increment output filenames, open and write the file
                result_no += 1
                out = open('result_%04d' % result_no, 'w')
                try:
                    print >> out, response.read()
                finally:
                    # Close the file even if reading the response fails
                    out.close()
            # Nothing ever goes perfectly, ignore if we do not get page
            except mechanize._response.httperror_seek_wrapper:
                print >> sys.stderr, "Response error (probably 404)"
            # Let's not hammer the site too much between fetches
            time.sleep(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment