# alexa_spider: scrape Alexa top-sites rankings and check URL reachability
# through a proxy.
from bs4 import BeautifulSoup
import requests
import mechanize
import time
import logging
import logging.handlers

logger = logging.getLogger('Alexa')

debug_level = True
# debug_level = False
# Alexa top-sites URL templates; %s is the zero-based page number.
url_pattern = {
    'global': "http://www.alexa.com/topsites/global;%s",
    'jp': "http://www.alexa.com/topsites/countries;%s/JP",
    'au': "http://www.alexa.com/topsites/countries;%s/AU",
    'us': "http://www.alexa.com/topsites/countries;%s/US",
    'mx': "http://www.alexa.com/topsites/countries;%s/MX",
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'
}

# Internal HTTP/FTP proxy.
proxies = {
    "http": "http://10.64.8.8:8080",
    "ftp": "http://10.64.8.8:8080",
}

# IWSaaS proxy.
iwsaas_proxies = {
    "http": "http://proxy.iws.trendmicro.com",
    "ftp": "http://proxy.iws.trendmicro.com",
}
'''
Sample markup for a single ranked entry on an Alexa top-sites page:

<li class="site-listing">
    <div class="count">4</div>
    <div class="desc-container">
        <p class="desc-paragraph">
            <a href="/siteinfo/yahoo.com">Yahoo.com</a>
        </p>
        <span class="small topsites-label"></span>
        <div class="description">A major internet portal and service provider offering search results, customizable content, cha<span class='truncate'>... <a class='moreDesc'>More</a></span><span class="remainder">trooms, free e-mail, clubs, and pager.</span></div>
    </div>
</li>
'''
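# A rough illustration (not part of the original gist) of how the fragment
# above maps onto the spider's output, assuming BeautifulSoup 4 with the
# stdlib parser; SAMPLE_LI_HTML is a hypothetical string holding the snippet:
#
#   li = BeautifulSoup(SAMPLE_LI_HTML, 'html.parser').find('li', class_='site-listing')
#   "%s: %s" % (li.div.string, li.p.a.string)   # -> u'4: Yahoo.com'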
class AlexaSpider(object):
    def __init__(self):
        print "Init..."
        self.pages = {}

    def __is_url_wanted(self, tag):
        # Alexa renders each ranked site as an <li class="site-listing"> element.
        return tag.has_attr('class') and "site-listing" in tag['class']

    def __format_one_site(self, li):
        # "<rank>: <site name>", e.g. "4: Yahoo.com"
        return "%s: %s" % (li.div.string, li.p.a.string)

    def pageNo(self, page, region='global'):
        print "process page %s..." % page
        r = requests.get(url_pattern[region] % (page), proxies=proxies, headers=headers)
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        one_page = []
        for link in soup.find_all(self.__is_url_wanted):
            print '-- %s' % (self.__format_one_site(link))
            one_page.append(self.__format_one_site(link))
        self.pages[page] = one_page

    def topN(self, region='global', top=100, start=0):
        # Each Alexa page holds 25 entries, so walk pages start/25 .. top/25 - 1.
        max_page = top / 25
        start_page = start / 25
        for i in xrange(max_page):
            if i < start_page:
                continue
            self.pageNo(i, region)
            time.sleep(5)
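
# Example usage of AlexaSpider (a sketch, not in the original gist; it assumes
# the proxy above is reachable and that alexa.com still serves 25 results per
# page in this layout):
#
#   spider = AlexaSpider()
#   spider.topN('jp', top=100, start=0)      # pages 0-3, JP ranks 1-100
#   for page in sorted(spider.pages):
#       print "\n".join(spider.pages[page])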
class Verifier(object):
    def __init__(self, url):
        self.url = url
        # self.proxy = proxy
        # self.username = username
        # self.password = password

    def run_with_mechanize(self, proxy=None, username=None, password=None):
        br = mechanize.Browser()
        br.set_handle_equiv(True)
        br.set_handle_gzip(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36')]
        if proxy is not None:
            # Route through the authenticating proxy and walk its two login
            # forms: username first, then password.
            br.set_proxies(proxy)
            response = br.open(self.url)
            br._factory.is_html = True
            logger.debug("title: %s", br.title())
            print br.title()
            for f in br.forms():
                print f
            br.select_form(nr=0)
            br.form['username'] = username
            r = br.submit()
            if r.code != 200:
                logger.error("Error when submitting username: %s", r.code)
            br._factory.is_html = True
            logger.debug("title: %s", br.title())
            br.select_form(nr=0)
            br.form['password'] = password
            r = br.submit()
            if r.code != 200:
                logger.error("Error when submitting password: %s", r.code)
            br._factory.is_html = True
            logger.debug("title: %s", br.title())
            return self.url, len(r.read()), br.title()
        else:
            # No proxy: fetch the URL directly.
            response = br.open(self.url)
            print "Title: %s" % br.title()
            print "Response Length: %s" % len(response.read())
            return self.url, len(response.read()), br.title()
    def run_with_request(self):
        # self.url is expected to be a bare hostname here (e.g. 'www.stackoverflow.com').
        r = requests.get("http://%s" % self.url, proxies=proxies, headers=headers)
        data = r.text
        print len(data)

    def test(self):
        # Sanity-check that cookies survive a round trip through the IWSaaS proxy.
        s = requests.Session()
        r = s.get('http://httpbin.org/cookies/set/sessioncookie/123456789', proxies=iwsaas_proxies, headers=headers)
        print(r.text)
        # r = s.get("http://httpbin.org/cookies")
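        # A possible follow-up (a sketch, not in the original): read the cookie
        # jar back through the same session to confirm the cookie persisted
        # across the proxy.
        #
        #   r = s.get("http://httpbin.org/cookies",
        #             proxies=iwsaas_proxies, headers=headers)
        #   print(r.text)   # expected to list {"sessioncookie": "123456789"}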

def get_top_site_list():
    w = AlexaSpider()
    # w.pageNo(6)
    # w.topN('global', 500)
    # w.topN('global', 500, 276)
    # w.topN('jp', 500)
    # w.topN('jp', 500, 0)
    # w.topN('mx', 500, 0)
    # w.topN('us', 500, 0)
    # w.topN('au', 500, 0)
    w.topN('global', 500, 0)
    for page in w.pages.keys():
        print "\n".join(w.pages[page])
    # w.topN('jp', 450, 426)

def check_if_url_blocked():
    # v = Verifier('www.stackoverflow.com')
    v = Verifier('http://www.bing.com')
    print v.run_with_mechanize(iwsaas_proxies, '[email protected]', '******')
    # print v.run_with_mechanize()

if __name__ == "__main__":
    # main()
    formatter = logging.Formatter("%(asctime)s %(levelname)s %(filename)s #%(lineno)d <%(process)d:%(thread)d> %(message)s")
    # handler = logging.handlers.TimedRotatingFileHandler(options.logfile, 'm', 1, 5)
    handler = logging.handlers.TimedRotatingFileHandler('a.log', 'midnight', 1, 5)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    if debug_level:
        logger.setLevel(logging.DEBUG)
    # check_if_url_blocked()
    get_top_site_list()