# alexa_spider: scrape Alexa top-site rankings and check URL reachability through a proxy.
from bs4 import BeautifulSoup
import requests
import mechanize
import time
import logging
import logging.handlers
logger = logging.getLogger('Alexa')
debug_level = True
# debug_level = False
url_pattern = {
    'global': "http://www.alexa.com/topsites/global;%s",
    'jp': "http://www.alexa.com/topsites/countries;%s/JP",
    'au': "http://www.alexa.com/topsites/countries;%s/AU",
    'us': "http://www.alexa.com/topsites/countries;%s/US",
    'mx': "http://www.alexa.com/topsites/countries;%s/MX",
}
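# The "%s" slot is the zero-based page index, e.g.
# url_pattern['jp'] % 3 -> "http://www.alexa.com/topsites/countries;3/JP"
# (each page lists 25 sites).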
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
}
proxies = {
    "http": "http://10.64.8.8:8080",
    "ftp": "http://10.64.8.8:8080",
}
iwsaas_proxies = {
    "http": "http://proxy.iws.trendmicro.com",
    "ftp": "http://proxy.iws.trendmicro.com",
}
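# NOTE: both proxy maps above are environment-specific (an internal
# 10.x lab proxy and the Trend Micro IWSaaS cloud proxy); point them at
# proxies reachable from your own network before running.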
'''
Sample of one ranked entry in Alexa's top-sites markup:

<li class="site-listing">
    <div class="count">4</div>
    <div class="desc-container">
        <p class="desc-paragraph">
            <a href="/siteinfo/yahoo.com">Yahoo.com</a>
        </p>
        <span class="small topsites-label"></span>
        <div class="description">A major internet portal and service provider offering search results, customizable content, cha<span class='truncate'>... <a class='moreDesc'>More</a></span><span class="remainder">trooms, free e-mail, clubs, and pager.</span></div>
    </div>
</li>
'''
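# Parsing strategy for the markup above: soup.find_all() is given a
# predicate (__is_url_wanted) that keeps only tags whose class list
# contains "site-listing"; for each match, the rank is the text of the
# first nested <div> and the site name is the <a> text inside the
# "desc-paragraph" <p>.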
class AlexaSpider(object):
    def __init__(self):
        print "Init..."
        self.pages = {}  # page index -> list of "rank: site" strings
    def __is_url_wanted(self, tag):
        # Keep only the <li class="site-listing"> entries.
        return tag.has_attr('class') and "site-listing" in tag['class']
    def __format_one_site(self, li):
        # "<rank>: <site name>", e.g. "4: Yahoo.com".
        return "%s: %s" % (li.div.string, li.p.a.string)
    def pageNo(self, page, region='global'):
        print "process page %s..." % page
        r = requests.get(url_pattern[region] % page, proxies=proxies, headers=headers)
        soup = BeautifulSoup(r.text)
        one_page = []
        for link in soup.find_all(self.__is_url_wanted):
            entry = self.__format_one_site(link)
            print '-- %s' % entry
            one_page.append(entry)
        self.pages[page] = one_page
    def topN(self, region='global', top=100, start=0):
        # Alexa shows 25 sites per page, so top=500 means pages 0..19.
        max_page = top / 25
        start_page = start / 25
        for i in xrange(start_page, max_page):
            self.pageNo(i, region)
            time.sleep(5)  # pause between pages to avoid hammering the site
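# Usage sketch: AlexaSpider().topN('global', 500) walks pages 0..19
# (500 / 25 = 20 pages of 25 sites each) and leaves the results in the
# spider's .pages dict, keyed by page index.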
class Verifier(object):
    def __init__(self, url):
        self.url = url
        # self.proxy = proxy
        # self.username = username
        # self.password = password
    def run_with_mechanize(self, proxy=None, username=None, password=None):
        br = mechanize.Browser()
        br.set_handle_equiv(True)
        br.set_handle_gzip(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36')]
        if proxy is not None:
            # Go through the authenticating proxy: the portal answers with
            # a username form, then a password form, before the real page.
            br.set_proxies(proxy)
            response = br.open(self.url)
            br._factory.is_html = True
            logger.debug("title: %s", br.title())
            print br.title()
            for f in br.forms():
                print f
            br.select_form(nr=0)
            br.form['username'] = username
            r = br.submit()
            if r.code != 200:
                logger.error("Error when submitting username: %s", r.code)
            br._factory.is_html = True
            logger.debug("title: %s", br.title())
            br.select_form(nr=0)
            br.form['password'] = password
            r = br.submit()
            if r.code != 200:
                logger.error("Error when submitting password: %s", r.code)
            br._factory.is_html = True
            logger.debug("title: %s", br.title())
            return self.url, len(r.read()), br.title()
        else:
            # Direct connection, no proxy login needed.
            response = br.open(self.url)
            print "Title: %s" % br.title()
            body = response.read()  # read once; a second read() returns ''
            print "Response Length: %s" % len(body)
            return self.url, len(body), br.title()
    def run_with_request(self):
        # Prepend a scheme only when the stored URL does not already have one.
        url = self.url if self.url.startswith('http') else "http://%s" % self.url
        r = requests.get(url, proxies=proxies, headers=headers)
        print len(r.text)
    def test(self):
        # Set a cookie via httpbin to confirm the session survives the proxy.
        s = requests.Session()
        r = s.get('http://httpbin.org/cookies/set/sessioncookie/123456789', proxies=iwsaas_proxies, headers=headers)
        print(r.text)
        # r = s.get("http://httpbin.org/cookies")
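        # httpbin's /cookies/set/<k>/<v> endpoint redirects to /cookies,
        # so a working session through the proxy prints something like:
        #   {"cookies": {"sessioncookie": "123456789"}}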
def get_top_site_list():
    w = AlexaSpider()
    # w.pageNo(6)
    # w.topN(top=500)
    # w.topN(top=500, start=276)
    # w.topN('jp', 500)
    # w.topN('jp', 500, 0)
    # w.topN('mx', 500, 0)
    # w.topN('us', 500, 0)
    # w.topN('au', 500, 0)
    w.topN('global', 500, 0)
    for page in sorted(w.pages):
        print "\n".join(w.pages[page])
    # w.topN('jp', 450, 426)
def check_if_url_blocked():
    # v = Verifier('www.stackoverflow.com')
    v = Verifier('http://www.bing.com')
    print v.run_with_mechanize(iwsaas_proxies, '[email protected]', '******')
    # print v.run_with_mechanize()
if __name__ == "__main__":
    # main()
    formatter = logging.Formatter("%(asctime)s %(levelname)s %(filename)s #%(lineno)d <%(process)d:%(thread)d> %(message)s")
    # handler = logging.handlers.TimedRotatingFileHandler(options.logfile, 'm', 1, 5)
    handler = logging.handlers.TimedRotatingFileHandler('a.log', 'midnight', 1, 5)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    if debug_level:
        logger.setLevel(logging.DEBUG)
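    # Illustrative log line under this formatter (values are made up):
    # 2015-03-01 12:00:00,001 DEBUG alexa_spider.py #95 <1234:5678> title: Bing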
    # check_if_url_blocked()
    get_top_site_list()