Created September 23, 2015 05:29
Save davidtran0/77b8a5fe64b713ba4ae1 to your computer and use it in GitHub Desktop.
Python script for web-scraping the Alexa.com top-ranked websites by country or globally
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Python script for web-scraping the Alexa.com top-ranked websites | |
by country or globally | |
@author: David Tran | |
""" | |
import datetime | |
import urllib2 | |
# country code, use COUNTRY_CODE = "global" for global results | |
# -- see http://www.alexa.com/topsites/countries for the full list of | |
# supported countries | |
COUNTRY_CODE = "US" | |
# number of top sites to retrieve, maximum 500 | |
TOP_N = 500 | |
BASE_URL = "http://www.alexa.com/topsites" | |
top_sites = [] | |
# retrieve HTML content of first page | |
if COUNTRY_CODE != "global": | |
response = urllib2.urlopen(BASE_URL + "/countries/" + COUNTRY_CODE) | |
else: | |
response = urllib2.urlopen(BASE_URL) | |
html = response.read() | |
# retrieve HTML content of pages 2...20 | |
for i in xrange(1, 20): | |
if COUNTRY_CODE != "global": | |
response = urllib2.urlopen(BASE_URL + "/countries;" + str(i) | |
+ "/" + COUNTRY_CODE) | |
else: | |
response = urllib2.urlopen(BASE_URL + "/global;" + str(i)) | |
html += response.read() | |
# scrape the combined HTML for domain names | |
tokens = html.split("<") | |
for token in tokens: | |
if token.startswith("a href=\"/siteinfo/"): | |
subtokens = token.split("\">")[0].split("/") | |
site = subtokens[2] | |
if len(top_sites) < TOP_N: | |
top_sites.append(site) | |
# write output file | |
date_str = datetime.datetime.today().strftime('%Y-%m-%d') | |
csv_file = open("alexa-top-" + str(TOP_N) + "-" + COUNTRY_CODE.lower() + "-" | |
+ date_str + ".csv", "w") | |
csv_file.write("rank,site\n") | |
for rank, site in enumerate(top_sites): | |
csv_file.write(str(rank + 1) + "," + site + "\n") | |
csv_file.close() |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.
Updated with requests: