Created September 23, 2015 05:29
Save davidtran0/77b8a5fe64b713ba4ae1 to your computer and use it in GitHub Desktop.
Python script for web-scraping the Alexa.com top-ranked websites by country or globally
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Python script for web-scraping the Alexa.com top-ranked websites | |
by country or globally | |
@author: David Tran | |
""" | |
import datetime | |
import urllib2 | |
# country code, use COUNTRY_CODE = "global" for global results | |
# -- see http://www.alexa.com/topsites/countries for the full list of | |
# supported countries | |
COUNTRY_CODE = "US" | |
# number of top sites to retrieve, maximum 500 | |
TOP_N = 500 | |
BASE_URL = "http://www.alexa.com/topsites" | |
top_sites = [] | |
# retrieve HTML content of first page | |
if COUNTRY_CODE != "global": | |
response = urllib2.urlopen(BASE_URL + "/countries/" + COUNTRY_CODE) | |
else: | |
response = urllib2.urlopen(BASE_URL) | |
html = response.read() | |
# retrieve HTML content of pages 2...20 | |
for i in xrange(1, 20): | |
if COUNTRY_CODE != "global": | |
response = urllib2.urlopen(BASE_URL + "/countries;" + str(i) | |
+ "/" + COUNTRY_CODE) | |
else: | |
response = urllib2.urlopen(BASE_URL + "/global;" + str(i)) | |
html += response.read() | |
# scrape the combined HTML for domain names | |
tokens = html.split("<") | |
for token in tokens: | |
if token.startswith("a href=\"/siteinfo/"): | |
subtokens = token.split("\">")[0].split("/") | |
site = subtokens[2] | |
if len(top_sites) < TOP_N: | |
top_sites.append(site) | |
# write output file | |
date_str = datetime.datetime.today().strftime('%Y-%m-%d') | |
csv_file = open("alexa-top-" + str(TOP_N) + "-" + COUNTRY_CODE.lower() + "-" | |
+ date_str + ".csv", "w") | |
csv_file.write("rank,site\n") | |
for rank, site in enumerate(top_sites): | |
csv_file.write(str(rank + 1) + "," + site + "\n") | |
csv_file.close() |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.
Updated with requests: