Skip to content

Instantly share code, notes, and snippets.

@davidtran0
Created September 23, 2015 05:29
Show Gist options
  • Save davidtran0/77b8a5fe64b713ba4ae1 to your computer and use it in GitHub Desktop.
Save davidtran0/77b8a5fe64b713ba4ae1 to your computer and use it in GitHub Desktop.
Python script for web-scraping the Alexa.com top-ranked websites by country or globally
# -*- coding: utf-8 -*-
"""
Python 2 script for web-scraping the Alexa.com top-ranked websites
by country or globally.

Writes a CSV file named "alexa-top-<TOP_N>-<country>-<YYYY-MM-DD>.csv"
with columns "rank,site".

@author: David Tran
"""
import datetime
import urllib2  # Python 2 only; see the requests-based variant for Python 3

# country code, use COUNTRY_CODE = "global" for global results
# -- see http://www.alexa.com/topsites/countries for the full list of
# supported countries
COUNTRY_CODE = "US"

# number of top sites to retrieve, maximum 500
TOP_N = 500

BASE_URL = "http://www.alexa.com/topsites"
top_sites = []

# retrieve HTML content of first page
if COUNTRY_CODE != "global":
    response = urllib2.urlopen(BASE_URL + "/countries/" + COUNTRY_CODE)
else:
    response = urllib2.urlopen(BASE_URL)
html = response.read()

# retrieve HTML content of pages 2...20 -- the ";i" URL segment is a
# zero-based page index, so indexes 1..19 correspond to pages 2..20
for i in xrange(1, 20):
    if COUNTRY_CODE != "global":
        response = urllib2.urlopen(BASE_URL + "/countries;" + str(i)
                                   + "/" + COUNTRY_CODE)
    else:
        response = urllib2.urlopen(BASE_URL + "/global;" + str(i))
    html += response.read()

# scrape the combined HTML for domain names: each ranked entry is an
# anchor of the form <a href="/siteinfo/<domain>">, so split on "<" and
# pull the domain out of matching tokens
tokens = html.split("<")
for token in tokens:
    if token.startswith("a href=\"/siteinfo/"):
        subtokens = token.split("\">")[0].split("/")
        site = subtokens[2]
        if len(top_sites) < TOP_N:
            top_sites.append(site)

# write output file; "with" guarantees the handle is closed even if a
# write raises (the original open()/close() pair could leak it)
date_str = datetime.datetime.today().strftime('%Y-%m-%d')
out_name = ("alexa-top-" + str(TOP_N) + "-" + COUNTRY_CODE.lower() + "-"
            + date_str + ".csv")
with open(out_name, "w") as csv_file:
    csv_file.write("rank,site\n")
    for rank, site in enumerate(top_sites):
        csv_file.write(str(rank + 1) + "," + site + "\n")
@mrtrkmn
Copy link

mrtrkmn commented Feb 26, 2022

Updated with requests:

# -*- coding: utf-8 -*-
"""
Python 3 script for web-scraping the Alexa.com top-ranked websites
by country or globally.

Writes a CSV file named "alexa-top-<TOP_N>-<country>-<YYYY-MM-DD>.csv"
with columns "rank,site".

@author: David Tran
"""

import datetime

import requests

# country code, use COUNTRY_CODE = "global" for global results
# -- see http://www.alexa.com/topsites/countries for the full list of
# supported countries
COUNTRY_CODE = "US"

# number of top sites to retrieve, maximum 500
TOP_N = 500

BASE_URL = "http://www.alexa.com/topsites"
top_sites = []

# retrieve HTML content of first page
if COUNTRY_CODE != "global":
    response = requests.get(BASE_URL + "/countries/" + COUNTRY_CODE)
else:
    response = requests.get(BASE_URL)
html = response.text

# retrieve HTML content of pages 2...20 -- the ";i" URL segment is a
# zero-based page index, so indexes 1..19 correspond to pages 2..20
for page_index in range(1, 20):
    if COUNTRY_CODE != "global":
        response = requests.get(
            BASE_URL + "/countries;" + str(page_index) + "/" + COUNTRY_CODE)
    else:
        response = requests.get(BASE_URL + "/global;" + str(page_index))
    html += response.text

# scrape the combined HTML for domain names: each ranked entry is an
# anchor of the form <a href="/siteinfo/<domain>">, so split on "<" and
# pull the domain out of matching tokens
tokens = html.split("<")
for token in tokens:
    if token.startswith("a href=\"/siteinfo/"):
        subtokens = token.split("\">")[0].split("/")
        site = subtokens[2]
        if len(top_sites) < TOP_N:
            top_sites.append(site)

# write output file; "with" guarantees the handle is closed even if a
# write raises (the original open()/close() pair could leak it), and an
# explicit encoding keeps the output stable across platforms
date_str = datetime.datetime.today().strftime('%Y-%m-%d')
out_name = "alexa-top-" + str(TOP_N) + "-" + COUNTRY_CODE.lower() + "-" \
    + date_str + ".csv"
with open(out_name, "w", encoding="utf-8") as csv_file:
    csv_file.write("rank,site\n")
    for rank, site in enumerate(top_sites):
        csv_file.write(str(rank + 1) + "," + site + "\n")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment