Skip to content

Instantly share code, notes, and snippets.

@jaytaylor
Last active February 19, 2024 03:06
Show Gist options
  • Select an option

  • Save jaytaylor/11157010 to your computer and use it in GitHub Desktop.

Select an option

Save jaytaylor/11157010 to your computer and use it in GitHub Desktop.
Alexa top 1-million websites daily snapshot historical archival system
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Alexa top 1-million websites daily snapshot historical archival system."""
__author__ = 'Jay Taylor [@jtaylor]'
import datetime, hashlib, logging, os, urllib2
# Configure the root logger once at startup: timestamped lines at INFO level and above.
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
def md5_for(f, block_size=2**20):
    """Compute the hexadecimal MD5 digest of a file's contents.

    @param f File path or opened file object. If file object is passed, the file must have been
        opened using 'rb' mode; it is read from its current position and is left open.
    @param block_size Maximum number of bytes read per chunk, so large files are never held
        in memory all at once.
    @return MD5 digest as a hex string.
    """
    # Duck-type on `.read` instead of checking for (str, unicode): anything that is not
    # already a readable object is treated as a path and opened (and later closed) here.
    opened_here = not hasattr(f, 'read')
    if opened_here:
        f = open(f, 'rb')
    md5 = hashlib.md5()
    try:
        while True:
            data = f.read(block_size)
            if not data:
                # An empty read signals end of file.
                break
            md5.update(data)
    finally:
        # Only close handles this function opened; a caller-supplied file stays open.
        # The `finally` guarantees no descriptor leak if a read fails midway.
        if opened_here:
            f.close()
    return md5.hexdigest()
# Fetch the current Alexa top-1m archive; the body is read later when it is saved to disk.
response = urllib2.urlopen('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
# Explicit check instead of `assert`, which is stripped when Python runs with -O.
# IOError matches the family of errors urlopen itself raises (URLError/HTTPError).
if response.getcode() != 200:
    raise IOError('Failed to download updated Alexa Top 1m, status code was {}'.format(response.getcode()))
# Snapshots are named by UTC calendar day; yesterday's name is needed for the duplicate check.
today = datetime.datetime.utcnow()
yesterday = today - datetime.timedelta(days=1)
def filename_for(ts):
    """Build the snapshot archive filename for a given day.

    @param ts datetime object (only its year, month and day attributes are read).
    @return Filename of the form 'YYYY-MM-DD_top-1m.csv.zip', with month and day
        zero-padded to two digits.
    """
    # Format specs do the zero-padding that was previously done with str().zfill(2).
    return '{0}-{1:02d}-{2:02d}_top-1m.csv.zip'.format(ts.year, ts.month, ts.day)
def file_exists(filename):
    """Check whether a path is a regular file that the current process can read.

    @param filename Path to check.
    @return True if the path is an existing, readable regular file.
    """
    return os.path.isfile(filename) and os.access(filename, os.R_OK)


# Resolve both snapshot names once instead of recomputing them at every use.
today_filename = filename_for(today)
yesterday_filename = filename_for(yesterday)

# Persist today's download, and always release the HTTP connection afterwards.
try:
    with open(today_filename, 'wb') as fh:
        fh.write(response.read())
finally:
    response.close()

# Keep today's snapshot only if it differs from yesterday's, to avoid storing duplicates.
if file_exists(yesterday_filename):
    if md5_for(today_filename) == md5_for(yesterday_filename):
        logging.info('Alexa datafile for yesterday and today look the same, removing the one for today so no duplicate is stored')
        os.remove(today_filename)
else:
    logging.info('Datafile for yesterday not found, skipping md5sum check')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment