Last active
February 19, 2024 03:06
-
-
Save jaytaylor/11157010 to your computer and use it in GitHub Desktop.
Alexa top 1-million websites daily snapshot historical archival system
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """Alexa top 1-million websites daily snapshot historical archival system.""" | |
| __author__ = 'Jay Taylor [@jtaylor]' | |
| import datetime, hashlib, logging, os, urllib2 | |
| logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO) | |
def md5_for(f, block_size=2**20):
    """Return the hexadecimal MD5 digest of a file's contents.

    @param f File path or opened file object. If file object is passed, the file must
        have been opened using 'rb' mode; it is read from its current position and
        is left open for the caller to close.
    @param block_size Maximum number of bytes requested per read (default 2**20).
    @return MD5 digest of the data read, as a hex string.
    """
    if not hasattr(f, 'read'):
        # Anything without a read() method is treated as a path. Duck-typing
        # avoids the Python-2-only `unicode` name, and the `with` block
        # guarantees the handle is closed even if hashing raises.
        with open(f, 'rb') as fh:
            return md5_for(fh, block_size)

    md5 = hashlib.md5()
    # Hash in fixed-size chunks so the whole file never has to sit in memory.
    while True:
        data = f.read(block_size)
        if not data:
            break
        md5.update(data)
    return md5.hexdigest()
# Fetch the current Alexa top-1m archive; the response body is read when the
# snapshot file is written further down.
response = urllib2.urlopen('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')

# Validate the status explicitly: an `assert` would be stripped when Python
# runs with -O, silently skipping the check.
status_code = response.getcode()
if status_code != 200:
    raise RuntimeError('Failed to download updated Alexa Top 1m, status code was {}'.format(status_code))

# Snapshot filenames are derived from the UTC date; yesterday's date is needed
# to locate the previous snapshot for the duplicate check.
today = datetime.datetime.utcnow()
yesterday = today - datetime.timedelta(days=1)
def filename_for(ts):
    """Build the snapshot archive filename for a given date.

    @param ts datetime object (only its year, month and day attributes are used).
    @return Name of the form '<year>-<MM>-<DD>_top-1m.csv.zip', with month and
        day zero-padded to two digits.
    """
    padded_month = str(ts.month).zfill(2)
    padded_day = str(ts.day).zfill(2)
    name_parts = (ts.year, padded_month, padded_day)
    return '{}-{}-{}_top-1m.csv.zip'.format(*name_parts)
# Resolve both snapshot names once instead of recomputing them at every use.
today_filename = filename_for(today)
yesterday_filename = filename_for(yesterday)

# Persist the downloaded archive under today's name.
try:
    with open(today_filename, 'wb') as fh:
        fh.write(response.read())
finally:
    # Release the HTTP connection even if reading or writing fails.
    response.close()


def file_exists(filename):
    """Return True when `filename` is an existing regular file readable by this process."""
    return os.path.isfile(filename) and os.access(filename, os.R_OK)


# Keep today's snapshot only if it differs from yesterday's.
# NOTE(review): when today's duplicate is removed here, tomorrow's run will not
# find a "yesterday" file and will skip this check -- confirm that is intended.
if file_exists(yesterday_filename):
    if md5_for(today_filename) == md5_for(yesterday_filename):
        logging.info('Alexa datafile for yesterday and today look the same, removing the one for today so no duplicate is stored')
        os.remove(today_filename)
else:
    logging.info('Datafile for yesterday not found, skipping md5sum check')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment