|
import requests |
|
import boto3 |
|
import datetime |
|
import os |
|
import logging |
|
|
|
|
|
# Lambda pre-configures the root logger with its own handler; we only set the level.
# https://docs.aws.amazon.com/lambda/latest/dg/python-logging.html

log = logging.getLogger()

log.setLevel(logging.INFO)


# urllib3 is noisy -- silence connection-pool chatter from every place a
# (possibly vendored) copy of urllib3 can live.

logging.getLogger('urllib3.connectionpool').setLevel(logging.WARNING)

logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARNING)

logging.getLogger('botocore.vendored.requests.packages.urllib3.connectionpool').setLevel(logging.WARNING)


# See for authentication on lambda:
# https://docs.aws.amazon.com/lambda/latest/dg/with-s3-example-deployment-pkg.html#with-s3-example-deployment-pkg-python
# They are set via env variables here: https://docs.aws.amazon.com/lambda/latest/dg/current-supported-versions.html#lambda-environment-variables
# Set these env variables to get this to work locally
# - AWS_ACCESS_KEY_ID
# - AWS_SECRET_ACCESS_KEY
# See: https://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
s3_client = boto3.client('s3')


## Constants

# You must set these in the lambda function config.
# NOTE: os.environ[...] raises KeyError at import time if either variable is
# missing -- deliberate fail-fast so misconfiguration is caught immediately.
# For details on license key, see: https://blog.maxmind.com/2019/12/18/significant-changes-to-accessing-and-using-geolite2-databases/
MAXMIND_LICENSE_KEY = os.environ["MAXMIND_LICENSE_KEY"]

S3_BUCKET = os.environ["S3_BUCKET"]


# The .sha256 endpoint returns a text line of the form "<hash>  <filename>".
DB_SHA256_URL = f"https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-City&license_key={MAXMIND_LICENSE_KEY}&suffix=tar.gz.sha256"

DB_DOWNLOAD_URL = f"https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-City&license_key={MAXMIND_LICENSE_KEY}&suffix=tar.gz"

CHUNK_SIZE = 1024*1024 # 1 mb
|
|
|
|
|
def unpack_list_files_contents(r, known_files):
    """
    Take a response to list_objects_v2 and add file hash -> file name
    entries to ``known_files`` (mutated in place).

    Keys are expected to look like
    ``mmdb-files/<hash>-<timestamp>-GeoLite2-City.tar.gz``; the hash is the
    part of the filename before the first '-'.

    :param r: dict response from s3_client.list_objects_v2
    :param known_files: dict mapping file hash -> filename, updated in place
    """
    # BUG FIX: a listing with no matching objects (e.g. the very first run,
    # before anything has been uploaded) has no 'Contents' key at all, which
    # previously raised KeyError. Default to an empty list instead.
    for f in r.get('Contents', []):
        # 'mmdb-files/<fname>' -> '<fname>'
        fname = f['Key'].split('/')[1]
        # '<hash>-<timestamp>-...' -> '<hash>'
        file_hash = fname.split('-')[0]
        known_files[file_hash] = fname
|
|
|
def get_db_version_hashes():
    """
    Return a dict of <hash, filename> for all previously downloaded maxmind
    db files stored under the ``mmdb-files/`` prefix in S3.

    Paginates through list_objects_v2 (1000 keys per request) until the
    listing is exhausted.

    :returns: dict mapping sha256 hash -> S3 filename
    """
    known_files = dict()

    # First fetch
    fetch_num = 1
    log.info("Fetching 1000 previous items, request #%d" % (fetch_num))
    r = s3_client.list_objects_v2(Bucket=S3_BUCKET, Delimiter='/', Prefix='mmdb-files/', MaxKeys=1000)
    # BUG FIX: the token for the *next* page is returned as
    # "NextContinuationToken". "ContinuationToken" is merely an echo of the
    # token we sent (absent on the first call), so the previous code never
    # paginated past the first 1000 objects.
    continuation_token = r.get("NextContinuationToken")
    unpack_list_files_contents(r, known_files)

    while continuation_token is not None:
        # Continue paginating
        fetch_num += 1
        log.info("Fetching 1000 previous items, #%d" % (fetch_num))
        r = s3_client.list_objects_v2(Bucket=S3_BUCKET, Delimiter='/', Prefix='mmdb-files/', MaxKeys=1000, ContinuationToken=continuation_token)
        continuation_token = r.get("NextContinuationToken")
        unpack_list_files_contents(r, known_files)

    log.info("Found %d previous items" % (len(known_files)))

    return known_files
|
|
|
|
|
def handler(event, context):
    """
    Lambda function handler.

    Will be called with a scheduled event from lambda. Fetches the sha256 of
    the current GeoLite2-City archive from MaxMind; if that hash is not
    already present in the S3 bucket, downloads the archive to /tmp and
    uploads it under ``mmdb-files/``.

    :param event: lambda scheduled-event payload (unused)
    :param context: lambda context object, or None when run locally
    """
    start = datetime.datetime.now()

    log.info("Starting to grab hash of current mmdb files.")
    # Timeout so a hung request fails before the Lambda itself times out.
    resp = requests.get(DB_SHA256_URL, timeout=30)
    # BUG FIX: without this, an HTTP error (e.g. bad license key) would be
    # silently parsed as if it were the hash file.
    resp.raise_for_status()
    db_hash = resp.content.decode('utf8')
    # The .sha256 file is "<hash>  <filename>"; keep only the hash.
    db_hash = db_hash.split()[0]

    # NOTE: log message previously said "md5s"; these are sha256 hashes.
    log.info("Starting to grab sha256s of previously downloaded mmdb files.")
    previous_versions = get_db_version_hashes()

    if db_hash in previous_versions:
        log.info("DB file hash %s already exists. Exiting." % (db_hash))
        return
    log.info("DB file hash %s is new. Starting download." % (db_hash))

    # We have 512 mb of /tmp and file <10 mb: https://docs.aws.amazon.com/lambda/latest/dg/limits.html
    local_tmpfile = "/tmp/GeoLite2-City.tar.gz"
    with open(local_tmpfile, 'wb') as f:
        with requests.get(DB_DOWNLOAD_URL, stream=True, timeout=30) as r:
            # BUG FIX: fail loudly instead of saving an error page as the db.
            r.raise_for_status()
            nchunk = 0
            for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                nchunk += 1
                log.debug("Downloaded %d MBs" % (nchunk))
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
            f.flush()

    # Name embeds the hash (for dedup lookups) and a UTC timestamp.
    s3_filename = "mmdb-files/{}-{}-GeoLite2-City.tar.gz".format(db_hash, datetime.datetime.utcnow().strftime("%Y%m%d-%H%M"))
    log.info("Downloaded new file to '%s'. Preparing to upload to '%s'." % (
        local_tmpfile, s3_filename
    ))
    s3_client.upload_file(local_tmpfile, S3_BUCKET, s3_filename)
    log.info("Completed upload to '%s'." % (s3_filename))

    # Max timeout is 300 s (5 min)
    # https://docs.aws.amazon.com/lambda/latest/dg/limits.html
    log.info("Time elapsed (ms): %s" % ((datetime.datetime.now() - start).total_seconds()*1000))
    if context:
        # https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html#python-context-object-methods
        log.info("Time remaining (ms): %s" % (context.get_remaining_time_in_millis()))
|
|
|
if __name__ == "__main__":

    # More logging: running locally, so attach a stream handler ourselves
    # (on Lambda one is already installed on the root logger).
    logging.basicConfig()

    # Just call it -- no event/context when invoked by hand.
    handler(None, None)
# Download sites
# Update frequency, from: https://dev.maxmind.com/geoip/geoip2/geolite2/