Last active
August 29, 2015 14:07
-
-
Save marsam/df4b02f92644afbee2b1 to your computer and use it in GitHub Desktop.
topvisits: solution of maxmind dev-hire-homework.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.gz
*.log
*.mmdb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from setuptools import setup | |
# Package metadata for the single-module ``topvisits`` distribution.
setup(
    name='topvisits',
    version='0.0.1',
    license='MIT',
    py_modules=['topvisits'],
    include_package_data=True,
    # Runtime dependencies.
    install_requires=[
        'Click',
        'geoip2',
    ],
    # Only needed to run the test suite.
    tests_require=[
        'pytest',
    ],
    # Expose ``topvisits.main`` as the ``topvisits`` console command.
    entry_points="""
        [console_scripts]
        topvisits=topvisits:main
    """,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# flake8: noqa | |
import pytest | |
import topvisits | |
# Pairs of [raw access-log line, (ip, request-line) expected from
# ``topvisits.ip_reqline``]. The last sample has a non-dash user field.
ip_expectations = [
    [
        '183.60.212.148 - - [26/Aug/2014:06:26:39 -0600] "GET /entry/15205 HTTP/1.1" 200 4865 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"',
        ('183.60.212.148', 'GET /entry/15205 HTTP/1.1'),
    ],
    [
        '65.55.219.79 - - [26/Aug/2014:06:26:57 -0600] "GET /robots.txt HTTP/1.1" 301 178 "-" "msnbot-UDiscovery/2.0b (+http://search.msn.com/msnbot.htm)"',
        ('65.55.219.79', 'GET /robots.txt HTTP/1.1'),
    ],
    [
        '37.58.100.142 - - [26/Aug/2014:06:27:08 -0600] "GET /entry/near/0%2C0/filter?unit=mile;distance=25;sort_order=ASC;page=;order_by=distance;address=34034;limit= HTTP/1.1" 200 6192 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.0; +http://ahrefs.com/robot/)"',
        ('37.58.100.142', 'GET /entry/near/0%2C0/filter?unit=mile;distance=25;sort_order=ASC;page=;order_by=distance;address=34034;limit= HTTP/1.1'),
    ],
    [
        '104.131.236.236 - mailto [26/Aug/2014:10:40:21 -0600] "GET / HTTP/1.1" 301 178 "-" "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"',
        ('104.131.236.236', 'GET / HTTP/1.1'),
    ],
]
@pytest.mark.parametrize("line, expected", ip_expectations)
def test_ip_reqline(line, expected):
    """Each sample log line parses to its expected (ip, request-line) tuple."""
    parsed = topvisits.ip_reqline(line)
    assert parsed == expected
# (request path, expected result of ``topvisits.is_ignorable``).
# Negative cases are included so that an implementation which always
# answers True cannot pass the suite.
ignorable_expectations = [
    ('/favicon.ico', True),
    ('/entry-images/4372/4372-2495-small.jpg', True),
    ('/robots.txt', True),
    ('/feed.atom', True),
    ('/', False),
    ('/entry/15205', False),
]
@pytest.mark.parametrize("path, ignorable", ignorable_expectations)
def test_is_ignorable(path, ignorable):
    """Each sample path is classified as ignorable (or not) as expected."""
    verdict = topvisits.is_ignorable(path)
    assert verdict == ignorable
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
topvisits | |
========= | |
Given a logfile with the Apache `combined log format`_ return the top countries for visitors. | |
Solution of maxmind `dev-hire-homework`_. | |
.. _combined log format: https://httpd.apache.org/docs/1.3/logs.html#combined | |
.. _dev-hire-homework: https://github.com/maxmind/dev-hire-homework/ | |
""" | |
from __future__ import print_function | |
import re | |
from collections import Counter | |
import click | |
import geoip2.database | |
# Matches the beginning of an access-log line, capturing the client
# address as ``ip`` and the double-quoted request line as ``req``.
REGEXP = re.compile(r'(?P<ip>[(\d\.)]+) - (?:[\w-]+) \[(?:.*?)\] "(?P<req>.*?)"')

# Compiled patterns for request paths that must not be counted as visits.
# The result of ``map`` is materialized into a list: on Python 3 ``map``
# returns a one-shot iterator, which would be consumed by the first lookup
# and leave later lookups with few or no patterns to test against.
IGNORE_PATH_REGEXP = list(map(re.compile, [
    r'^/[a-f0-9]+/css/',
    r'^/[a-f0-9]+/images/',
    r'^/[a-f0-9]+/js/',
    r'^/entry-images/',
    r'^/images/',
    r'^/user-images/',
    r'^/static/',
    # Dots are escaped so only the literal file names match.
    r'^/robots\.txt',
    r'^/favicon\.ico',
    r'.*\.atom',
    r'.*\.rss',
]))
def ip_reqline(line):
    """Extract the client address and request line from a log line.

    Returns an ``(ip, request-line)`` tuple when the start of *line*
    matches ``REGEXP``; returns ``None`` otherwise.
    """
    found = REGEXP.match(line)
    if found is None:
        return None
    return found.group('ip', 'req')
def is_ignorable(path):
    """Return True when *path* matches any pattern in IGNORE_PATH_REGEXP.

    Patterns are applied with ``match`` semantics, i.e. anchored at the
    start of *path*; the scan stops at the first pattern that matches.
    """
    return any(
        pattern.match(path) is not None
        for pattern in IGNORE_PATH_REGEXP
    )
def country_from_ip(reader, ip):
    """Return the country name that *reader* reports for *ip*.

    *reader* is the object created by ``geoip2.database.Reader`` in
    ``main``. Returns ``None`` when the address is not present in the
    database, so callers can substitute their own placeholder instead of
    having the whole run aborted by a single unknown address. The value
    may also be ``None`` when the record carries no country name.
    """
    try:
        resp = reader.country(ip)
    except geoip2.errors.AddressNotFoundError:
        # The database has no record for this address.
        return None
    return resp.country.name
@click.command(help=__doc__)
@click.option('--mmdb', default='GeoLite2-Country.mmdb',
              type=click.Path(exists=True, dir_okay=False),
              help='Path to the MaxMind database.')
@click.option('--top', default=10, type=click.INT, help='Number of top countries.')
@click.argument('logfile', type=click.File('rb'))
def main(mmdb, top, logfile):
    """Count visits per country in *logfile* and print the *top* countries.

    Each line is parsed with ``ip_reqline``; requests whose path is
    ignorable (see ``is_ignorable``) are skipped, and the remaining
    client addresses are resolved with the database at *mmdb*. Lines
    that cannot be parsed are skipped instead of aborting the run.
    """
    reader = geoip2.database.Reader(mmdb)
    ipcounter = Counter()
    try:
        for raw in logfile:
            # The file is opened in binary mode, so decode before applying
            # the text regular expressions; undecodable bytes are replaced
            # rather than allowed to abort the run.
            if isinstance(raw, bytes):
                line = raw.decode('utf-8', 'replace')
            else:
                line = raw
            parsed = ip_reqline(line)
            if parsed is None:
                # Blank line or a line not in the expected log format.
                continue
            ip, reqline = parsed
            # A request line is "<method> <path> [<protocol>]"; anything
            # shorter (for example a bare "-") carries no path to inspect.
            parts = reqline.split(None, 2)
            if len(parts) < 2:
                continue
            path = parts[1]
            if is_ignorable(path):
                continue
            country = country_from_ip(reader, ip) or 'Unknown'
            ipcounter[country] += 1
    finally:
        # Release the database file even if processing fails midway.
        reader.close()
    click.echo(click.style('Top {0} countries for visitors'.format(top), fg='green'))
    for country, count in ipcounter.most_common(top):
        click.echo('{0}: {1}'.format(country, count))
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment