Last active
November 29, 2015 16:20
-
-
Save gwillem/ce4ff60179ff8f802be2 to your computer and use it in GitHub Desktop.
Parse http logfile to find disguised robots
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip | |
import csv | |
import sys | |
from netaddr import IPNetwork, IPAddress | |
from collections import defaultdict | |
from pprint import pprint | |
""" | |
20151129 [email protected] | |
Written to verify Peter Jaap's suspicions about disguised crawlers from competitors
""" | |
# Log file to analyse: the first command-line argument when one is given,
# otherwise the default path below.
DEFAULT_LOGFILE = '/home/users/masuxftp/weblogs/raw/20151118-access.log.gz'
logfile = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_LOGFILE
# CIDR ranges whose traffic is treated as legitimate search-engine crawling
# and therefore ignored by the analysis.
_KNOWN_GOOD_BOT_CIDRS = (
    # google (presumably -- this group is unlabeled in the original)
    '64.18.0.0/20',
    '64.233.160.0/19',
    '66.102.0.0/20',
    '66.249.64.0/19',
    '72.14.192.0/18',
    '74.125.0.0/16',
    '108.177.8.0/21',
    '173.194.0.0/16',
    '207.126.144.0/20',
    '209.85.128.0/17',
    '216.58.192.0/19',
    '216.239.32.0/19',
    # yahoo
    '68.180.128.0/17',
    # microsoft
    '207.46.0.0/16',
    '157.54.0.0/15',
    '157.56.0.0/14',
    '157.60.0.0/16',
    '40.96.0.0/12',
    '40.74.0.0/15',
    '40.125.0.0/17',
    '40.124.0.0/16',
    '40.76.0.0/14',
    '40.120.0.0/14',
    '40.112.0.0/13',
    '40.80.0.0/12',
)

# Parsed once up front so that membership tests in the main loop do not
# re-parse the CIDR strings for every log line.
known_good_bots = [IPNetwork(cidr) for cidr in _KNOWN_GOOD_BOT_CIDRS]
# A request for a URI ending in one of these suffixes marks the client as a
# human visitor.
human_ext = ('.css', '.js')

# IPs that requested at least one "human" asset.
human_ips = set()

# Suspected robots: ip -> {'uas': set of user-agent strings, 'hitcount': int}.
# The inner dict is created empty on first access and filled in by the log
# scan.
robot_ips = defaultdict(dict)
def is_known_good_bot_ip(ip, networks=None):
    """Return True if *ip* lies inside any of the given networks.

    ip: an address object that supports ``ip in network`` (the script
        passes ``netaddr.IPAddress`` instances).
    networks: optional iterable of network objects to test against.  When
        omitted, the module-level ``known_good_bots`` list is used, which
        is the original single-argument behaviour.
    """
    if networks is None:
        networks = known_good_bots
    return any(ip in cidr for cidr in networks)
# Start-up sanity checks of the network table: two addresses that must be
# recognised, plus one documentation-only address (RFC 5737) that must not.
assert is_known_good_bot_ip(IPAddress('66.249.78.125')), \
    '66.249.78.125 should match a known-good bot network'
assert is_known_good_bot_ip(IPAddress('157.55.39.99')), \
    '157.55.39.99 should match a known-good bot network'
assert not is_known_good_bot_ip(IPAddress('192.0.2.1')), \
    '192.0.2.1 must not match any known-good bot network'
# Scan the gzipped access log and record, for every client IP that is neither
# a known-good bot nor a proven human, the user agents it used and how many
# requests it made.
#
# After splitting a line on spaces (quoted fields stay together) the code
# reads column 0 as the client IP, column 5 as the request line
# ("METHOD URI PROTOCOL") and column 9 as the user agent.
# NOTE(review): this looks like the common "combined" log layout -- confirm
# against the web server's actual log format.
if sys.version_info[0] >= 3:
    # csv.reader needs text on Python 3; undecodable bytes are replaced
    # rather than aborting the whole run.
    log_open_args = {'mode': 'rt', 'encoding': 'utf-8',
                     'errors': 'replace', 'newline': ''}
else:
    log_open_args = {'mode': 'rb'}

human_suffixes = tuple(human_ext)  # str.endswith accepts a tuple of suffixes
bot_verdicts = {}                  # ip string -> cached known-good-bot verdict
skipped_lines = 0

with gzip.open(logfile, **log_open_args) as f:
    logreader = csv.reader(f, delimiter=' ')
    for line in logreader:
        try:
            ip = line[0]
            ua = line[9]
            uri = line[5].split()[1]
        except IndexError:
            # Truncated line, or a request field without a URI (e.g. "-").
            # Count it and move on instead of crashing mid-file.
            skipped_lines += 1
            continue

        # Once an IP is known to be human nothing below can change that, so
        # skip the (comparatively expensive) network lookup entirely.
        if ip in human_ips:
            continue

        # Parse the address and walk the network table once per distinct IP,
        # not once per log line.
        if ip not in bot_verdicts:
            bot_verdicts[ip] = is_known_good_bot_ip(IPAddress(ip))
        if bot_verdicts[ip]:
            continue

        # Compare the path only, so cache-busting query strings such as
        # "/skin/app.css?v=42" still count as a human asset request.
        if uri.split('?', 1)[0].endswith(human_suffixes):
            human_ips.add(ip)
            robot_ips.pop(ip, None)  # forget earlier hits; no-op if absent
            continue

        # Still a robot candidate: remember the user agent and count the hit.
        entry = robot_ips[ip]
        entry.setdefault('uas', set()).add(ua)
        entry['hitcount'] = entry.get('hitcount', 0) + 1

if skipped_lines:
    sys.stderr.write("skipped %d malformed log lines\n" % skipped_lines)
# Report the suspected robots, busiest first: hit count, IP address, then
# every distinct user agent seen for that IP in sorted order.
for ip, v in sorted(robot_ips.items(),
                    key=lambda item: item[1]['hitcount'],
                    reverse=True):
    # NOTE(review): the separator may originally have carried more padding to
    # align continuation lines under the user-agent column -- confirm.
    uas = '\n '.join(sorted(v['uas']))
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("%6d %15s %s" % (v['hitcount'], ip, uas))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
11835 87.251.44.98 - | |
3595 84.24.71.138 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36 | |
662 78.46.214.6 XMLRPC::Client | |
425 213.93.175.168 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36 | |
329 82.94.254.54 Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/; http://www.wise-guys.nl/) | |
314 46.229.164.98 Mozilla/5.0 (compatible; SemrushBot/0.99~bl; +http://www.semrush.com/bot.html) | |
299 24.132.40.221 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240 | |
297 77.61.82.42 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36 | |
291 86.86.113.234 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko | |
246 173.224.161.129 Wget/1.14 (linux-gnu) | |
202 86.94.152.114 Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36 | |
202 46.229.164.102 Mozilla/5.0 (compatible; SemrushBot/0.99~bl; +http://www.semrush.com/bot.html) | |
184 84.28.255.11 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36 | |
166 84.105.166.103 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; NP06; rv:11.0) like Gecko | |
Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; NP06; rv:11.0) like Gecko | |
163 94.215.223.244 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36 | |
162 23.92.218.106 Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16 | |
155 82.94.254.55 Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/) | |
Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/; http://www.wise-guys.nl/) | |
154 82.72.75.40 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko | |
136 46.229.164.99 Mozilla/5.0 (compatible; SemrushBot/0.99~bl; +http://www.semrush.com/bot.html) | |
114 87.195.135.150 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240 | |
110 81.207.45.123 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240 | |
107 84.105.115.98 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240 | |
107 86.92.61.220 Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; MASMJS; rv:11.0) like Gecko | |
104 205.201.132.14 - | |
MailChimp.com | |
Zend_Http_Client | |
100 91.179.110.41 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; MANM; rv:11.0) like Gecko | |
93 178.132.208.46 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36 | |
88 163.158.34.104 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36 | |
87 77.249.51.233 Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; MDDCJS; rv:11.0) like Gecko | |
83 188.202.64.209 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 | |
76 94.214.250.56 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36 | |
71 46.229.164.101 Mozilla/5.0 (compatible; SemrushBot/0.99~bl; +http://www.semrush.com/bot.html) | |
70 84.81.109.204 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36 | |
65 82.94.254.53 Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/) | |
Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/; http://www.wise-guys.nl/) | |
65 86.91.135.99 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36 | |
64 82.94.214.64 python-requests/2.4.1 CPython/2.7.3 Linux/3.2.0-4-amd64 | |
63 5.132.52.128 Mozilla/5.0 (Linux; Android 5.0; SAMSUNG SM-G900F Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/2.1 Chrome/34.0.1847.76 Mobile Safari/537.36 | |
61 82.157.18.66 Mozilla/5.0 (Windows NT 10.0; Win64; x64; WebView/3.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240 | |
60 86.81.157.167 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko | |
60 163.158.201.166 Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 | |
60 212.23.48.201 - | |
60 77.162.41.141 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240 | |
51 54.215.85.179 Mozilla/5.0 (compatible; ExpertSearchSpider +http://www.expertsearch.nl/spider) | |
44 94.212.229.85 Mozilla/5.0 (Linux; Android 5.1.1; D6603 Build/23.4.A.1.236) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36 | |
44 83.128.125.179 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko | |
43 94.211.191.33 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment