Skip to content

Instantly share code, notes, and snippets.

@gwillem
Last active November 29, 2015 16:20
Show Gist options
  • Save gwillem/ce4ff60179ff8f802be2 to your computer and use it in GitHub Desktop.
Save gwillem/ce4ff60179ff8f802be2 to your computer and use it in GitHub Desktop.
Parse an HTTP log file to find disguised robots
import gzip
import csv
import sys
from netaddr import IPNetwork, IPAddress
from collections import defaultdict
from pprint import pprint
"""
20151129 [email protected]
Written to verify Peter Jaap's suspicions about disguised crawlers from competitors
"""
# Access log to analyse: the first command-line argument when one is given,
# otherwise the hard-coded default below.
DEFAULT_LOGFILE = '/home/users/masuxftp/weblogs/raw/20151118-access.log.gz'
logfile = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_LOGFILE
# Networks whose traffic is trusted: the log scan skips every request that
# originates from one of these ranges. The CIDR strings are converted to
# IPNetwork objects once, up front, so membership tests are cheap later on.
known_good_bots = list(map(IPNetwork, (
    # google
    '64.18.0.0/20',
    '64.233.160.0/19',
    '66.102.0.0/20',
    '66.249.64.0/19',
    '72.14.192.0/18',
    '74.125.0.0/16',
    '108.177.8.0/21',
    '173.194.0.0/16',
    '207.126.144.0/20',
    '209.85.128.0/17',
    '216.58.192.0/19',
    '216.239.32.0/19',
    # yahoo
    '68.180.128.0/17',
    # microsoft
    '207.46.0.0/16',
    '157.54.0.0/15',
    '157.56.0.0/14',
    '157.60.0.0/16',
    '40.96.0.0/12',
    '40.74.0.0/15',
    '40.125.0.0/17',
    '40.124.0.0/16',
    '40.76.0.0/14',
    '40.120.0.0/14',
    '40.112.0.0/13',
    '40.80.0.0/12',
)))
# Bookkeeping filled in while the log is scanned.
# robot_ips: ip -> {'uas': set of user-agent strings, 'hitcount': int}
robot_ips = defaultdict(dict)
# human_ips: IPs that requested at least one URI ending in a human_ext suffix
human_ips = set()
# URI suffixes whose download marks the requesting IP as a real browser
human_ext = ('.css', '.js')
def is_known_good_bot_ip(ip, networks=None):
    """Return True when *ip* lies inside one of the trusted networks.

    ip       -- an address object that supports ``ip in network``
                (the script passes netaddr.IPAddress instances).
    networks -- optional iterable of network objects to test against;
                when omitted, the module-level ``known_good_bots`` is used.

    Returns False when no network contains the address, including when
    *networks* is empty.
    """
    if networks is None:
        networks = known_good_bots
    return any(ip in cidr for cidr in networks)
# Start-up self-check: one address from the ranges labelled "google" and one
# from those labelled "microsoft" must be recognised, otherwise abort early.
assert all(
    is_known_good_bot_ip(IPAddress(sample))
    for sample in ('66.249.78.125', '157.55.39.99')
)
def _open_log(path):
    """Open the gzipped log at *path* in the mode the csv module needs."""
    if sys.version_info[0] < 3:
        # Python 2: csv.reader consumes byte strings.
        return gzip.open(path, 'rb')
    # Python 3: csv.reader consumes text. Undecodable bytes are replaced so a
    # single odd request cannot abort the whole run.
    return gzip.open(path, 'rt', encoding='utf-8', errors='replace',
                     newline='')


# Scan the access log. Every IP outside the trusted networks that never
# fetches a stylesheet or script is collected in robot_ips together with
# the user agents it sent and the number of requests it made.
asset_suffixes = tuple(human_ext)
with _open_log(logfile) as f:
    for fields in csv.reader(f, delimiter=' '):
        try:
            ip = fields[0]
            ua = fields[9]
            uri = fields[5].split()[1]
        except IndexError:
            # Blank or truncated lines, and request fields without a
            # "METHOD URI ..." shape, cannot be classified: skip them
            # instead of crashing half-way through the log.
            continue

        # Once an IP has fetched an asset it stays classified as human.
        if ip in human_ips:
            continue

        # An IP already present in robot_ips passed the whitelist check on
        # an earlier line, so only unseen IPs need the CIDR scan.
        if ip not in robot_ips and is_known_good_bot_ip(IPAddress(ip)):
            continue

        # Drop the query string so "/skin/style.css?v=2" still counts as a
        # stylesheet request.
        path = uri.split('?', 1)[0]
        if path.endswith(asset_suffixes):
            human_ips.add(ip)
            robot_ips.pop(ip, None)  # no-op when the IP was not tracked yet
            continue

        stats = robot_ips[ip]
        stats.setdefault('uas', set()).add(ua)
        stats['hitcount'] = stats.get('hitcount', 0) + 1

# Report the suspects, busiest first; every user agent an IP used is listed.
suspects = sorted(robot_ips.items(),
                  key=lambda item: item[1]['hitcount'],
                  reverse=True)
for ip, stats in suspects:
    uas = '\n '.join(sorted(stats['uas']))
    print("%6d %15s %s" % (stats['hitcount'], ip, uas))
11835 87.251.44.98 -
3595 84.24.71.138 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36
662 78.46.214.6 XMLRPC::Client
425 213.93.175.168 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36
329 82.94.254.54 Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/; http://www.wise-guys.nl/)
314 46.229.164.98 Mozilla/5.0 (compatible; SemrushBot/0.99~bl; +http://www.semrush.com/bot.html)
299 24.132.40.221 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240
297 77.61.82.42 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36
291 86.86.113.234 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
246 173.224.161.129 Wget/1.14 (linux-gnu)
202 86.94.152.114 Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36
202 46.229.164.102 Mozilla/5.0 (compatible; SemrushBot/0.99~bl; +http://www.semrush.com/bot.html)
184 84.28.255.11 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36
166 84.105.166.103 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; NP06; rv:11.0) like Gecko
Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; NP06; rv:11.0) like Gecko
163 94.215.223.244 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36
162 23.92.218.106 Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16
155 82.94.254.55 Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/)
Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/; http://www.wise-guys.nl/)
154 82.72.75.40 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
136 46.229.164.99 Mozilla/5.0 (compatible; SemrushBot/0.99~bl; +http://www.semrush.com/bot.html)
114 87.195.135.150 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240
110 81.207.45.123 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240
107 84.105.115.98 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240
107 86.92.61.220 Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; MASMJS; rv:11.0) like Gecko
104 205.201.132.14 -
MailChimp.com
Zend_Http_Client
100 91.179.110.41 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; MANM; rv:11.0) like Gecko
93 178.132.208.46 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36
88 163.158.34.104 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36
87 77.249.51.233 Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; MDDCJS; rv:11.0) like Gecko
83 188.202.64.209 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36
76 94.214.250.56 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36
71 46.229.164.101 Mozilla/5.0 (compatible; SemrushBot/0.99~bl; +http://www.semrush.com/bot.html)
70 84.81.109.204 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36
65 82.94.254.53 Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/)
Mozilla/4.0 (compatible; Vagabondo/4.0; webcrawler at wise-guys dot nl; http://webagent.wise-guys.nl/; http://www.wise-guys.nl/)
65 86.91.135.99 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36
64 82.94.214.64 python-requests/2.4.1 CPython/2.7.3 Linux/3.2.0-4-amd64
63 5.132.52.128 Mozilla/5.0 (Linux; Android 5.0; SAMSUNG SM-G900F Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/2.1 Chrome/34.0.1847.76 Mobile Safari/537.36
61 82.157.18.66 Mozilla/5.0 (Windows NT 10.0; Win64; x64; WebView/3.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240
60 86.81.157.167 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
60 163.158.201.166 Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1
60 212.23.48.201 -
60 77.162.41.141 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240
51 54.215.85.179 Mozilla/5.0 (compatible; ExpertSearchSpider +http://www.expertsearch.nl/spider)
44 94.212.229.85 Mozilla/5.0 (Linux; Android 5.1.1; D6603 Build/23.4.A.1.236) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36
44 83.128.125.179 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
43 94.211.191.33 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment