Last active
March 5, 2020 20:57
-
-
Save notriddle/1b9e48f6ed9d744bb76265b74c7321fd to your computer and use it in GitHub Desktop.
Sorts out the unique non-robot IP addresses that requested style.css
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Sorts out the unique non-robot IP addresses that requested style.css
# from yesterday's rotated nginx access log.

# Step 1: build a list of robot IP addresses.
# Anything that fetched robots.txt is assumed to be a bot; stash the IPs
# in a file for later. (Dot escaped so the regex matches the literal
# filename rather than any character.)
awk '/robots\.txt/ {print $1}' /var/log/nginx/access.log.1 | sort -u > robots-ips.txt

# Also add a few known bots that don't necessarily get robots.txt from the
# same IP. NOTE: this must APPEND (>>) — using '>' here would clobber the
# list built in step 1, silently discarding all robots.txt-derived IPs.
grep -iE '(bingbot|BingPreview|msnbot|adbeat|ArchiveBot|YandexBot|Googlebot|Pinterestbot|Spider)' /var/log/nginx/access.log.1 \
  | awk '{print $1}' | sort -u >> robots-ips.txt

# Step 2: list the non-robot IP addresses, ranked by request count.
# This one is comparatively complicated, so let's dissect the awk syntax:
#   NR==FNR        -- FNR is the line number ("record number") in the current
#                     file, while NR is the line number processed in total.
#                     If these are equal, then we're on the first file.
#   {a[$1]=1;next} -- we want to load the robots-ips.txt file into an array,
#                     and avoid the rest of the processing
#   $1 in a {next} -- if we reached this, then we're on the second file, and
#                     if it is in the array, skip it
#   /style.css/ {print $1} -- if it requested the stylesheet, then it's not
#                     a robot, or at least it's trying very hard to pretend
#                     not to be a robot
awk 'NR==FNR {a[$1]=1;next} $1 in a {next} /style.css/ {print $1}' robots-ips.txt /var/log/nginx/access.log.1 \
  | sort | uniq -c | sort -fnr | less
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment