Skip to content

Instantly share code, notes, and snippets.

@nickcjohnston
Created November 7, 2019 18:32
Show Gist options
  • Save nickcjohnston/0d95d9c3ad8aa6bf1798d7ed708213d9 to your computer and use it in GitHub Desktop.
Save nickcjohnston/0d95d9c3ad8aa6bf1798d7ed708213d9 to your computer and use it in GitHub Desktop.
python log data mining
#!/usr/bin/python3
# Script assumes a "browsing session" is all traffic from a unique IP.
# Yes, it's not ideal. I'm just playing around.
import sys
from collections import defaultdict
from collections import Counter
# Positions of the fields this script reads from a whitespace-split log line.
IP_FIELD = 0
URL_FIELD = 6

# Minimum share of all requests a URL needs in order to count as "frequent".
# The threshold was chosen because there are no results at 2% or higher.
THRESHOLD = 0.019

# A session is reported only when it touches MORE than this many frequent
# URLs (strict comparison, matching the sample output below).
FREQUENT_SET_SIZE = 10


def read_requests(log_path):
    """Parse the log at *log_path* into a list of ``(ip, url)`` pairs.

    Each line is split on whitespace; field 0 is taken as the client IP and
    field 6 as the requested URL. Lines with fewer than seven fields (blank
    or truncated lines) are skipped instead of raising ``IndexError``.
    """
    requests = []
    # errors="replace": one undecodable byte must not abort the whole run.
    with open(log_path, "r", errors="replace") as log_file:
        for raw_line in log_file:
            fields = raw_line.split()
            if len(fields) > URL_FIELD:
                requests.append((fields[IP_FIELD], fields[URL_FIELD]))
    return requests


def group_by_session(requests):
    """Group requested URLs by client IP.

    Returns a dict of the form ``{ip: [urls, requested, by, that, ip]}``.
    URLs are kept in request order, duplicates included.
    """
    sessions = defaultdict(list)
    for ip, url in requests:
        sessions[ip].append(url)
    return sessions


def frequent_urls(requests, threshold=THRESHOLD):
    """Return the set of URLs making up at least *threshold* of all requests.

    *requests* is a list of ``(ip, url)`` pairs; the share of a URL is its
    number of occurrences divided by the total number of requests.
    """
    num_requests = len(requests)
    url_counts = Counter(url for _, url in requests)
    # With no requests the counter is empty, so the division never runs.
    return {
        url
        for url, count in url_counts.items()
        if count / num_requests >= threshold
    }


def frequent_url_sets(sessions, frequent, min_size=FREQUENT_SET_SIZE):
    """Return the distinct groups of frequent URLs seen together in a session.

    For every session the URLs it requested are intersected with *frequent*;
    intersections holding more than *min_size* URLs are kept. Each group is
    returned as a sorted tuple so that identical groups from different
    sessions collapse into a single entry of the resulting set.
    """
    url_sets = set()
    for session_urls in sessions.values():
        shared = frequent & set(session_urls)
        if len(shared) > min_size:
            url_sets.add(tuple(sorted(shared)))
    return url_sets


def main(argv):
    """Print the groups of frequent URLs requested together per session.

    *argv* is the full argument vector; ``argv[1]`` is the access-log path.
    Exits with a usage message when the path is missing.
    """
    if len(argv) < 2:
        sys.exit("usage: python3 <script> <access_log>")

    requests = read_requests(argv[1])
    sessions = group_by_session(requests)
    frequent = frequent_urls(requests, THRESHOLD)

    # Print list of "commonly requested together during a browsing session"
    # resources. Maybe good for baseline generation and outlier identification?
    # Sorted so the output order is stable from run to run.
    for url_set in sorted(frequent_url_sets(sessions, frequent, FREQUENT_SET_SIZE)):
        print(f"{list(url_set)}")


if __name__ == "__main__":
    main(sys.argv)
# Sample output from an old wordpress site
# ['/', '/favicon.ico', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css']
# ['/', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css']
# ['/', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/system-menus.css', '/libs/system.css', '/robots.txt']
# ['/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css', '/robots.txt']
# ['/', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css', '/robots.txt']
# ['/', '/favicon.ico', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css']
# ['/favicon.ico', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment