Created
November 7, 2019 18:32
-
-
Save nickcjohnston/0d95d9c3ad8aa6bf1798d7ed708213d9 to your computer and use it in GitHub Desktop.
python log data mining
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
"""Mine a web server access log for resources that are requested together.

A "browsing session" is approximated as all traffic from a single IP
address. That is not ideal (NAT, proxies, and returning visitors all blur
it), but it is good enough for playing around.

Usage: pass the path of the access log as the first command-line argument.
"""
import sys
from collections import Counter
from collections import defaultdict

# Positions of the fields of interest after splitting a log line on
# whitespace: field 0 is the client IP address, field 6 is the requested URL.
IP_FIELD = 0
URL_FIELD = 6


def parse_requests(log_lines):
    """Return a list of (ip, url) pairs, one per usable log line.

    Lines that have too few whitespace-separated fields to contain a URL
    (blank lines, truncated entries) are skipped instead of raising
    IndexError.
    """
    requests = []
    for raw_line in log_lines:
        fields = raw_line.split()
        if len(fields) > URL_FIELD:
            requests.append((fields[IP_FIELD], fields[URL_FIELD]))
    return requests


def find_frequent_sets(log_lines, threshold=0.019, frequent_set_size=10):
    """Find groups of frequently requested URLs seen together in a session.

    log_lines is any iterable of access-log lines (an open file works).
    threshold is the minimum fraction of all parsed requests a URL must
    account for to be considered "frequent". The default of 1.9% was chosen
    by the original author because their sample log produced no results at
    2% or higher.
    frequent_set_size is the minimum number of frequent URLs a session must
    contain for its set to be reported.

    Returns a sorted list of distinct tuples; each tuple holds the sorted
    frequent URLs requested by at least one session. Returns an empty list
    when no usable requests were found.
    """
    requests = parse_requests(log_lines)
    # Total number of requests (not distinct ones): one per parsed line.
    num_requests = len(requests)
    if not num_requests:
        return []

    # {ip: {urls requested by that ip}}; only membership matters later, so a
    # set per session is enough.
    sessions = defaultdict(set)
    for ip, url in requests:
        sessions[ip].add(url)

    # How many times did each URL appear?
    url_counts = Counter(url for _, url in requests)

    # Which URLs make up at least `threshold` of all requests?
    frequent = {
        url
        for url, count in url_counts.items()
        if count / num_requests >= threshold
    }

    # Of those frequent URLs, which sets of frequent_set_size or more are
    # seen together in one session? Storing sorted tuples in a set removes
    # duplicates across sessions.
    frequent_sets = set()
    for session_urls in sessions.values():
        common = frequent & session_urls
        if len(common) >= frequent_set_size:
            frequent_sets.add(tuple(sorted(common)))

    # Sort so the output order is stable from run to run.
    return sorted(frequent_sets)


def main(argv=None):
    """Print the "commonly requested together" sets for the given log file.

    argv defaults to the command-line arguments after the script name; only
    the first one (the log file path) is used. Returns a process exit code.
    Might be useful for baseline generation and outlier identification.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        print(f"usage: {sys.argv[0]} ACCESS_LOG", file=sys.stderr)
        return 2

    # The context manager closes the log file even if parsing fails.
    with open(args[0], "r") as log_file:
        frequent_sets = find_frequent_sets(log_file)

    for frequent_set in frequent_sets:
        print(list(frequent_set))
    return 0


if __name__ == "__main__":
    sys.exit(main())
# Sample output from an old wordpress site | |
# ['/', '/favicon.ico', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css'] | |
# ['/', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css'] | |
# ['/', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/system-menus.css', '/libs/system.css', '/robots.txt'] | |
# ['/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css', '/robots.txt'] | |
# ['/', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css', '/robots.txt'] | |
# ['/', '/favicon.ico', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css'] | |
# ['/favicon.ico', '/libs/content-module.css', '/libs/defaults.css', '/libs/htmlelements.css', '/libs/layout.css', '/libs/nice_menus.css', '/libs/nice_menus_default.css', '/libs/panels.css', '/libs/style.css', '/libs/system-menus.css', '/libs/system.css'] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment