-
-
Save hreeder/f1ffe1408d296ce0591d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Print selected fields from every nginx access-log line found in INPUT_DIR.

Each regular file in INPUT_DIR is read line by line (files ending in ``.gz``
are decompressed on the fly).  Lines that match ``lineformat`` are printed as
space-separated fields; lines that do not match are skipped silently.
"""
import gzip
import os
import re
import sys

INPUT_DIR = "nginx-logs"

# Access-log pattern.  Only GET/POST requests over HTTP/1.1 match.
# NOTE: the HTTP method is an unnamed group and is read by position (group 6);
# adding or removing a capturing group before it changes that index.
lineformat = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/1\.1")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (["](?P<refferer>(\-)|(.+))["]) (["](?P<useragent>.+)["])""", re.IGNORECASE)

# Position of the unnamed (GET|POST) group inside ``lineformat``.
_METHOD_GROUP = 6


def parse_line(line):
    """Extract the printed fields from one access-log line.

    Args:
        line: A single log line as text.

    Returns:
        A tuple ``(ip, datetime, url, bytes_sent, referrer, user_agent,
        status, method)`` of strings, or ``None`` when the line does not
        match ``lineformat``.  The ``url`` value keeps the trailing space
        that precedes the protocol token, because the pattern's ``.+``
        consumes it.
    """
    match = lineformat.search(line)
    if match is None:
        return None
    fields = match.groupdict()
    return (
        fields["ipaddress"],
        fields["dateandtime"],
        fields["url"],
        fields["bytessent"],
        fields["refferer"],  # group name is misspelled in the pattern itself
        fields["useragent"],
        fields["statuscode"],
        match.group(_METHOD_GROUP),
    )


def _open_log(path):
    """Open *path* for text reading, decompressing ``.gz`` files."""
    # Text mode yields str lines, which is what the str pattern requires;
    # undecodable bytes are replaced instead of aborting the whole run.
    if path.endswith(".gz"):
        return gzip.open(path, "rt", encoding="utf-8", errors="replace")
    return open(path, encoding="utf-8", errors="replace")


def main(input_dir=INPUT_DIR):
    """Parse every regular file in *input_dir* and print the matched fields."""
    for name in os.listdir(input_dir):
        path = os.path.join(input_dir, name)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as log files.
            continue
        # ``with`` closes the handle even if reading or printing fails.
        with _open_log(path) as logfile:
            for line in logfile:
                record = parse_line(line)
                if record is not None:
                    print(*record)


if __name__ == "__main__":
    main()
You need to import the "re" package first and then use re.search
import re
data = re.search(lineformat, l)
I deleted my question... I was able to get that out of the way: there was an HTML file in my folder which caused it. Currently, my output displays "None".
Can you assist me? I also want to add upstream_response_time.
My current format is ((( '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';)))
how about this one?
# Variant pattern: the referrer is either a bare "-" or a double-quoted string
# (the quotes are part of the captured "refferer" value), and neither the
# referrer nor the user agent may contain an embedded double quote.
lineformat = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/1\.1")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""", re.IGNORECASE)
Much better. This one doesn't mess up the user agent and referrer.
Need help... Using Python 3... Getting "cannot use a string pattern on a bytes-like object" from the re.search() call
You need to import the "re" package first and then use re.search
import re
data = re.search(lineformat, l)
use l.decode() for python3
data = re.search(lineformat, l.decode())
# Variant pattern: accepts GET, POST, HEAD, PUT and DELETE requests over
# HTTP/1.1 or HTTP/2.0; the referrer is a bare "-" or a double-quoted string
# and the user agent may not contain an embedded double quote.
lineformat = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST|HEAD|PUT|DELETE) )(?P<url>.+)(http\/(1\.1|2\.0)")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""", re.IGNORECASE)
Based on @mohit6522's version, I have also added:
- more HTTP Verbs
- HTTP 2.0
It should increase match rate.
# Variant pattern: the "ipaddress" group accepts either a dotted-quad IPv4
# address or one of several IPv6 textual forms.
# NOTE: the IPv6 alternatives add many unnamed capturing groups ahead of the
# request part, so code that reads the HTTP method by position (for example
# data.group(6)) no longer gets the method with this pattern.
lineformat = re.compile(
r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST|HEAD|PUT|DELETE) )(?P<url>.+)(http\/(1\.1|2\.0)")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""",
re.IGNORECASE,
)
Added ipv6 matching.
check this one!
log_format = r'(?P<remote_addr>\d+.\d+.\d+.\d+)\s+\S+\s+\S+\s+\[(?P<time_local>[^\]]+)\]\s+"(?P<request>[^"]+)"\s+(?P<status>\d+)\s+(?P<bytes_sent>\d+)\s+"(?P<http_referer>[^"]+)+"\s+"(?P<user_agent>(?!http)[^"]*)"'
check my project on github https://github.com/ksn-developer/logbrain.git
You need to import the "re" package first and then use re.search
import re
data = re.search(lineformat, l)