Skip to content

Instantly share code, notes, and snippets.

View dipanjanS's full-sized avatar
:octocat:

Dipanjan (DJ) Sarkar dipanjanS

:octocat:
View GitHub Profile
# A row is "bad" when at least one of the parsed fields is null.
# The per-column null checks are OR-ed together in a fixed order and the
# resulting predicate is handed to filter().
has_null_field = (
    logs_df['host'].isNull()
    | logs_df['timestamp'].isNull()
    | logs_df['method'].isNull()
    | logs_df['endpoint'].isNull()
    | logs_df['status'].isNull()
    | logs_df['content_size'].isNull()
    | logs_df['protocol'].isNull()
)
bad_rows_df = logs_df.filter(has_null_field)
# Number of rows with at least one null field.
bad_rows_df.count()
# Count the raw rows whose single 'value' column is null.
null_value_rows = base_df.filter(base_df['value'].isNull())
null_value_rows.count()
from pyspark.sql.functions import regexp_extract

# One column expression per parsed field. Each applies its regex to the raw
# 'value' column and keeps the given capture group; status and content_size
# are additionally cast to integer.
parsed_columns = [
    regexp_extract('value', host_pattern, 1).alias('host'),
    regexp_extract('value', ts_pattern, 1).alias('timestamp'),
    # The request pattern has three groups: method, endpoint and protocol.
    regexp_extract('value', method_uri_protocol_pattern, 1).alias('method'),
    regexp_extract('value', method_uri_protocol_pattern, 2).alias('endpoint'),
    regexp_extract('value', method_uri_protocol_pattern, 3).alias('protocol'),
    regexp_extract('value', status_pattern, 1).cast('integer').alias('status'),
    regexp_extract('value', content_size_pattern, 1)
    .cast('integer')
    .alias('content_size'),
]
logs_df = base_df.select(*parsed_columns)
# Preview the first 10 parsed rows.
logs_df.show(10, truncate=True)
# The content size is the run of digits at the very end of the line,
# preceded by whitespace.
content_size_pattern = r'\s(\d+)$'
# re.search() returns None when a line does not end in digits; report such
# lines as 'no match' (as the host and request extractions do) instead of
# failing with AttributeError on None.group().
content_size = [match.group(1) if match else 'no match'
                for match in (re.search(content_size_pattern, item)
                              for item in sample_logs)]
print(content_size)
# The status code is the first whitespace-delimited run of exactly three
# digits in the line.
status_pattern = r'\s(\d{3})\s'
# re.search() returns None for a line without such a token; report it as
# 'no match' (as the host and request extractions do) instead of failing
# with AttributeError on None.group().
status = [match.group(1) if match else 'no match'
          for match in (re.search(status_pattern, item)
                        for item in sample_logs)]
print(status)
# The quoted request: "<method> <endpoint> <protocol>". The protocol group
# may be empty because it is matched with \S*.
method_uri_protocol_pattern = r'\"(\S+)\s(\S+)\s*(\S*)\"'
# Search each sample line once; keep the tuple of the three captured groups,
# or the placeholder 'no match' when the line has no quoted request.
method_uri_protocol = [found.groups() if found else 'no match'
                       for found in (re.search(method_uri_protocol_pattern, line)
                                     for line in sample_logs)]
method_uri_protocol
# The bracketed timestamp, e.g. [dd/Mon/yyyy:hh:mm:ss -0400]. The UTC offset
# may carry either sign, so accept '+' as well as '-'.
ts_pattern = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})]'
# re.search() returns None for a line without a timestamp; report it as
# 'no match' (as the host and request extractions do) instead of failing
# with AttributeError on None.group().
timestamps = [match.group(1) if match else 'no match'
              for match in (re.search(ts_pattern, item)
                            for item in sample_logs)]
timestamps
# The host is the leading non-whitespace token of the line and must contain
# at least one dot. Note that the character class [\S+\.] matches any
# non-whitespace character; the '+' and '\.' inside it are redundant.
host_pattern = r'(^\S+\.[\S+\.]+\S+)\s'
# Search each sample line once; keep the captured host, or the placeholder
# 'no match' when the line does not start with a dotted host.
hosts = [found.group(1) if found else 'no match'
         for found in (re.search(host_pattern, line)
                       for line in sample_logs)]
hosts
# Pull the first 15 rows to the driver and keep only the raw text of each,
# as a small sample for trying out the regular expressions.
first_rows = base_df.take(15)
sample_logs = [row['value'] for row in first_rows]
sample_logs
field meaning
remotehost Remote hostname (or IP number if DNS hostname is not available or if DNSLookup is off).
rfc931 The remote logname of the user, if present.
authuser The username of the remote user after authentication by the HTTP server.
[date] Date and time of the request.
"request" The request, exactly as it came from the browser or client.
status The HTTP status code the server sent back to the client.
bytes The number of bytes (Content-Length) transferred to the client.