| field | meaning | 
|---|---|
| remotehost | Remote hostname (or IP address if the DNS hostname is not available or if DNSLookup is off). | 
| rfc931 | The remote logname of the user, if present. | 
| authuser | The username of the remote user after authentication by the HTTP server. | 
| [date] | Date and time of the request. | 
| "request" | The request, exactly as it came from the browser or client. | 
| status | The HTTP status code the server sent back to the client. | 
| bytes | The number of bytes (Content-Length) transferred to the client. | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Count the rows of logs_df in which at least one of the parsed columns
  # is null.
  # NOTE(review): Spark's regexp_extract yields an empty string, not null,
  # when a pattern fails to match, so string columns produced that way may
  # never be null here -- confirm against how logs_df is built.
  checked_columns = ('host', 'timestamp', 'method', 'endpoint',
                     'status', 'content_size', 'protocol')
  any_null = logs_df[checked_columns[0]].isNull()
  for column_name in checked_columns[1:]:
      # OR the per-column checks together, left to right.
      any_null = any_null | logs_df[column_name].isNull()
  bad_rows_df = logs_df.filter(any_null)
  bad_rows_df.count()
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Count the rows of base_df whose 'value' column is null.
  value_is_null = base_df['value'].isNull()
  base_df.filter(value_is_null).count()
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  from pyspark.sql.functions import regexp_extract

  # Build one named column expression per parsed field. Each one applies a
  # previously defined regex to the raw 'value' column and keeps the given
  # capture group.
  host_col = regexp_extract('value', host_pattern, 1).alias('host')
  timestamp_col = regexp_extract('value', ts_pattern, 1).alias('timestamp')
  # The request pattern has three groups: method, endpoint and protocol.
  method_col = regexp_extract('value', method_uri_protocol_pattern, 1).alias('method')
  endpoint_col = regexp_extract('value', method_uri_protocol_pattern, 2).alias('endpoint')
  protocol_col = regexp_extract('value', method_uri_protocol_pattern, 3).alias('protocol')
  # Status and content size are cast from the extracted text to integers.
  status_col = regexp_extract('value', status_pattern, 1).cast('integer').alias('status')
  content_size_col = (regexp_extract('value', content_size_pattern, 1)
                      .cast('integer')
                      .alias('content_size'))

  logs_df = base_df.select(host_col, timestamp_col, method_col, endpoint_col,
                           protocol_col, status_col, content_size_col)
  logs_df.show(10, truncate=True)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Capture the run of digits at the very end of a line, preceded by
  # whitespace.
  content_size_pattern = r'\s(\d+)$'
  # re.search returns None when a line does not end in whitespace followed
  # by digits; fall back to 'no match' (as the host and request extractions
  # do) instead of raising AttributeError on .group(1).
  content_size = [found.group(1) if found else 'no match'
                  for found in (re.search(content_size_pattern, item)
                                for item in sample_logs)]
  print(content_size)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Capture the first three-digit number that has whitespace on both sides.
  status_pattern = r'\s(\d{3})\s'
  # re.search returns None for a line with no such number; fall back to
  # 'no match' (as the host and request extractions do) instead of raising
  # AttributeError on .group(1).
  status = [found.group(1) if found else 'no match'
            for found in (re.search(status_pattern, item)
                          for item in sample_logs)]
  print(status)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Inside a double-quoted section, capture three whitespace-separated
  # tokens; the third group may be empty.
  method_uri_protocol_pattern = r'\"(\S+)\s(\S+)\s*(\S*)\"'
  # Search each sample line once; keep the tuple of groups on a match and
  # the literal 'no match' otherwise.
  request_matches = (re.search(method_uri_protocol_pattern, item)
                     for item in sample_logs)
  method_uri_protocol = [found.groups() if found else 'no match'
                         for found in request_matches]
  method_uri_protocol
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Capture a bracketed timestamp of the form dd/Mon/yyyy:hh:mm:ss -zzzz.
  # The pattern only accepts a '-' sign before the four-digit offset.
  ts_pattern = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
  # re.search returns None for a line without such a timestamp; fall back
  # to 'no match' (as the host and request extractions do) instead of
  # raising AttributeError on .group(1).
  timestamps = [found.group(1) if found else 'no match'
                for found in (re.search(ts_pattern, item)
                              for item in sample_logs)]
  timestamps
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Capture the leading non-whitespace token of a line, provided it
  # contains at least one '.', up to the whitespace that follows it.
  # NOTE(review): inside [...] the '+' and '.' are literal characters, so
  # the class [\S+\.] is effectively just \S -- confirm this is intended.
  host_pattern = r'(^\S+\.[\S+\.]+\S+)\s'
  # Search each sample line once; keep group 1 on a match and the literal
  # 'no match' otherwise.
  host_matches = (re.search(host_pattern, item) for item in sample_logs)
  hosts = [found.group(1) if found else 'no match' for found in host_matches]
  hosts
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Pull the first 15 rows of base_df to the driver and keep only the raw
  # text held in each row's 'value' field.
  first_rows = base_df.take(15)
  sample_logs = [row['value'] for row in first_rows]
  sample_logs