Skip to content

Instantly share code, notes, and snippets.

View CharlyWargnier's full-sized avatar
🎈

Charly Wargnier CharlyWargnier

🎈
View GitHub Profile
# Import libraries
import pandas as pd
import numpy as np
import socket
import json
#Connect Python to your GCP project
from google.colab import auth
auth.authenticate_user()
GCS_Bucket_Name = 'apache_seo_logs' #@param {type:"string"}
GCS_File_Path = 'Loggy*' #@param {type:"string"}
GCS_Full_Path = "gs://" + GCS_Bucket_Name + "/" + GCS_File_Path
print("the path to your GCS folder is: " + GCS_Full_Path)
# Import BigQuery from google.cloud
from google.cloud import bigquery
## Inputs from forms ##
GCP_PROJECT_ID = 'your_project_ID_here' #@param {type:"string"}
GCS_Bucket_Location = 'europe-west2' #@param ["asia-east1", "asia-east2", "asia-northeast1", "asia-northeast2", "asia-northeast3", "asia-south1", "asia-southeast1", "australia-southeast1", "europe-north1", "europe-west1", "europe-west2", "europe-west3", "europe-west4", "europe-west6", "northamerica-northeast1", "southamerica-east1", "us-central1", "us-east1", "us-east4", "us-west1", "us-west2"]
# Create a client instance for your project
## Form elements ##
table_name = 'log_sample' #@param {type:"string"}
SQL_1st_Filter ='ch.loggy' #@param {type:"string"}
SQL_Useragent_Filter = "Googlebot/2.1" #@param ["Googlebot/2.1", "YandexBot", "BingBot", "DuckDuckBot", "Baiduspider"] {allow-input: true}
# Temporary Bigquery table name
table_id = table_name
## Full SQL query ##
# Note that we convert a list of lists, not just a list.
df = pd.DataFrame.from_records(log_sample)
df.info()
df.rename(columns={0:'header'}, inplace=True)
# Split IP addresses
df[['IP','header']] = df["header"].str.split(" - - \[", 1, expand=True)
# Remove slash
df['IP'] = df['IP'].str.replace("Row\(\('","",n=1)
# Split Date & Time
df[['Date_Time', 'header']] = df['header'].str.split("\] ", 1,
expand=True)
df['Date_Time'] = df['Date_Time'].str.replace(':.*', '')
# Convert the Date_Time column from string to datetime format
df['Date'] = pd.to_datetime(df['Date_Time'], errors='coerce',
infer_datetime_format=True)
# Remove Time column, for clarity's sake
df[['Domain','header']] = df["header"].str.split(" \"", 1, expand=True)
df[['Method','header']] = df["header"].str.split(" ", 1, expand=True)