Last active
March 17, 2022 10:57
-
-
Save korakot/72beb312389e2c41ad49d34e1658fcc2 to your computer and use it in GitHub Desktop.
Using bigquery easily in Google Colab. Focus on a single table.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import pandas as pd | |
from google.cloud import bigquery | |
from google.colab import auth | |
PROJECT_ID = 'sql-hunt' # change to your own project | |
class BqTable: | |
def __init__(self, table, dataset='samples', active_project='bigquery-public-data', client=None): | |
if client: # allow reuse client from other table | |
c = client | |
else: | |
c = bigquery.Client(project=PROJECT_ID) # for connection only | |
self.client = c | |
self.project = active_project | |
self.dataset = dataset | |
self.table = table | |
self.full_name = f"`{active_project}.{dataset}.{table}`" | |
# get real table object | |
self.d = c.get_dataset(c.dataset(dataset, active_project)) | |
self.t = c.get_table(self.d.table(table)) | |
self.schema = self.t.schema | |
self.max_gb = 1. # change it to set limit | |
def query(self, query): | |
query_size = self.query_size(query) | |
if query_size <= self.max_gb: | |
job = self.client.query(query) | |
rows = list(job.result(timeout=30)) | |
if len(rows) == 0: | |
print("Query returned no rows.") | |
return None | |
columns = list(rows[0]._xxx_field_to_index.keys()) | |
return pd.DataFrame( | |
data=[list(x.values()) for x in rows], columns=columns) | |
print(f"Query cancelled; estimated size of {query_size} exceeds limit of {self.max_gb} GB") | |
def select(self, columns, where=None, group_by=None, order_by=None, limit=None): | |
query = f"SELECT {columns} FROM {self.full_name}" | |
if where: | |
if not re.match('WHERE|GROUP|ORDER', where, re.I): | |
where = 'WHERE ' + where | |
query += " " + where | |
if group_by: | |
query += " GROUP BY " + group_by | |
if order_by: | |
query += " ORDER BY " + order_by | |
if limit: | |
query += " LIMIT " + str(limit) | |
return self.query(query) | |
def query_size(self, query): | |
config = bigquery.QueryJobConfig() | |
config.dry_run = True | |
job = self.client.query(query, job_config=config) | |
return job.total_bytes_processed / 2**30 # GB | |
def head(self, num_rows=5, start_index=None, columns=None): | |
if type(columns)==str: | |
columns = re.split(', *', columns) | |
schema = None | |
if columns: | |
schema = [col for col in self.t.schema if col.name in columns] | |
results = self.client.list_rows(self.t, selected_fields=schema, | |
max_results=num_rows, start_index=start_index) | |
rows = [x for x in results] | |
columns = list(rows[0]._xxx_field_to_index.keys()) | |
return pd.DataFrame( | |
data=[list(x.values()) for x in rows], columns=columns) | |
@property | |
def size(self): | |
query = f""" | |
SELECT SUM(size_bytes) /POW(2,30) | |
FROM `{self.project}.{self.dataset}.__TABLES__` | |
WHERE table_id='{self.table}' | |
""" | |
job = self.client.query(query) | |
gb = list(job.result())[0][0] | |
return gb | |
def __str__(self): | |
'''Use in query e.g. SELECT f FROM {tbl}''' | |
return self.full_name | |
def __repr__(self): | |
params = f'"{self.table}", "{self.dataset}"' | |
if self.project != 'bigquery-public-data': | |
params += f', "self.project"' | |
return f'BqTable({params})' | |
def view(self, columns, where=None, group_by=None, order_by=None, limit=None): | |
'''Same as select(), but for WITH-AS''' | |
query = f"SELECT {columns} FROM {self.full_name}" | |
if where: | |
if not re.match('WHERE|GROUP|ORDER', where, re.I): | |
where = 'WHERE ' + where | |
query += " " + where | |
if group_by: | |
query += " GROUP BY " + group_by | |
if order_by: | |
query += " ORDER BY " + order_by | |
if limit: | |
query += " LIMIT " + str(limit) | |
return '('+ query +')' |
Now add __str__() for easily reference to its full_name.
Also add .view() to use with WITH_AS statement.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage:
Easy to select from the table
The result is shown as a pandas table.