Skip to content

Instantly share code, notes, and snippets.

@jongbinjung
Last active October 11, 2023 17:38
Show Gist options
  • Save jongbinjung/744ecb0a80767b847016f9dfd18684a7 to your computer and use it in GitHub Desktop.
Save jongbinjung/744ecb0a80767b847016f9dfd18684a7 to your computer and use it in GitHub Desktop.
Load and parse data from Gsheet
#!/usr/bin/env python
"""Collect and parse data from Gsheet
Read spreadsheet from https://docs.google.com/spreadsheets/d/{SHEET_ID}
and parse as a polars DataFrame.
This example is using a benign sheet of junk from
https://docs.google.com/spreadsheets/d/1cFVczEL7oznX_G-_VyJv5tgkEPmoVHRsGanzMj1O4nM
but should give you the idea.
Mostly stolen from https://developers.google.com/sheets/api/quickstart/python
Requires:
# For Google Sheets access
google-api-python-client
google-auth-oauthlib
# For parsing to a DataFrame, but can replace with {your-favorite-dataframe-library}
polars
"""
import os.path
from typing import List
import polars as pl
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
RANGE_NAME = "Benign sheet!A:C"
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"]
# This can be found in the spreadsheet's URL
SHEET_ID = "1cFVczEL7oznX_G-_VyJv5tgkEPmoVHRsGanzMj1O4nM"
def get_google_credentials(
oauth_json: str = "credentials.json",
token_json: str = "token.json",
) -> Credentials:
"""Get Google credentials with readonly scope
Args:
oauth_json (str): OAuth 2.0 Client credentials used to authorize token
the first time; one can be created/downloaded from
https://console.cloud.google.com/apis/credentials
token_json (str): filename for read/writing access and refresh token
Returns:
Credentials
"""
creds = None
# The token_json file stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists(token_json):
creds = Credentials.from_authorized_user_file(token_json, SCOPES)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(oauth_json, SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open(token_json, "w") as token:
token.write(creds.to_json())
return creds
def parse_result_values(values: List[str]) -> pl.DataFrame:
"""Parse values from Gsheet, assuming row-major where first row is header
This parser can be modified appropriately to parse whatever the expected
format of the target spreadsheet is.
"""
column_names = values[0]
rows = values[1:]
df = pl.DataFrame(rows, orient="row", schema=column_names)
# By default, everything is a string, so need to cast appropriately
return df.with_columns(pl.col("value").cast(int))
def main():
creds = get_google_credentials()
# Build and call the Sheets API
sheet = build("sheets", "v4", credentials=creds).spreadsheets()
# This returns a dict with keys: ['range', 'majorDimension', 'values']
# By default, the 'majorDimension' is 'ROW'
# (i.e., results['values'] is a list of each row in teh specified range)
result = sheet.values().get(spreadsheetId=SHEET_ID, range=RANGE_NAME).execute()
values = result["values"]
return parse_result_values(values)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment