Last active
October 11, 2023 17:38
-
-
Save jongbinjung/744ecb0a80767b847016f9dfd18684a7 to your computer and use it in GitHub Desktop.
Load and parse data from Gsheet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Collect and parse data from Gsheet | |
Read spreadsheet from https://docs.google.com/spreadsheets/d/{SHEET_ID} | |
and parse as a polars DataFrame. | |
This example is using a benign sheet of junk from | |
https://docs.google.com/spreadsheets/d/1cFVczEL7oznX_G-_VyJv5tgkEPmoVHRsGanzMj1O4nM | |
but should give you the idea. | |
Mostly stolen from https://developers.google.com/sheets/api/quickstart/python | |
Requires: | |
# For Google Sheets access | |
google-api-python-client | |
google-auth-oauthlib | |
# For parsing to a DataFrame, but can replace with {your-favorite-dataframe-library} | |
polars | |
""" | |
import os.path | |
from typing import List | |
import polars as pl | |
from google.auth.transport.requests import Request | |
from google.oauth2.credentials import Credentials | |
from google_auth_oauthlib.flow import InstalledAppFlow | |
from googleapiclient.discovery import build | |
RANGE_NAME = "Benign sheet!A:C" | |
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"] | |
# This can be found in the spreadsheet's URL | |
SHEET_ID = "1cFVczEL7oznX_G-_VyJv5tgkEPmoVHRsGanzMj1O4nM" | |
def get_google_credentials( | |
oauth_json: str = "credentials.json", | |
token_json: str = "token.json", | |
) -> Credentials: | |
"""Get Google credentials with readonly scope | |
Args: | |
oauth_json (str): OAuth 2.0 Client credentials used to authorize token | |
the first time; one can be created/downloaded from | |
https://console.cloud.google.com/apis/credentials | |
token_json (str): filename for read/writing access and refresh token | |
Returns: | |
Credentials | |
""" | |
creds = None | |
# The token_json file stores the user's access and refresh tokens, and is | |
# created automatically when the authorization flow completes for the first | |
# time. | |
if os.path.exists(token_json): | |
creds = Credentials.from_authorized_user_file(token_json, SCOPES) | |
if not creds or not creds.valid: | |
if creds and creds.expired and creds.refresh_token: | |
creds.refresh(Request()) | |
else: | |
flow = InstalledAppFlow.from_client_secrets_file(oauth_json, SCOPES) | |
creds = flow.run_local_server(port=0) | |
# Save the credentials for the next run | |
with open(token_json, "w") as token: | |
token.write(creds.to_json()) | |
return creds | |
def parse_result_values(values: List[str]) -> pl.DataFrame: | |
"""Parse values from Gsheet, assuming row-major where first row is header | |
This parser can be modified appropriately to parse whatever the expected | |
format of the target spreadsheet is. | |
""" | |
column_names = values[0] | |
rows = values[1:] | |
df = pl.DataFrame(rows, orient="row", schema=column_names) | |
# By default, everything is a string, so need to cast appropriately | |
return df.with_columns(pl.col("value").cast(int)) | |
def main(): | |
creds = get_google_credentials() | |
# Build and call the Sheets API | |
sheet = build("sheets", "v4", credentials=creds).spreadsheets() | |
# This returns a dict with keys: ['range', 'majorDimension', 'values'] | |
# By default, the 'majorDimension' is 'ROW' | |
# (i.e., results['values'] is a list of each row in teh specified range) | |
result = sheet.values().get(spreadsheetId=SHEET_ID, range=RANGE_NAME).execute() | |
values = result["values"] | |
return parse_result_values(values) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment