Created
August 13, 2019 11:44
-
-
Save jitsejan/f9868a341d4412b7448ac4b5a234968c to your computer and use it in GitHub Desktop.
Export a Google sheet to S3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Note: make sure the Google Sheet is published to the web first | |
import boto3 | |
import json | |
import pandas as pd | |
import pyarrow | |
import requests | |
GOOGLE_URL = "https://spreadsheets.google.com/feeds/list/{key}/{worksheet}/public/full?alt={format}" | |
GOOGLE_SHEET_ID = '1234567-abcedf' | |
GOOGLE_SHEET_TITLE = 'My title' | |
def _get_output_path(json_data): | |
return "s3://data/google_sheets/{spreadsheet_title}/{worksheet_title}/data.parquet".format( | |
worksheet_title = _get_worksheet_title(json_data), | |
spreadsheet_title = _get_spreadsheet_title(), | |
) | |
def _clean_title(title): | |
return title.lower().replace(' ', '_') | |
def _get_spreadsheet_title(): | |
return _clean_title(GOOGLE_SHEET_TITLE) | |
def _get_worksheet_title(json_data): | |
return _clean_title(json_data['feed']['title']['$t']) | |
def get_worksheet_dataframe(worksheet_num): | |
try: | |
data = requests.get(GOOGLE_URL.format(key=GOOGLE_SHEET_ID, worksheet=worksheet_num, format='json')).json() | |
except: | |
return pd.DataFrame(), None | |
df = pd.io.json.json_normalize(data['feed']['entry']) | |
df.drop(['content.$t', 'category', 'link'], axis=1, inplace=True) | |
df.columns = [col.replace('.', '_').replace('$', '').replace('_t', '').replace('gsx', '') for col in df.columns] | |
df['spreadsheet_title'] = _get_spreadsheet_title() | |
df['worksheet_title'] = _get_worksheet_title(data) | |
return df, _get_output_path(data) | |
def main(): | |
sheet_id = 1 | |
result = True | |
while result: | |
df, path = get_worksheet_dataframe(sheet_id) | |
if not df.empty: | |
df.to_parquet(path, index=False) | |
sheet_id += 1 | |
else: | |
result = False | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment