Skip to content

Instantly share code, notes, and snippets.

@jitsejan
Created August 13, 2019 11:44
Show Gist options
  • Save jitsejan/f9868a341d4412b7448ac4b5a234968c to your computer and use it in GitHub Desktop.
Save jitsejan/f9868a341d4412b7448ac4b5a234968c to your computer and use it in GitHub Desktop.
Export a Google sheet to S3
# Note: make sure the Google Sheet is published to the web first
import boto3
import json
import pandas as pd
import pyarrow
import requests
GOOGLE_URL = "https://spreadsheets.google.com/feeds/list/{key}/{worksheet}/public/full?alt={format}"
GOOGLE_SHEET_ID = '1234567-abcedf'
GOOGLE_SHEET_TITLE = 'My title'
def _get_output_path(json_data):
return "s3://data/google_sheets/{spreadsheet_title}/{worksheet_title}/data.parquet".format(
worksheet_title = _get_worksheet_title(json_data),
spreadsheet_title = _get_spreadsheet_title(),
)
def _clean_title(title):
return title.lower().replace(' ', '_')
def _get_spreadsheet_title():
return _clean_title(GOOGLE_SHEET_TITLE)
def _get_worksheet_title(json_data):
return _clean_title(json_data['feed']['title']['$t'])
def get_worksheet_dataframe(worksheet_num):
try:
data = requests.get(GOOGLE_URL.format(key=GOOGLE_SHEET_ID, worksheet=worksheet_num, format='json')).json()
except:
return pd.DataFrame(), None
df = pd.io.json.json_normalize(data['feed']['entry'])
df.drop(['content.$t', 'category', 'link'], axis=1, inplace=True)
df.columns = [col.replace('.', '_').replace('$', '').replace('_t', '').replace('gsx', '') for col in df.columns]
df['spreadsheet_title'] = _get_spreadsheet_title()
df['worksheet_title'] = _get_worksheet_title(data)
return df, _get_output_path(data)
def main():
sheet_id = 1
result = True
while result:
df, path = get_worksheet_dataframe(sheet_id)
if not df.empty:
df.to_parquet(path, index=False)
sheet_id += 1
else:
result = False
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment