Created
November 17, 2021 18:19
-
-
Save GabrielSGoncalves/ff9155246c55ead6d33d1103d51bbad1 to your computer and use it in GitHub Desktop.
Function for reading private files from a Google Cloud Storage bucket
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Union, Dict | |
from io import BytesIO | |
from bson import json_util | |
import pandas as pd | |
import requests | |
from google.cloud import storage | |
from google.oauth2 import service_account | |
def read_file_from_gcloud_storage(
    file_format: str,
    file_name: str,
    gcp_bucket: str,
    gcp_project: str,
    gcp_credentials_file: str,
    **kwargs,
) -> Union[pd.DataFrame, Dict, str]:
    """Read a file from Google Cloud Storage into a Python object.

    Parameters
    ----------
    file_format : str
        One of 'csv', 'xlsx', 'parquet', 'json' or 'txt'.
    file_name : str
        Name (object path) of the target file inside the bucket.
    gcp_bucket : str
        Name of the bucket that holds the file.
    gcp_project : str
        Name of the GCP project used to create the storage client.
    gcp_credentials_file : str
        Path to the service-account JSON key file used to authenticate.
    **kwargs
        Forwarded to the underlying reader: ``pd.read_csv``,
        ``pd.read_excel``, ``pd.read_parquet``, ``json_util.loads`` or
        ``bytes.decode``, depending on ``file_format``.

    Returns
    -------
    Union[pd.DataFrame, Dict, str]
        A DataFrame for 'csv', 'xlsx' and 'parquet'; the parsed document
        for 'json'; the UTF-8 decoded text for 'txt'.

    Raises
    ------
    ValueError
        If ``file_format`` is not one of the supported formats.
    FileNotFoundError
        If ``file_name`` does not exist in ``gcp_bucket``.
    """
    supported_formats = ("csv", "xlsx", "parquet", "json", "txt")
    # Fail fast, before authenticating or downloading anything, instead of
    # silently returning None for an unknown format.
    if file_format not in supported_formats:
        raise ValueError(
            f"Unsupported file_format {file_format!r}; "
            f"expected one of {', '.join(supported_formats)}."
        )

    # Authenticate using the GCP JSON credentials file
    credentials = service_account.Credentials.from_service_account_file(
        gcp_credentials_file
    )
    storage_client = storage.Client(
        project=gcp_project, credentials=credentials
    )

    # Locate the bucket and the target object
    bucket = storage_client.get_bucket(gcp_bucket)
    blob = bucket.get_blob(file_name)
    # get_blob() returns None when the object does not exist; surface that
    # explicitly rather than failing with an AttributeError below.
    if blob is None:
        raise FileNotFoundError(
            f"File {file_name!r} not found in bucket {gcp_bucket!r}."
        )
    # download_as_string() is a deprecated alias of download_as_bytes()
    payload = blob.download_as_bytes()

    # Return the corresponding Python object based on file format
    if file_format == "csv":
        return pd.read_csv(BytesIO(payload), **kwargs)
    if file_format == "xlsx":
        return pd.read_excel(BytesIO(payload), **kwargs)
    if file_format == "parquet":
        return pd.read_parquet(BytesIO(payload), **kwargs)
    if file_format == "json":
        return json_util.loads(payload, **kwargs)
    # Only "txt" remains after the validation above
    return payload.decode("utf-8", **kwargs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment