GabrielSGoncalves · November 17, 2021 18:19
diff --git a/read_file_from_gcloud_storage.py b/read_file_from_gcloud_storage.py
 from typing import Union, Dict
 from io import BytesIO
 from bson import json_util
 import pandas as pd
 import requests
 from google.cloud import storage
 from google.oauth2 import service_account


 def read_file_from_gcloud_storage(
     file_format: str,
     file_name: str,
     gcp_bucket: str,
     gcp_project: str,
     gcp_credentials_file: str,
     **kwargs,
 ) -> Union[pd.DataFrame, Dict, str]:
     """Read a file from Google Cloud Storage into a Python object.

     Parameters
     ----------
     file_format : str
         File format; one of 'csv', 'xlsx', 'parquet', 'json' or 'txt'.

     file_name : str
         Name of the target file (object) inside the bucket.

     gcp_bucket : str
         Name of the bucket.

     gcp_project : str
         Name of the project in GCP.

     gcp_credentials_file : str
         Path to the service-account credentials file used to authenticate.

     **kwargs
         Extra keyword arguments forwarded to the reader selected by
         ``file_format``: ``pd.read_csv``, ``pd.read_excel``,
         ``pd.read_parquet``, ``json_util.loads`` or ``bytes.decode``.

     Returns
     -------
     Union[pd.DataFrame, Dict, str]
         A DataFrame for 'csv', 'xlsx' and 'parquet', the parsed JSON
         object for 'json', or the UTF-8 decoded text for 'txt'.

     Raises
     ------
     ValueError
         If ``file_format`` is not one of the supported formats.
     FileNotFoundError
         If ``file_name`` does not exist in ``gcp_bucket``.

     """
     # Reject unknown formats up front so a typo fails loudly and cheaply
     # instead of authenticating, downloading and then returning None.
     supported_formats = ("csv", "xlsx", "parquet", "json", "txt")
     if file_format not in supported_formats:
         raise ValueError(
             f"Unsupported file_format {file_format!r}; "
             f"expected one of {supported_formats}"
         )

     # Authenticate using the GCP JSON credentials file
     credentials = service_account.Credentials.from_service_account_file(
         gcp_credentials_file
     )
     storage_client = storage.Client(
         project=gcp_project, credentials=credentials
     )

     # Locate the object; get_blob is documented to return None when the
     # object is missing, so turn that into an explicit error.
     bucket = storage_client.get_bucket(gcp_bucket)
     blob = bucket.get_blob(file_name)
     if blob is None:
         raise FileNotFoundError(
             f"File {file_name!r} not found in bucket {gcp_bucket!r}"
         )
     # NOTE(review): newer google-cloud-storage releases deprecate
     # download_as_string in favour of download_as_bytes; kept as-is so
     # older library versions keep working.
     binary_stream = blob.download_as_string()

     # Return the corresponding Python object based on file format
     if file_format == "csv":
         return pd.read_csv(BytesIO(binary_stream), **kwargs)

     if file_format == "xlsx":
         return pd.read_excel(BytesIO(binary_stream), **kwargs)

     if file_format == "parquet":
         return pd.read_parquet(BytesIO(binary_stream), **kwargs)

     if file_format == "json":
         return json_util.loads(binary_stream, **kwargs)

     # Only 'txt' remains after the validation above.
     return binary_stream.decode("utf-8", **kwargs)
	from typing import Union, Dict
	from io import BytesIO
	from bson import json_util
	import pandas as pd
	import requests
	from google.cloud import storage
	from google.oauth2 import service_account


	def read_file_from_gcloud_storage(
	    file_format: str,
	    file_name: str,
	    gcp_bucket: str,
	    gcp_project: str,
	    gcp_credentials_file: str,
	    **kwargs,
	) -> Union[pd.DataFrame, Dict, str]:
	    """Read a file from Google Cloud Storage into a Python object.

	    Parameters
	    ----------
	    file_format : str
	        File format; one of 'csv', 'xlsx', 'parquet', 'json' or 'txt'.

	    file_name : str
	        Name of the target file (object) inside the bucket.

	    gcp_bucket : str
	        Name of the bucket.

	    gcp_project : str
	        Name of the project in GCP.

	    gcp_credentials_file : str
	        Path to the service-account credentials file used to authenticate.

	    **kwargs
	        Extra keyword arguments forwarded to the reader selected by
	        ``file_format``: ``pd.read_csv``, ``pd.read_excel``,
	        ``pd.read_parquet``, ``json_util.loads`` or ``bytes.decode``.

	    Returns
	    -------
	    Union[pd.DataFrame, Dict, str]
	        A DataFrame for 'csv', 'xlsx' and 'parquet', the parsed JSON
	        object for 'json', or the UTF-8 decoded text for 'txt'.

	    Raises
	    ------
	    ValueError
	        If ``file_format`` is not one of the supported formats.
	    FileNotFoundError
	        If ``file_name`` does not exist in ``gcp_bucket``.

	    """
	    # Reject unknown formats up front so a typo fails loudly and cheaply
	    # instead of authenticating, downloading and then returning None.
	    supported_formats = ("csv", "xlsx", "parquet", "json", "txt")
	    if file_format not in supported_formats:
	        raise ValueError(
	            f"Unsupported file_format {file_format!r}; "
	            f"expected one of {supported_formats}"
	        )

	    # Authenticate using the GCP JSON credentials file
	    credentials = service_account.Credentials.from_service_account_file(
	        gcp_credentials_file
	    )
	    storage_client = storage.Client(
	        project=gcp_project, credentials=credentials
	    )

	    # Locate the object; get_blob is documented to return None when the
	    # object is missing, so turn that into an explicit error.
	    bucket = storage_client.get_bucket(gcp_bucket)
	    blob = bucket.get_blob(file_name)
	    if blob is None:
	        raise FileNotFoundError(
	            f"File {file_name!r} not found in bucket {gcp_bucket!r}"
	        )
	    # NOTE(review): newer google-cloud-storage releases deprecate
	    # download_as_string in favour of download_as_bytes; kept as-is so
	    # older library versions keep working.
	    binary_stream = blob.download_as_string()

	    # Return the corresponding Python object based on file format
	    if file_format == "csv":
	        return pd.read_csv(BytesIO(binary_stream), **kwargs)

	    if file_format == "xlsx":
	        return pd.read_excel(BytesIO(binary_stream), **kwargs)

	    if file_format == "parquet":
	        return pd.read_parquet(BytesIO(binary_stream), **kwargs)

	    if file_format == "json":
	        return json_util.loads(binary_stream, **kwargs)

	    # Only 'txt' remains after the validation above.
	    return binary_stream.decode("utf-8", **kwargs)