lpillmann · November 16, 2023 05:52 · lpillmann · Jul 21, 2021 · uchiiii · Aug 11, 2021
diff --git a/read_parquet.py b/read_parquet.py
 import gcsfs
 import pyarrow

 def read_parquet(gs_directory_path, to_pandas=True):
    """
    Reads multiple (partitioned) parquet files from a GS directory
    e.g. 'gs://<bucket>/<directory>' (without ending /)
    """
    gs = gcsfs.GCSFileSystem()
    arrow_df = pyarrow.parquet.ParquetDataset(gs_directory_path, filesystem=gs)
    if to_pandas:
        return arrow_df.read_pandas().to_pandas()
    return arrow_df
	import gcsfs
	import pyarrow

	def read_parquet(gs_directory_path, to_pandas=True):
	"""
	Reads multiple (partitioned) parquet files from a GS directory
	e.g. 'gs://<bucket>/<directory>' (without ending /)
	"""
	gs = gcsfs.GCSFileSystem()
	arrow_df = pyarrow.parquet.ParquetDataset(gs_directory_path, filesystem=gs)
	if to_pandas:
	return arrow_df.read_pandas().to_pandas()
	return arrow_df