Skip to content

Instantly share code, notes, and snippets.

@alexkyllo
Created September 11, 2022 06:13
Show Gist options
  • Save alexkyllo/ff70068736d2b878698d770878a9f118 to your computer and use it in GitHub Desktop.
Save alexkyllo/ff70068736d2b878698d770878a9f118 to your computer and use it in GitHub Desktop.
"""Example of how to query parquet datasets from cloud storage accounts with Polars."""
import adlfs
import polars as pl
from polars import col
from pyarrow import dataset as ds
# Create a filesystem representing an Azure Blob Storage account.
# NOTE(review): the empty SAS token is kept from the original example; the
# account is presumably publicly readable -- confirm before reusing elsewhere.
fs = adlfs.AzureBlobFileSystem(
    account_name="azureopendatastorage",
    sas_token="",
    container_name="mlsamples",
)

# Register the remote "mlsamples/diabetes" path as a pyarrow dataset that is
# read through the filesystem created above.
dt = ds.dataset("mlsamples/diabetes", filesystem=fs)

# Create a polars LazyFrame from the dataset.
# `pl.scan_ds` was renamed to `pl.scan_pyarrow_dataset` and later removed, so
# prefer the new name and only fall back to the old one on older Polars
# releases that do not have it yet.
_scan_dataset = getattr(pl, "scan_pyarrow_dataset", None) or pl.scan_ds
di = _scan_dataset(dt)

# Build a query on the dataset: add a column "Y_ZSCORE_SEX" holding the z-score
# of Y computed within each SEX group, i.e. (Y - mean(Y)) / std(Y) per group.
# `with_columns` is used because the singular `with_column` has been removed
# from current Polars; `with_columns` also accepts a single expression.
query = di.with_columns(
    ((col("Y") - col("Y").mean().over("SEX")) / col("Y").std().over("SEX")).alias(
        "Y_ZSCORE_SEX"
    )
)
# Bare expression: shows the lazy query (execution plan) when evaluated in a
# notebook/REPL; it has no visible effect when run as a plain script.
query

# Run the query and return the result as a polars DataFrame in memory (i.e. Arrow).
result = query.collect()
# Bare expression: displays the resulting DataFrame in a notebook/REPL.
result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment