Created
September 11, 2022 06:13
-
-
Save alexkyllo/ff70068736d2b878698d770878a9f118 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Example of how to query parquet datasets from cloud storage accounts with Polars.""" | |
import adlfs | |
import polars as pl | |
from polars import col | |
from pyarrow import dataset as ds | |
# Create a filesystem representing an Azure Blob Storage account | |
fs = adlfs.AzureBlobFileSystem( | |
account_name="azureopendatastorage", sas_token="", container_name="mlsamples" | |
) | |
# Register it as a pyarrow dataset | |
dt = ds.dataset("mlsamples/diabetes", filesystem=fs) | |
# Create a polars LazyFrame from the dataset | |
di = pl.scan_ds(dt) | |
# Build a query on the dataset and show the execution plan | |
query = di.with_column( | |
((col("Y") - col("Y").mean().over("SEX")) / col("Y").std().over("SEX")).alias("Y_ZSCORE_SEX") | |
) | |
query | |
# Run the query and return the result as a polars DataFrame in memory (i.e. Arrow) | |
result = query.collect() | |
result |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment