# Polars snippets and cheatsheet

Drop columns that are entirely null, similar to df.dropna(axis=1, how="all") in pandas

df = df[[s.name for s in df if not (s.null_count() == df.height)]]
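
A quick self-contained check of the snippet above (toy data, made-up column names):

```
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [None, None, None]})  # "b" is entirely null
df = df[[s.name for s in df if not (s.null_count() == df.height)]]
print(df.columns)  # ['a']
```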

Select all columns plus a new aggregated one (roughly SQL: SELECT *, AVG(price) OVER () AS mean_price FROM table):

df.select([
    pl.all(),
    pl.col("price").mean().alias("mean_price")
  ])

Note that selecting only the aggregation produces shape (1, 1):

df.select([
  pl.col("price").mean()
  ])
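
Toy illustration of the two selects above, with a made-up price column:

```
import polars as pl

df = pl.DataFrame({"item": ["a", "b"], "price": [1.0, 3.0]})

# mean is broadcast next to every row -> shape (2, 3)
print(df.select([pl.all(), pl.col("price").mean().alias("mean_price")]).shape)

# aggregation only -> shape (1, 1)
print(df.select([pl.col("price").mean()]).shape)
```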

GroupBy

Group by categorical columns and convert the counts to wide format (e.g. the aggregated frame has columns ["account", "product", "count"])

agg_df = (df
           .group_by(["account", "product"])
           .agg(pl.count())
           .sort(by="account")
)

agg_df.head()

agg_counts_wide = agg_df.pivot(values="count", index="account", columns="product").fill_null(0)
agg_counts_wide
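
Toy end-to-end run of the group-by + pivot above; the values are invented and the comments note the newer Polars spellings:

```
import polars as pl

df = pl.DataFrame({
    "account": ["a1", "a1", "a1", "a2"],
    "product": ["x", "x", "y", "x"],
})

agg_df = (df
          .group_by(["account", "product"])
          .agg(pl.len().alias("count"))  # pl.count() in older Polars
          .sort(by="account")
)

# recent Polars names the pivot argument `on` instead of `columns`
agg_counts_wide = agg_df.pivot(values="count", index="account", on="product").fill_null(0)
print(agg_counts_wide)  # shape (2, 3): account | x | y
```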

SELECT *, CASE WHEN <> THEN <> ELSE <> END ...

df_user_brand_event = df_user_brand_event.with_columns(
    pl.when(pl.col("pct_buy_views").is_infinite())
      .then(None)
      .otherwise(pl.col("pct_buy_views"))
      .alias("pct_buy_views")  # keep the original column name so it is overwritten in place
)
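
Minimal runnable version of the same pattern on a made-up frame, replacing infinities with nulls:

```
import polars as pl

df = pl.DataFrame({"pct_buy_views": [0.5, float("inf"), 1.0]})
df = df.with_columns(
    pl.when(pl.col("pct_buy_views").is_infinite())
      .then(None)
      .otherwise(pl.col("pct_buy_views"))
      .alias("pct_buy_views")
)
print(df["pct_buy_views"].to_list())  # [0.5, None, 1.0]
```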

Datetime

Convert a datetime string column to the Datetime dtype

https://pola-rs.github.io/polars-book/user-guide/howcani/timeseries/parsing_dates_times.html


df = df.with_columns(  # older Polars: with_column / fmt=
    pl.col("event_time").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S %Z")
)
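
Tiny sketch with a made-up column; the timezone token is dropped here for simplicity:

```
import polars as pl

df = pl.DataFrame({"event_time": ["2024-01-01 10:30:00", "2024-01-02 08:15:00"]})
df = df.with_columns(
    pl.col("event_time").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S")
)
print(df.schema)  # event_time is now pl.Datetime
```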

Read multiple parquet files

df = (pl.scan_parquet(data_path / "*.parquet")  # data_path is a pathlib.Path; glob patterns are expanded
      .with_columns(
        pl.col("datetime").dt.date()
        )
      .group_by(["datetime", "category"])
      .agg(
        pl.count()
      )
      .sort(by=["category", "count"])
      #.collect(streaming=True) # handle large datasets
     )
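
Self-contained sketch of the glob scan: write two tiny parquet files to a temp dir first (file and column names are made up; pl.len() is the newer spelling of pl.count()):

```
import tempfile
from datetime import datetime
from pathlib import Path

import polars as pl

data_path = Path(tempfile.mkdtemp())
pl.DataFrame({
    "datetime": [datetime(2024, 1, 1, 10), datetime(2024, 1, 1, 12)],
    "category": ["a", "b"],
}).write_parquet(data_path / "part1.parquet")
pl.DataFrame({
    "datetime": [datetime(2024, 1, 2, 9)],
    "category": ["a"],
}).write_parquet(data_path / "part2.parquet")

df = (pl.scan_parquet(data_path / "*.parquet")
      .with_columns(pl.col("datetime").dt.date())
      .group_by(["datetime", "category"])
      .agg(pl.len().alias("count"))
      .sort(by=["category", "count"])
      .collect())
print(df)
```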

Read partitioned parquet data from S3

Note: no s3:// prefix is needed in the path

import polars as pl
import pyarrow as pa
import pyarrow.dataset as pads
from pyarrow.dataset import HivePartitioning
from s3fs import S3FileSystem


# dataset root; partitioned files live under e.g. my_bucket/prefix/partition=value1/filename.parquet
PATH_DATA_S3 = "my_bucket/prefix/"

AWS_PROFILE = "default"
cloudfs = S3FileSystem(profile=AWS_PROFILE)
partitioning = pads.HivePartitioning.discover()


my_dataset = pads.dataset(PATH_DATA_S3, filesystem=cloudfs, partitioning=partitioning)


my_data = (pl.scan_pyarrow_dataset(my_dataset)
           .filter(pl.col("type") == "this_type")
           .collect())

Alternatively, specify the partitioning schema explicitly:

partitioning = HivePartitioning(pa.schema([ ("year", pa.int32()),
                                            ("month", pa.int32()),
                                            ("type", pa.string())]))
# Cheatsheet
https://franzdiebold.github.io/polars-cheat-sheet/Polars_cheat_sheet.pdf
# PyData by datenzauberai
https://github.com/datenzauberai/PyConDE-2023--Polars-make-the-switch/blob/main/Polars%20-%20make%20the%20switch%20to%20lightning-fast%20dataframes%20-%20Versand.pdf
# More examples
https://github.com/martinbel/polars-tutorial/tree/master
```
# select/slice columns
select
# create/transform/assign columns
with_columns
# filter/slice/query rows
filter
# join/merge another dataframe
join
# group dataframe rows
group_by
# aggregate groups
agg
# sort dataframe
sort
```
```
articles = pl.read_parquet("articles.parquet")
sales = pl.read_parquet("sales.parquet")

monthly_best_sellers_2019 = (
    sales
    .with_columns([
        pl.col("date").dt.year().alias("year"),
        pl.col("date").dt.month().alias("month"),
    ])
    .filter(pl.col("year") == 2019)
    .join(articles, on="article_id")
    .group_by(["product_code", "month"])
    .agg(pl.col("price").sum().alias("total_sales"))
    .filter(
        pl.col("total_sales") == pl.col("total_sales").max().over("month")
    )
    .select(["month", "product_code"])
    .sort("month")
)
```