Skip to content

Instantly share code, notes, and snippets.

@mh0w
Last active November 9, 2023 16:40
Show Gist options
  • Save mh0w/db82fee4817fd94736399351bd213656 to your computer and use it in GitHub Desktop.
Save mh0w/db82fee4817fd94736399351bd213656 to your computer and use it in GitHub Desktop.
intro_to_polars
"""
Sources:
https://pola-rs.github.io/polars/getting-started/intro
https://kevinheavey.github.io/modern-polars/
.../DAP_CATS/intro_to_polars/-/tree/main?ref_type=heads
"""
import polars as pl
from datetime import date, datetime
import numpy as np
# --- Series basics ----------------------------------------------------------
# A Series is a single named column; this one holds the integers 1..5.
s = pl.Series("a", [1, 2, 3, 4, 5])
print(s)
print(s.min())
print(s.max())

# String methods live under the `.str` namespace and only work on string
# data. Calling them on the integer Series above raises an error, so the
# replacement is demonstrated on a dedicated text Series instead.
s_text = pl.Series("a", ["polar", "bear", "arctic", "polar fox", "polar bear"])
s2 = s_text.str.replace("polar", "pola")
print(s2)

# Temporal methods live under the `.dt` namespace. `eager=True` makes
# `date_range` return a Series directly; `s` is rebound to that date Series.
start = date(2001, 1, 1)
stop = date(2001, 1, 9)
s = pl.date_range(start, stop, interval="2d", eager=True)
print(s.dt.day())
# --- DataFrame basics -------------------------------------------------------
# Build a five-row frame with one integer, one datetime and one float column.
df = pl.DataFrame(
    {
        "my_integer_var": list(range(1, 6)),
        "my_date_var": [datetime(2022, 1, day) for day in range(1, 6)],
        "my_float_var": [4.0, 5.0, 6.0, 7.0, 8.0],
    }
)

# The whole frame, then its first three and last three rows.
print(df)
print(df.head(3))
print(df.tail(3))

# Two rows drawn at random (no seed is set, so the rows vary between runs).
print(df.sample(2))

# Summary statistics for each column.
print(df.describe())
# --- Reading and writing files ----------------------------------------------
# Each format is written to disk and read straight back so the round trip can
# be inspected.
# NOTE: the paths are hard-coded to one user's Windows temp folder; adjust
# them before running this on another machine.
csv_path = "C:/Users/hawkem/temp/output.csv"
json_path = "C:/Users/hawkem/temp/output.json"
parquet_path = "C:/Users/hawkem/temp/output.parquet"

# CSV: try_parse_dates=True asks the reader to try parsing date-like columns.
df.write_csv(csv_path)
df_csv = pl.read_csv(csv_path, try_parse_dates=True)
print(df_csv)

# JSON
df.write_json(json_path)
df_json = pl.read_json(json_path)
print(df_json)

# Parquet
df.write_parquet(parquet_path)
df_parquet = pl.read_parquet(parquet_path)
print(df_parquet)
#######################################################
# Expressions; select, filter, with_columns, group_by #
#######################################################
# select: name the frame to read from, then the columns wanted from it.
every_column = pl.col("*")
df.select(every_column)

date_and_float = pl.col("my_date_var", "my_float_var")
df.select(date_and_float).limit(3)

all_but_date = pl.exclude("my_date_var")
df.select(all_but_date)

# filter: keep only the rows for which the predicate is true.
in_date_window = pl.col("my_date_var").is_between(
    datetime(2022, 1, 2),
    datetime(2022, 1, 4),
)
df.filter(in_date_window)

integer_at_most_3 = pl.col("my_integer_var") <= 3
float_is_a_number = pl.col("my_float_var").is_not_nan()
df.filter(integer_at_most_3 & float_is_a_number)

# with_columns: add new columns alongside the existing ones.
integer_total = pl.col("my_integer_var").sum().alias("new_col_a")
float_plus_42 = (pl.col("my_float_var") + 42).alias("fl+42")
df.with_columns(integer_total, float_plus_42)
# group_by: split the rows by a key column, then aggregate within each group.
df2 = pl.DataFrame(
    {
        "x": np.arange(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

per_group_count = pl.col("*").count().alias("count")
per_group_sum = pl.col("*").sum().alias("sum")

# maintain_order=True keeps the groups in the order they appear in the input.
df2.group_by("y", maintain_order=True).agg(per_group_count, per_group_sum)
# Contexts can be chained: first add the product of the integer and float
# columns, then select everything except the date column.
int_times_float = pl.col("my_integer_var") * pl.col("my_float_var")
everything_but_date = pl.all().exclude(["my_date_var"])
df.with_columns(
    int_times_float.alias("my_integer_var * my_float_var")
).select(everything_but_date)
####
# Combining dataframes
####
# NOTE: `df` and `df2` are rebound here to fresh frames used by the
# join/hstack examples that follow.
df = pl.DataFrame(
    {
        "a": np.arange(0, 8),
        # Random floats, so the values differ on every run.
        "b": np.random.rand(8),
        # Use the lowercase `np.nan`: the `np.NaN` alias was removed in
        # NumPy 2.0, while `np.nan` works on every NumPy version.
        # The column mixes NaN (a float value) with None (a missing value).
        "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None],
    }
)
df2 = pl.DataFrame(
    {
        "x": np.arange(0, 8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
        # Counts down from 8 to 1.
        "z": np.arange(8, 0, -1),
    }
)
# Merge/join: pair up rows where column "a" of df equals column "x" of df2.
# An inner join is the default; it is spelled out here for clarity.
df.join(
    df2,
    left_on="a",
    right_on="x",
    how="inner",
)

# Concatenate horizontally (see also vstack for concatenating vertically).
df.hstack(df2)

##################
# Other commands #
##################
# The data type of each column.
df.dtypes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment