Skip to content

Instantly share code, notes, and snippets.

@yoki
Last active August 24, 2024 17:59
Show Gist options
  • Save yoki/02a04983914680815d571c5372e28b0d to your computer and use it in GitHub Desktop.
Polars tutorial
import polars as pl
from polars import col

# ---- Creating DataFrames ----
df = pl.DataFrame({
    'A': [1, 2, 3, 4],
    'B': ['a', 'b', 'c', 'd'],
    'C': [1.1, 2.2, 3.3, 4.4]
})

# ---- Basic information ----
df.columns  # list of column names
df.shape    # (n_rows, n_cols)

# ---- Selecting columns / filtering rows ----
df.select([col.A, col.B])   # column projection
df.filter(col.A > 2)        # keep rows matching the predicate

# ---- Sorting ----
print(df.sort(col.A, descending=True))

# ---- Add / rename / drop columns ----
# NOTE: `DataFrame.with_column` was removed in polars 1.x;
# `with_columns` is the current API and accepts one or more expressions.
df = df.with_columns((col.A + col.C).alias('D'))
df = df.rename({'A': 'new_A'})
df = df.drop('B')

# ---- Iteration over rows (slow path; prefer expressions when possible) ----
# After the rename/drop above the frame has columns new_A, C, D —
# referencing the old names 'A'/'B' here would raise a KeyError.
for row in df.iter_rows(named=True):
    print(f"new_A: {row['new_A']}, C: {row['C']}, D: {row['D']}")

# ---- Handling missing values ----
df = df.drop_nulls()    # remove rows containing any null
df = df.fill_null(0)    # or replace nulls with a constant

# ---- CSV ----
df = pl.read_csv('file.csv')
df.write_csv('output.csv')
import polars as pl
from polars import col

# ---- Type conversion ----
df = df.with_columns([
    col.A.cast(pl.Float64),
    col.C.cast(pl.Int64),
    col.D.cast(pl.Utf8),
])

# ---- Update a single value (by row position) ----
# `with_row_count` is deprecated; `with_row_index` is the current name.
df = df.with_row_index('index').with_columns(
    pl.when(col.index == 2)
    .then(pl.lit(100))
    .otherwise(col.A)
    .alias('A')
).drop('index')

# ---- Update based on a condition ----
df = df.with_columns(
    pl.when(col.A > 3)
    .then(col.A * 2)
    .otherwise(col.A)
    .alias('A')
)

# ---- Update using a Python function over several columns ----
# `Expr.apply` was renamed to `Expr.map_elements`; passing `return_dtype`
# avoids schema-inference warnings.
df = df.with_columns(
    pl.struct(['A', 'C'])
    .map_elements(lambda x: x['A'] + x['C'], return_dtype=pl.Float64)
    .alias('sum_A_C')
)

# ---- Update multiple columns ----
df = df.with_columns([
    col.A.map_elements(lambda x: x * 10, return_dtype=pl.Float64).alias('A'),
    col.C.map_elements(lambda x: x + 100, return_dtype=pl.Int64).alias('C'),
])

# ---- Add / rename / drop columns ----
df = df.with_columns((col.A + col.C).alias('D'))
df = df.rename({'A': 'new_A'})
df = df.drop('B')

# ---- Reset index ----
# Polars has no implicit index; materialize one explicitly when needed.
df = df.with_row_index('index')
df = df.drop('index')  # drop the index column if you don't want it
import polars as pl
from polars import col

##### Join and concat #####
# (df1, df2, df3 are assumed to be defined elsewhere.)
joined = df1.join(df2, on='key', how='left')
concat = pl.concat([joined, df3])

##### Pivot: long to wide and wide to long #####
long_df = pl.DataFrame({
    'vname': ['one', 'one', 'two', 'two'] * 3,
    'country': ['A', 'B', 'A', 'B'] * 3,
    'year': ['x1', 'x1', 'x1', 'x1', 'x2', 'x2', 'x2', 'x2', 'x3', 'x3', 'x3', 'x3'],
    'val': range(1, 13)
})
# long -> wide: the `columns=` parameter was renamed to `on=`.
wide_df = long_df.pivot(on='vname', index=['country', 'year'], values='val')
# wide -> long: `melt` was renamed to `unpivot` (`id_vars` -> `index`).
melted = wide_df.unpivot(index=['country', 'year'], variable_name='vname', value_name='val')

##### Groupby and transform #####
# `groupby` was renamed to `group_by`.
# Basic groupby and aggregation.
result = df.group_by('category').agg([
    col.value1.sum().alias('sum_value1'),
    col.value2.mean().alias('mean_value2'),
])
# Multiple aggregations: Polars expressions have no pandas-style
# `.agg(['sum', ...])`; spell each aggregation out explicitly.
result = df.group_by('category').agg([
    col.value1.sum().alias('value1_sum'),
    col.value1.mean().alias('value1_mean'),
    col.value1.max().alias('value1_max'),
    col.value2.min().alias('value2_min'),
    col.value2.median().alias('value2_median'),
])
# Custom aggregation: a pure expression is faster and clearer than
# applying a Python lambda to each group.
result = df.group_by('category').agg([
    (col.value1.sum() / col.value1.count()).alias('custom_agg')
])
# Transform (aggregate over groups but keep one value per original row).
df = df.with_columns(
    col.value1.mean().over('category').alias('category_mean')
)
import polars as pl
from polars import col

# ---- Sample panel data (two entities, four dates each) ----
df = pl.DataFrame({
    'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'] * 2,
    'id': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'value': [100, 101, 103, 106, 200, 202, 205, 210]
})

# Convert the string column to pl.Date.
# `fmt=` was renamed to `format=` in `str.strptime`.
df = df.with_columns(col.date.str.strptime(pl.Date, format='%Y-%m-%d'))

# Polars has no index; sort so window/shift operations see rows in order.
df = df.sort(['id', 'date'])

# Resample: one row per (id, date) with the mean value.
daily = df.group_by(['id', 'date']).agg([
    col.value.mean().alias('daily_avg')
])

# ---- Per-group (panel) derived columns ----
# All expressions are independent of each other, so a single
# `with_columns` call computes them in one pass.
df = df.with_columns([
    col.value.shift(1).over('id').alias('lag_value'),
    col.value.diff().over('id').alias('diff_value'),
    col.value.rolling_mean(window_size=2).over('id').alias('rolling_avg'),
    (col.value / col.value.shift(1).over('id') - 1).alias('pct_change'),
    # shift(365) assumes daily frequency; yields null until a year of data exists
    (col.value / col.value.shift(365).over('id') - 1).alias('yoy_growth'),
    col.value.cum_sum().over('id').alias('cumulative_sum'),
])

# ---- Time-based operations ----
# A dataset with more varied dates.
varied_df = pl.DataFrame({
    'date': ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15',
             '2023-03-01', '2023-03-15', '2023-04-01', '2023-04-15'],
    'value': [100, 110, 120, 115, 125, 130, 140, 145]
})
varied_df = varied_df.with_columns(col.date.str.strptime(pl.Date, format='%Y-%m-%d'))

# Resample to monthly frequency.
# `groupby_dynamic` was renamed to `group_by_dynamic`; the index column
# must be sorted, so sort explicitly rather than relying on input order.
monthly = varied_df.sort('date').group_by_dynamic('date', every='1mo').agg([
    col.value.mean().alias('monthly_avg')
])

# Month-over-month growth rate.
monthly = monthly.with_columns(
    (col.monthly_avg / col.monthly_avg.shift(1) - 1).alias('mom_growth')
)
print(monthly)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment