Last active
August 24, 2024 17:59
-
-
Save yoki/02a04983914680815d571c5372e28b0d to your computer and use it in GitHub Desktop.
Polars tutorial
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import polars as pl | |
from polars import col | |
# Creating DataFrames | |
df = pl.DataFrame({ | |
'A': [1, 2, 3, 4], | |
'B': ['a', 'b', 'c', 'd'], | |
'C': [1.1, 2.2, 3.3, 4.4] | |
}) | |
# Basic information | |
df.columns | |
df.shape | |
# Selecting columns/rows (filter) | |
df.select([col.A, col.B]) | |
df.filter(col.A > 2) | |
# Sorting | |
print(df.sort(col.A, descending=True)) | |
# add/rename/drop columns | |
df = df.with_column((col.A + col.C).alias('D')) | |
df = df.rename({'A': 'new_A'}) | |
df = df.drop('B') | |
# Iteration over rows | |
for row in df.iter_rows(named=True): | |
print(f"A: {row['A']}, B: {row['B']}, C: {row['C']}, D: {row['D']}") | |
# Handling missing values | |
df = df.drop_nulls() | |
df = df.fill_null(0) | |
# CSV | |
df = pl.read_csv('file.csv') | |
df.write_csv('output.csv') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import polars as pl | |
from polars import col | |
# Type conversion | |
df = df.with_columns([ | |
col.A.cast(pl.Float64), | |
col.C.cast(pl.Int64), | |
col.D.cast(pl.Utf8) | |
]) | |
# Update a single value | |
df = df.with_row_count('index').with_column( | |
pl.when(col.index == 2) | |
.then(pl.lit(100)) | |
.otherwise(col.A) | |
.alias('A') | |
).drop('index') | |
# Update based on a condition | |
df = df.with_column( | |
pl.when(col.A > 3) | |
.then(col.A * 2) | |
.otherwise(col.A) | |
.alias('A') | |
) | |
# Update using lambda function | |
df = df.with_column( | |
pl.struct(['A', 'C']).apply(lambda x: x['A'] + x['C']).alias('sum_A_C') | |
) | |
# Update multiple columns | |
df = df.with_columns([ | |
col.A.map_elements(lambda x: x * 10).alias('A'), | |
col.C.map_elements(lambda x: x + 100).alias('C') | |
]) | |
# add/rename/drop columns | |
df = df.with_column((col.A + col.C).alias('D')) | |
df = df.rename({'A': 'new_A'}) | |
df = df.drop('B') | |
# Reset index | |
df = df.with_row_count('index') | |
# Drop the index column if you don't want it | |
df = df.drop('index') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import polars as pl | |
from polars import col | |
##### Join and concat ##### | |
joined = df1.join(df2, on='key', how='left') | |
concat = pl.concat([joined, df3]) | |
##### Pivot from long to wide and wide to long ##### | |
long_df = pl.DataFrame({ | |
'vname': ['one', 'one', 'two', 'two'] * 3, | |
'country': ['A', 'B', 'A', 'B'] * 3, | |
'year': ['x1', 'x1', 'x1', 'x1', 'x2', 'x2', 'x2', 'x2', 'x3', 'x3', 'x3', 'x3'], | |
'val': range(1, 13) | |
}) | |
wide_df = long_df.pivot(index=['country', 'year'], columns='vname', values='val') | |
# wide to long | |
melted = wide_df.melt(id_vars=['country', 'year'], variable_name='vname', value_name='val') | |
##### Groupby and Transform ##### | |
# Basic groupby and aggregation | |
result = df.groupby('category').agg([ | |
col.value1.sum().alias('sum_value1'), | |
col.value2.mean().alias('mean_value2') | |
]) | |
# Multiple aggregations | |
result = df.groupby('category').agg([ | |
col.value1.agg(['sum', 'mean', 'max']), | |
col.value2.agg(['min', 'median']) | |
]) | |
# Custom aggregation | |
result = df.groupby('category').agg([ | |
col.value1.apply(lambda x: x.sum() / x.count()).alias('custom_agg') | |
]) | |
# Transform (aggregate but keep same rows) | |
df = df.with_column( | |
col.value1.mean().over('category').alias('category_mean') | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import polars as pl | |
from polars import col | |
# Create a sample DataFrame with panel data | |
df = pl.DataFrame({ | |
'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'] * 2, | |
'id': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'], | |
'value': [100, 101, 103, 106, 200, 202, 205, 210] | |
}) | |
# Convert to datetime | |
df = df.with_column(col.date.str.strptime(pl.Date, fmt='%Y-%m-%d')) | |
# Set date as index | |
df = df.sort(['id', 'date']) | |
# Resample | |
daily = df.groupby(['id', 'date']).agg([ | |
col.value.mean().alias('daily_avg') | |
]) | |
# Panel various columns | |
df = ( | |
df.with_column(col.value.shift(1).over("id").alias("lag_value")) | |
.with_column(col.value.diff().over("id").alias("diff_value")) | |
.with_column(col.value.rolling_mean(window_size=2).over("id").alias("rolling_avg")) | |
.with_column((col.value / col.value.shift(1).over("id") - 1).alias("pct_change")) | |
.with_column((col.value / col.value.shift(365).over("id") - 1).alias("yoy_growth")) | |
.with_column(col.value.cum_sum().over("id").alias("cumulative_sum")) | |
) | |
# Time-based operations | |
# Assuming we have more varied dates in our dataset | |
varied_df = pl.DataFrame({ | |
'date': ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15', | |
'2023-03-01', '2023-03-15', '2023-04-01', '2023-04-15'], | |
'value': [100, 110, 120, 115, 125, 130, 140, 145] | |
}) | |
varied_df = varied_df.with_column(col.date.str.strptime(pl.Date, fmt='%Y-%m-%d')) | |
# Resample to monthly frequency | |
monthly = varied_df.groupby_dynamic('date', every='1mo').agg([ | |
col.value.mean().alias('monthly_avg') | |
]) | |
# Calculate month-to-month growth rate | |
monthly = monthly.with_column( | |
(col.monthly_avg / col.monthly_avg.shift(1) - 1).alias('mom_growth') | |
) | |
print(monthly) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment