Skip to content

Instantly share code, notes, and snippets.

@yoki
Last active August 24, 2024 17:59
Show Gist options
  • Save yoki/02a04983914680815d571c5372e28b0d to your computer and use it in GitHub Desktop.
Polars tutorial
import polars as pl
from polars import col

# ---- Creating DataFrames ----
df = pl.DataFrame({
    'A': [1, 2, 3, 4],
    'B': ['a', 'b', 'c', 'd'],
    'C': [1.1, 2.2, 3.3, 4.4]
})

# ---- Basic information ----
df.columns  # list of column names
df.shape    # (n_rows, n_cols)

# ---- Selecting columns / filtering rows ----
df.select([col.A, col.B])   # column projection
df.filter(col.A > 2)        # keep rows matching the predicate

# ---- Sorting ----
print(df.sort(col.A, descending=True))

# ---- Add / rename / drop columns ----
# NOTE: `DataFrame.with_column` was removed in polars 1.x;
# `with_columns` is the current API and accepts one or more expressions.
df = df.with_columns((col.A + col.C).alias('D'))
df = df.rename({'A': 'new_A'})
df = df.drop('B')

# ---- Iteration over rows (slow path; prefer expressions when possible) ----
# After the rename/drop above the frame has columns new_A, C, D —
# referencing the old names 'A'/'B' here would raise a KeyError.
for row in df.iter_rows(named=True):
    print(f"new_A: {row['new_A']}, C: {row['C']}, D: {row['D']}")

# ---- Handling missing values ----
df = df.drop_nulls()    # remove rows containing any null
df = df.fill_null(0)    # or replace nulls with a constant

# ---- CSV ----
df = pl.read_csv('file.csv')
df.write_csv('output.csv')
import polars as pl
from polars import col

# ---- Type conversion ----
df = df.with_columns([
    col.A.cast(pl.Float64),
    col.C.cast(pl.Int64),
    col.D.cast(pl.Utf8),
])

# ---- Update a single value (by row position) ----
# `with_row_count` is deprecated; `with_row_index` is the current name.
df = df.with_row_index('index').with_columns(
    pl.when(col.index == 2)
    .then(pl.lit(100))
    .otherwise(col.A)
    .alias('A')
).drop('index')

# ---- Update based on a condition ----
df = df.with_columns(
    pl.when(col.A > 3)
    .then(col.A * 2)
    .otherwise(col.A)
    .alias('A')
)

# ---- Update using a Python function over several columns ----
# `Expr.apply` was renamed to `Expr.map_elements`; passing `return_dtype`
# avoids schema-inference warnings.
df = df.with_columns(
    pl.struct(['A', 'C'])
    .map_elements(lambda x: x['A'] + x['C'], return_dtype=pl.Float64)
    .alias('sum_A_C')
)

# ---- Update multiple columns ----
df = df.with_columns([
    col.A.map_elements(lambda x: x * 10, return_dtype=pl.Float64).alias('A'),
    col.C.map_elements(lambda x: x + 100, return_dtype=pl.Int64).alias('C'),
])

# ---- Add / rename / drop columns ----
df = df.with_columns((col.A + col.C).alias('D'))
df = df.rename({'A': 'new_A'})
df = df.drop('B')

# ---- Reset index ----
# Polars has no implicit index; materialize one explicitly when needed.
df = df.with_row_index('index')
df = df.drop('index')  # drop the index column if you don't want it
import polars as pl
from polars import col

##### Join and concat #####
# (df1, df2, df3 are assumed to be defined elsewhere.)
joined = df1.join(df2, on='key', how='left')
concat = pl.concat([joined, df3])

##### Pivot: long to wide and wide to long #####
long_df = pl.DataFrame({
    'vname': ['one', 'one', 'two', 'two'] * 3,
    'country': ['A', 'B', 'A', 'B'] * 3,
    'year': ['x1', 'x1', 'x1', 'x1', 'x2', 'x2', 'x2', 'x2', 'x3', 'x3', 'x3', 'x3'],
    'val': range(1, 13)
})
# long -> wide: the `columns=` parameter was renamed to `on=`.
wide_df = long_df.pivot(on='vname', index=['country', 'year'], values='val')
# wide -> long: `melt` was renamed to `unpivot` (`id_vars` -> `index`).
melted = wide_df.unpivot(index=['country', 'year'], variable_name='vname', value_name='val')

##### Groupby and transform #####
# `groupby` was renamed to `group_by`.
# Basic groupby and aggregation.
result = df.group_by('category').agg([
    col.value1.sum().alias('sum_value1'),
    col.value2.mean().alias('mean_value2'),
])
# Multiple aggregations: Polars expressions have no pandas-style
# `.agg(['sum', ...])`; spell each aggregation out explicitly.
result = df.group_by('category').agg([
    col.value1.sum().alias('value1_sum'),
    col.value1.mean().alias('value1_mean'),
    col.value1.max().alias('value1_max'),
    col.value2.min().alias('value2_min'),
    col.value2.median().alias('value2_median'),
])
# Custom aggregation: a pure expression is faster and clearer than
# applying a Python lambda to each group.
result = df.group_by('category').agg([
    (col.value1.sum() / col.value1.count()).alias('custom_agg')
])
# Transform (aggregate over groups but keep one value per original row).
df = df.with_columns(
    col.value1.mean().over('category').alias('category_mean')
)
import polars as pl
from polars import col

# ---- Sample panel data (two entities, four dates each) ----
df = pl.DataFrame({
    'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'] * 2,
    'id': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'value': [100, 101, 103, 106, 200, 202, 205, 210]
})

# Convert the string column to pl.Date.
# `fmt=` was renamed to `format=` in `str.strptime`.
df = df.with_columns(col.date.str.strptime(pl.Date, format='%Y-%m-%d'))

# Polars has no index; sort so window/shift operations see rows in order.
df = df.sort(['id', 'date'])

# Resample: one row per (id, date) with the mean value.
daily = df.group_by(['id', 'date']).agg([
    col.value.mean().alias('daily_avg')
])

# ---- Per-group (panel) derived columns ----
# All expressions are independent of each other, so a single
# `with_columns` call computes them in one pass.
df = df.with_columns([
    col.value.shift(1).over('id').alias('lag_value'),
    col.value.diff().over('id').alias('diff_value'),
    col.value.rolling_mean(window_size=2).over('id').alias('rolling_avg'),
    (col.value / col.value.shift(1).over('id') - 1).alias('pct_change'),
    # shift(365) assumes daily frequency; yields null until a year of data exists
    (col.value / col.value.shift(365).over('id') - 1).alias('yoy_growth'),
    col.value.cum_sum().over('id').alias('cumulative_sum'),
])

# ---- Time-based operations ----
# A dataset with more varied dates.
varied_df = pl.DataFrame({
    'date': ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15',
             '2023-03-01', '2023-03-15', '2023-04-01', '2023-04-15'],
    'value': [100, 110, 120, 115, 125, 130, 140, 145]
})
varied_df = varied_df.with_columns(col.date.str.strptime(pl.Date, format='%Y-%m-%d'))

# Resample to monthly frequency.
# `groupby_dynamic` was renamed to `group_by_dynamic`; the index column
# must be sorted, so sort explicitly rather than relying on input order.
monthly = varied_df.sort('date').group_by_dynamic('date', every='1mo').agg([
    col.value.mean().alias('monthly_avg')
])

# Month-over-month growth rate.
monthly = monthly.with_columns(
    (col.monthly_avg / col.monthly_avg.shift(1) - 1).alias('mom_growth')
)
print(monthly)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment