Skip to content

Instantly share code, notes, and snippets.

@wassname2
Last active May 23, 2024 02:00
Show Gist options
  • Save wassname2/97103aa0ec83d3a94b940b7d3bbb38ee to your computer and use it in GitHub Desktop.
pandas utils
import pandas as pd
import numpy as np
def pandas_groupby_agg_mixed_dtype_df(df, groupby: list, agg='max', str_agg='first'):
    """
    Group-aggregate a DataFrame that mixes numeric and non-numeric columns.

    Numeric columns are aggregated with `agg`, all other columns with
    `str_agg`, and the two halves are recombined in the original column order.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both numeric and non-numeric columns.
    groupby : list
        Column names to group on.
    agg : str or callable
        Aggregation applied to numeric columns (default 'max').
    str_agg : str or callable
        Aggregation applied to non-numeric columns (default 'first').

    Returns
    -------
    pd.DataFrame with one row per group and the same column order as `df`.
    """
    # Each half keeps the group keys so the two groupbys can be aligned
    # on the key index rather than by positional row order (the positional
    # join silently breaks if the two sides ever sort differently).
    numerics_cols = list(set(list(df.select_dtypes(include='number').columns) + groupby))
    other_cols = list(set(list(df.select_dtypes(exclude='number').columns) + groupby))

    numerics = df[numerics_cols].groupby(groupby).agg(agg)
    other = df[other_cols].groupby(groupby).agg(str_agg)

    # Join on the shared group-key index, then restore keys as columns.
    df2 = numerics.join(other).reset_index()
    # reorder to match the input's column order
    return df2[df.columns]
def pandas_resample_mixed_dtype_df(df, freq='1h', agg='mean', str_agg='first'
    ):
    """
    Resample a datetime-indexed DataFrame containing numeric and string columns.

    Numeric columns use `agg`, everything else uses `str_agg`; the two
    resampled halves are joined back together in the input column order.
    """
    num_part = df.select_dtypes('number')
    txt_part = df.select_dtypes(exclude='number')

    resampled_num = num_part.resample(freq).agg(agg)
    resampled_txt = txt_part.resample(freq).agg(str_agg)

    # both halves share the resampled DatetimeIndex, so a plain join aligns them
    combined = resampled_num.join(resampled_txt)

    # restore the caller's column order
    return combined[df.columns]
def resample_grouped_df(df, freq='1h', agg='mean', groups=['FLOC', 'mptId']):
    """
    Resample a frame holding MULTIPLE interleaved timeseries.

    Each combination of `groups` values is a separate timeseries, so we must
    group first and resample within each group; a naive resample would mix
    readings from different series into the same time bin.

    NOTE: `groups` is a mutable default but is never mutated here.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a 'timestamp' column and the `groups` columns.
    freq, agg : passed through to pandas_resample_mixed_dtype_df.
    groups : list of column names identifying one timeseries.

    Returns
    -------
    pd.DataFrame resampled per group, sorted by 'timestamp'.
    """
    # BUG FIX: the original built this chain but discarded the result and
    # returned the input unchanged; we now return the resampled frame.
    out = (df.set_index('timestamp')
           .groupby(groups)  # each combination of these is a separate timeseries
           .apply(lambda d: pandas_resample_mixed_dtype_df(d, freq, agg).reset_index(drop=False))
           .reset_index(drop=True)
           .dropna(how='all', axis=0)  # drop any times where there are no values for anything
           .sort_values('timestamp')
           )
    return out
def chunking_by_time(startTime: pd.Timestamp, endTime: pd.Timestamp, freqs=['Y', 'M', 'D']):
    """
    Split a timespan into cacheable chunk boundaries: coarse chunks first,
    then progressively finer chunks for the remainder.

    Old data is covered by big (e.g. yearly) chunks that rarely change, while
    recent data gets smaller (monthly, then daily) chunks that can be
    re-fetched cheaply.

    ```py
    startTime = pd.to_datetime('2023-01-02 01:01')
    endTime = pd.to_datetime('2024-04-06 05:55')
    chunks = chunking_by_time(startTime, endTime, ['Y', 'M', 'D'])
    # [Timestamp('2023-01-02 01:01:00'),
    #  Timestamp('2023-12-31 00:00:00'),
    #  Timestamp('2024-01-31 00:00:00'),
    #  ...
    #  Timestamp('2024-04-06 00:00:00'),
    #  Timestamp('2024-04-06 01:01:00')]
    ```

    Returns
    -------
    Sorted, de-duplicated list of pd.Timestamp boundaries, always including
    `startTime` and `endTime`.
    """
    # Daily grid spanning the whole range; finer-than-daily resolution is
    # intentionally not chunked.
    date_range = pd.date_range(startTime, endTime)
    df = pd.DataFrame(date_range, columns=['date'], index=date_range)

    groups = [startTime, endTime]  # always keep the exact endpoints
    for freq in freqs:
        grouped = df.groupby(pd.Grouper(key='date', freq=freq))
        gs = [ts for ts, _ in grouped]
        # only keep boundaries that fall inside what's left of the range
        gs = [ts for ts in gs if ts <= df.index.max()]
        groups += gs
        # shrink the working range to the remainder after the last boundary,
        # so the next (finer) frequency only chunks the tail
        df = df.loc[groups[-1]:]
        # (a second, unused recomputation of `gs` was removed here — dead code)

    # dedup and order the boundaries
    return sorted(set(groups))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment