Skip to content

Instantly share code, notes, and snippets.

@wassname2
Last active May 14, 2024 02:29
Show Gist options
  • Save wassname2/3cdad918633913bfc06761dc359f5219 to your computer and use it in GitHub Desktop.
Save wassname2/3cdad918633913bfc06761dc359f5219 to your computer and use it in GitHub Desktop.
time_chunking.py: a good way to chunk a time span into human-readable chunks of decreasing size
import pandas as pd
# Period-end aliases resolved to offset objects, so that both the legacy
# spellings ('Y', 'M', ...) and the newer ones ('YE', 'ME', ...) are accepted
# regardless of which spelling the installed pandas parses from a string.
_PERIOD_END_OFFSETS = {
    'A': pd.offsets.YearEnd(),
    'Y': pd.offsets.YearEnd(),
    'YE': pd.offsets.YearEnd(),
    'Q': pd.offsets.QuarterEnd(startingMonth=12),
    'QE': pd.offsets.QuarterEnd(startingMonth=12),
    'M': pd.offsets.MonthEnd(),
    'ME': pd.offsets.MonthEnd(),
}


def chunking_by_time(startTime: pd.Timestamp, endTime: pd.Timestamp, freqs=('Y', 'M', 'D')):
    """
    Split a time span into boundaries of decreasing size.

    How do we cache timespans? We want to take year chunks, then months for
    the remainder, and so on. That way old data is cached in big chunks, and
    new data is rechunked as needed.

    Args:
        startTime: first instant of the span; always the first boundary.
        endTime: last instant of the span; always the last boundary.
        freqs: pandas frequency aliases or offset objects, coarsest first.
            Each level only subdivides what is left after the last boundary
            found by the previous level. Boundaries are computed on a daily
            grid, so frequencies finer than one day are not fully resolved.

    Returns:
        Sorted list of unique timestamps within [startTime, endTime].

    ```py
    startTime = pd.to_datetime('2023-01-02 01:01')
    endTime = pd.to_datetime('2024-04-06 05:55')
    chunks = chunking_by_time(startTime, endTime, ['Y', 'M', 'D'])
    > [Timestamp('2023-01-02 01:01:00'),
    Timestamp('2023-12-31 00:00:00'),
    Timestamp('2024-01-31 00:00:00'),
    Timestamp('2024-02-29 00:00:00'),
    Timestamp('2024-03-31 00:00:00'),
    Timestamp('2024-04-01 00:00:00'),
    Timestamp('2024-04-02 00:00:00'),
    Timestamp('2024-04-03 00:00:00'),
    Timestamp('2024-04-04 00:00:00'),
    Timestamp('2024-04-05 00:00:00'),
    Timestamp('2024-04-06 00:00:00'),
    Timestamp('2024-04-06 05:55:00')]
    ```
    """
    start = pd.Timestamp(startTime)
    end = pd.Timestamp(endTime)

    # One row per calendar day touched by the span. Normalizing both ends
    # makes sure the day containing `end` is present even when its time of
    # day is earlier than that of `start`.
    days = pd.date_range(start.normalize(), end.normalize(), freq='D')
    df = pd.DataFrame(days, columns=['date'], index=days)

    boundaries = {start, end}  # start with our first and last
    for freq in freqs:
        if df.empty:
            # Nothing left to subdivide (also covers end < start).
            break
        if isinstance(freq, str):
            freq = _PERIOD_END_OFFSETS.get(freq, freq)
        grouper = pd.Grouper(key='date', freq=freq)

        # Bin labels can fall outside the span (a period end after `end`, or
        # the midnight before `start`), so keep only those inside it.
        level = [key for key, _ in df.groupby(grouper) if start <= key <= end]
        if not level:
            # No boundary at this granularity: leave the remainder untouched
            # so that finer levels can still subdivide it.
            continue

        boundaries.update(level)
        # Finer levels only subdivide what follows the last coarse boundary.
        df = df.loc[level[-1]:]

    # The set already removed duplicates; return them in chronological order.
    return sorted(boundaries)
# sanity checks
## the result should include start and end as its first and last bins
startTime = pd.to_datetime('2023-01-02 01:01')
endTime = pd.to_datetime('2024-04-06 05:55')
chunks = chunking_by_time(startTime, endTime, ['Y', 'M', 'D'])
assert chunks[0] == startTime, chunks[0]
assert chunks[-1] == endTime, chunks[-1]

## the last chunk should be different so we get fresh data:
## moving the end forward must change only the final boundary
laterEndTime = endTime + pd.Timedelta(hours=1)
laterChunks = chunking_by_time(startTime, laterEndTime, ['Y', 'M', 'D'])
assert laterChunks[:-1] == chunks[:-1]
assert laterChunks[-1] == laterEndTime
assert laterChunks[-1] != chunks[-1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment