wassname2 · May 14, 2024 02:29
diff --git a/time_chunking.py b/time_chunking.py
 import pandas as pd


 def chunking_by_time(startTime: pd.Timestamp, endTime: pd.Timestamp, freqs=('Y', 'M', 'D')) -> list:
     """
     Split ``[startTime, endTime]`` into cache-friendly chunk boundaries.

     The coarsest frequency in ``freqs`` is applied first; each following
     frequency only subdivides what is left after the last boundary found so
     far. Old data therefore lands in big, stable chunks, while the most recent
     data is re-chunked finely as ``endTime`` moves.

     Args:
         startTime: first instant of the span; always the first boundary.
         endTime: last instant of the span; always the last boundary.
         freqs: pandas offset aliases ordered from coarsest to finest.
             NOTE: newer pandas releases prefer 'YE'/'ME' over 'Y'/'M'; the
             defaults are kept as they were for compatibility.

     Returns:
         Sorted, de-duplicated list of timestamps, all within
         ``[startTime, endTime]``.

     ```py
     startTime = pd.to_datetime('2023-01-02 01:01')
     endTime = pd.to_datetime('2024-04-06 05:55')
     chunks = chunking_by_time(startTime, endTime, ['Y', 'M', 'D'])

     >  [Timestamp('2023-01-02 01:01:00'),
       Timestamp('2023-12-31 00:00:00'),
       Timestamp('2024-01-31 00:00:00'),
       Timestamp('2024-02-29 00:00:00'),
       Timestamp('2024-03-31 00:00:00'),
       Timestamp('2024-04-01 00:00:00'),
       Timestamp('2024-04-02 00:00:00'),
       Timestamp('2024-04-03 00:00:00'),
       Timestamp('2024-04-04 00:00:00'),
       Timestamp('2024-04-05 00:00:00'),
       Timestamp('2024-04-06 00:00:00'),
       Timestamp('2024-04-06 05:55:00')]
     ```
     """
     start = pd.Timestamp(startTime)
     end = pd.Timestamp(endTime)

     # One row per calendar day, anchored at midnight rather than at the
     # start's time of day, so the final day is present even when endTime falls
     # earlier in the day than startTime does.
     days = pd.date_range(start.normalize(), end, freq='D')
     df = pd.DataFrame(days, columns=['date'], index=days)

     # The span's own end points are always boundaries.
     boundaries = {start, end}
     for freq in freqs:
         if df.empty:
             # Nothing left to subdivide (e.g. startTime is after endTime).
             break
         grouped = df.groupby(pd.Grouper(key='date', freq=freq))
         # Bin labels can fall outside the span (midnight of the start day,
         # or the end of a period that is still running); drop those.
         labels = [label for label, _ in grouped if start <= label <= end]
         if not labels:
             # No boundary at this granularity: leave the frame untouched so
             # the finer frequencies still see the whole remaining range.
             continue
         boundaries.update(labels)
         # Finer frequencies only chunk what follows the last boundary.
         df = df.loc[labels[-1]:]

     return sorted(boundaries)

 # sanity checks
 ## should include start and end in bins
 startTime = pd.to_datetime('2023-01-02 01:01')
 endTime = pd.to_datetime('2024-04-06 05:55')
 chunks = chunking_by_time(startTime, endTime, ['Y', 'M', 'D'])
 assert chunks[0] == startTime and chunks[-1] == endTime, chunks

 ## the last chunk should be different so we get fresh data
 # Move only the end forward: every earlier boundary must stay put (so cached
 # chunks can be reused) while the final boundary follows the new end.
 laterChunks = chunking_by_time(startTime, endTime + pd.Timedelta(hours=1), ['Y', 'M', 'D'])
 assert laterChunks[:-1] == chunks[:-1], (chunks, laterChunks)
 assert laterChunks[-1] != chunks[-1], (chunks, laterChunks)
 chunks  # bare expression kept so a notebook cell still displays the boundaries
	import pandas as pd


	def chunking_by_time(startTime: pd.Timestamp, endTime: pd.Timestamp, freqs=('Y', 'M', 'D')) -> list:
	    """
	    Split ``[startTime, endTime]`` into cache-friendly chunk boundaries.

	    The coarsest frequency in ``freqs`` is applied first; each following
	    frequency only subdivides what is left after the last boundary found so
	    far. Old data therefore lands in big, stable chunks, while the most recent
	    data is re-chunked finely as ``endTime`` moves.

	    Args:
	        startTime: first instant of the span; always the first boundary.
	        endTime: last instant of the span; always the last boundary.
	        freqs: pandas offset aliases ordered from coarsest to finest.
	            NOTE: newer pandas releases prefer 'YE'/'ME' over 'Y'/'M'; the
	            defaults are kept as they were for compatibility.

	    Returns:
	        Sorted, de-duplicated list of timestamps, all within
	        ``[startTime, endTime]``.

	    ```py
	    startTime = pd.to_datetime('2023-01-02 01:01')
	    endTime = pd.to_datetime('2024-04-06 05:55')
	    chunks = chunking_by_time(startTime, endTime, ['Y', 'M', 'D'])

	    >  [Timestamp('2023-01-02 01:01:00'),
	      Timestamp('2023-12-31 00:00:00'),
	      Timestamp('2024-01-31 00:00:00'),
	      Timestamp('2024-02-29 00:00:00'),
	      Timestamp('2024-03-31 00:00:00'),
	      Timestamp('2024-04-01 00:00:00'),
	      Timestamp('2024-04-02 00:00:00'),
	      Timestamp('2024-04-03 00:00:00'),
	      Timestamp('2024-04-04 00:00:00'),
	      Timestamp('2024-04-05 00:00:00'),
	      Timestamp('2024-04-06 00:00:00'),
	      Timestamp('2024-04-06 05:55:00')]
	    ```
	    """
	    start = pd.Timestamp(startTime)
	    end = pd.Timestamp(endTime)

	    # One row per calendar day, anchored at midnight rather than at the
	    # start's time of day, so the final day is present even when endTime falls
	    # earlier in the day than startTime does.
	    days = pd.date_range(start.normalize(), end, freq='D')
	    df = pd.DataFrame(days, columns=['date'], index=days)

	    # The span's own end points are always boundaries.
	    boundaries = {start, end}
	    for freq in freqs:
	        if df.empty:
	            # Nothing left to subdivide (e.g. startTime is after endTime).
	            break
	        grouped = df.groupby(pd.Grouper(key='date', freq=freq))
	        # Bin labels can fall outside the span (midnight of the start day,
	        # or the end of a period that is still running); drop those.
	        labels = [label for label, _ in grouped if start <= label <= end]
	        if not labels:
	            # No boundary at this granularity: leave the frame untouched so
	            # the finer frequencies still see the whole remaining range.
	            continue
	        boundaries.update(labels)
	        # Finer frequencies only chunk what follows the last boundary.
	        df = df.loc[labels[-1]:]

	    return sorted(boundaries)

	# sanity checks
	## should include start and end in bins
	startTime = pd.to_datetime('2023-01-02 01:01')
	endTime = pd.to_datetime('2024-04-06 05:55')
	chunks = chunking_by_time(startTime, endTime, ['Y', 'M', 'D'])
	assert chunks[0] == startTime and chunks[-1] == endTime, chunks

	## the last chunk should be different so we get fresh data
	# Move only the end forward: every earlier boundary must stay put (so cached
	# chunks can be reused) while the final boundary follows the new end.
	laterChunks = chunking_by_time(startTime, endTime + pd.Timedelta(hours=1), ['Y', 'M', 'D'])
	assert laterChunks[:-1] == chunks[:-1], (chunks, laterChunks)
	assert laterChunks[-1] != chunks[-1], (chunks, laterChunks)
	chunks  # bare expression kept so a notebook cell still displays the boundaries
No results found