Skip to content

Instantly share code, notes, and snippets.

@wassname2
Last active May 23, 2024 02:00
Show Gist options
  • Save wassname2/97103aa0ec83d3a94b940b7d3bbb38ee to your computer and use it in GitHub Desktop.
pandas utils
import pandas as pd
import numpy as np
def pandas_groupby_agg_mixed_dtype_df(df, groupby: list, agg='max', str_agg='first'):
    """
    Group-aggregate a DataFrame that mixes numeric and non-numeric columns.

    Numeric columns are aggregated with `agg`, all other columns with
    `str_agg`, and the two halves are recombined in the original column order.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both numeric and non-numeric columns.
    groupby : list
        Column names to group on.
    agg : str or callable
        Aggregation applied to numeric columns (default 'max').
    str_agg : str or callable
        Aggregation applied to non-numeric columns (default 'first').

    Returns
    -------
    pd.DataFrame with one row per group and the same column order as `df`.
    """
    # Each half keeps the group keys so the two groupbys can be aligned
    # on the key index rather than by positional row order (the positional
    # join silently breaks if the two sides ever sort differently).
    numerics_cols = list(set(list(df.select_dtypes(include='number').columns) + groupby))
    other_cols = list(set(list(df.select_dtypes(exclude='number').columns) + groupby))

    numerics = df[numerics_cols].groupby(groupby).agg(agg)
    other = df[other_cols].groupby(groupby).agg(str_agg)

    # Join on the shared group-key index, then restore keys as columns.
    df2 = numerics.join(other).reset_index()
    # reorder to match the input's column order
    return df2[df.columns]
def pandas_resample_mixed_dtype_df(df, freq='1h', agg='mean', str_agg='first'
    ):
    """
    Resample a datetime-indexed DataFrame containing numeric and string columns.

    Numeric columns use `agg`, everything else uses `str_agg`; the two
    resampled halves are joined back together in the input column order.
    """
    num_part = df.select_dtypes('number')
    txt_part = df.select_dtypes(exclude='number')

    resampled_num = num_part.resample(freq).agg(agg)
    resampled_txt = txt_part.resample(freq).agg(str_agg)

    # both halves share the resampled DatetimeIndex, so a plain join aligns them
    combined = resampled_num.join(resampled_txt)

    # restore the caller's column order
    return combined[df.columns]
def resample_grouped_df(df, freq='1h', agg='mean', groups=['FLOC', 'mptId']):
    """
    Resample a frame holding MULTIPLE interleaved timeseries.

    Each combination of `groups` values is a separate timeseries, so we must
    group first and resample within each group; a naive resample would mix
    readings from different series into the same time bin.

    NOTE: `groups` is a mutable default but is never mutated here.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a 'timestamp' column and the `groups` columns.
    freq, agg : passed through to pandas_resample_mixed_dtype_df.
    groups : list of column names identifying one timeseries.

    Returns
    -------
    pd.DataFrame resampled per group, sorted by 'timestamp'.
    """
    # BUG FIX: the original built this chain but discarded the result and
    # returned the input unchanged; we now return the resampled frame.
    out = (df.set_index('timestamp')
           .groupby(groups)  # each combination of these is a separate timeseries
           .apply(lambda d: pandas_resample_mixed_dtype_df(d, freq, agg).reset_index(drop=False))
           .reset_index(drop=True)
           .dropna(how='all', axis=0)  # drop any times where there are no values for anything
           .sort_values('timestamp')
           )
    return out
def chunking_by_time(startTime: pd.Timestamp, endTime: pd.Timestamp, freqs=['Y', 'M', 'D']):
    """
    Split a timespan into cacheable chunk boundaries: coarse chunks first,
    then progressively finer chunks for the remainder.

    Old data is covered by big (e.g. yearly) chunks that rarely change, while
    recent data gets smaller (monthly, then daily) chunks that can be
    re-fetched cheaply.

    ```py
    startTime = pd.to_datetime('2023-01-02 01:01')
    endTime = pd.to_datetime('2024-04-06 05:55')
    chunks = chunking_by_time(startTime, endTime, ['Y', 'M', 'D'])
    # [Timestamp('2023-01-02 01:01:00'),
    #  Timestamp('2023-12-31 00:00:00'),
    #  Timestamp('2024-01-31 00:00:00'),
    #  ...
    #  Timestamp('2024-04-06 00:00:00'),
    #  Timestamp('2024-04-06 01:01:00')]
    ```

    Returns
    -------
    Sorted, de-duplicated list of pd.Timestamp boundaries, always including
    `startTime` and `endTime`.
    """
    # Daily grid spanning the whole range; finer-than-daily resolution is
    # intentionally not chunked.
    date_range = pd.date_range(startTime, endTime)
    df = pd.DataFrame(date_range, columns=['date'], index=date_range)

    groups = [startTime, endTime]  # always keep the exact endpoints
    for freq in freqs:
        grouped = df.groupby(pd.Grouper(key='date', freq=freq))
        gs = [ts for ts, _ in grouped]
        # only keep boundaries that fall inside what's left of the range
        gs = [ts for ts in gs if ts <= df.index.max()]
        groups += gs
        # shrink the working range to the remainder after the last boundary,
        # so the next (finer) frequency only chunks the tail
        df = df.loc[groups[-1]:]
        # (a second, unused recomputation of `gs` was removed here — dead code)

    # dedup and order the boundaries
    return sorted(set(groups))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment