jwhendy · October 3, 2019 15:04
diff --git a/old-new-sorter.py b/old-new-sorter.py
 import pandas as pd
 import itertools
 df = pd.DataFrame({'group': ['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b'],
                   'date': [0, 1, 1, 1, 1, 2, 3, 4, 4],
                   'old': [1, 8, 2, 2, 5, 5, 4, 10, 7],
                   'new': [2, 5, 5, 8, 2, 4, 7, 1, 10]})
 print(df)

 ### jumbled: the `new` value of a row is not the same as the next row's `old` value
 #   group  date  old  new
 # 0     a     0    1    2
 # 1     a     1    8    5
 # 2     a     1    2    5
 # 3     a     1    2    8
 # 4     a     1    5    2
 # 5     a     2    5    4
 # 6     b     3    4    7
 # 7     b     4   10    1
 # 8     b     4    7   10

 ### this is hacky; brute force permutation search
 def find_order(rows, first=None, last=None):
    combos = itertools.permutations(range(len(rows)), len(rows))
    for combo in combos:
        # we apply this test order, then extract the 1:end old values and :-1 new values
        # in the correct order, each pair will be the same, so subtracting will equal 0
        old = rows.iloc[list(combo[1:])].old.tolist()
        new = rows.iloc[list(combo[:-1])].new.tolist()
        # if we have a preceeding unambiguous row, we need to include it's new value and
        # ensure that in this permutation, the first old value is the same
        if first is not None:
            old.insert(0, rows['old'].iloc[list(combo)[0]])
            new.insert(0, first)
        # if we have a trailing unambiguous row, we need to include it's old value and
        # ensure that in this permutation, the last new value is the same
        if last is not None:
            old.append(last)
            new.append(rows['new'].iloc[list(combo)[-1]])
        # subtract the zipped list and check for all zeros, if so, this is the order we want
        if not any([a-b for (a, b) in zip(old, new)]):
            return rows.iloc[list(combo)]
    
    return rows

 # high level processing of the group
 def order_rows(rows):
    # if there's only one row in this group, return it
    if len(rows) == 1:
        return rows
    
    # start a sorted container to add to
    df_temp = pd.DataFrame(columns=rows.columns)
    
    # go through each unique date
    for date in list(rows.date.unique()):
        sub = rows[rows.date == date]
        # if our subset is only one row, the date is unique; append it
        if len(sub) == 1:
            df_temp = df_temp.append(sub)
            continue
        
        # if we have a multi-row chunk, the dates are all the same
        # if we've already got something in df_temp, there is a precediing constraint
        # include this when we pass to the ordering function
        first = df_temp['new'].iloc[-1] if len(df_temp)>0 else None
        
        # if the rows in our current chunk and those in df_temp do not equal the total rows passed
        # there's another row ahead; use this as a trailing constraint
        last = rows['old'].loc[sub.index.max()+1] if len(sub)+len(df_temp)<len(rows) else None
        
        # search for the right order, then append the sorted chunk
        sub = find_order(sub, first, last)
        df_temp = df_temp.append(sub)
    
    return df_temp

 df1 = df.copy()
 df1 = df1.groupby(['group'], as_index=False, sort=False).apply(order_rows).reset_index(drop=True)
 print(df1)

 ### correct: the `old` value in each row equals the `new` value of the previous row
 #   group date old new
 # 0     a    0   1   2
 # 1     a    1   2   5
 # 2     a    1   5   2
 # 3     a    1   2   8
 # 4     a    1   8   5
 # 5     a    2   5   4
 # 6     b    3   4   7
 # 7     b    4   7  10
 # 8     b    4  10   1
	import pandas as pd
	import itertools
	df = pd.DataFrame({'group': ['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b'],
	'date': [0, 1, 1, 1, 1, 2, 3, 4, 4],
	'old': [1, 8, 2, 2, 5, 5, 4, 10, 7],
	'new': [2, 5, 5, 8, 2, 4, 7, 1, 10]})
	print(df)

	### jumbled: the `new` value of a row is not the same as the next row's `old` value
	# group date old new
	# 0 a 0 1 2
	# 1 a 1 8 5
	# 2 a 1 2 5
	# 3 a 1 2 8
	# 4 a 1 5 2
	# 5 a 2 5 4
	# 6 b 3 4 7
	# 7 b 4 10 1
	# 8 b 4 7 10

	### this is hacky; brute force permutation search
	def find_order(rows, first=None, last=None):
	combos = itertools.permutations(range(len(rows)), len(rows))
	for combo in combos:
	# we apply this test order, then extract the 1:end old values and :-1 new values
	# in the correct order, each pair will be the same, so subtracting will equal 0
	old = rows.iloc[list(combo[1:])].old.tolist()
	new = rows.iloc[list(combo[:-1])].new.tolist()
	# if we have a preceeding unambiguous row, we need to include it's new value and
	# ensure that in this permutation, the first old value is the same
	if first is not None:
	old.insert(0, rows['old'].iloc[list(combo)[0]])
	new.insert(0, first)
	# if we have a trailing unambiguous row, we need to include it's old value and
	# ensure that in this permutation, the last new value is the same
	if last is not None:
	old.append(last)
	new.append(rows['new'].iloc[list(combo)[-1]])
	# subtract the zipped list and check for all zeros, if so, this is the order we want
	if not any([a-b for (a, b) in zip(old, new)]):
	return rows.iloc[list(combo)]

	return rows

	# high level processing of the group
	def order_rows(rows):
	# if there's only one row in this group, return it
	if len(rows) == 1:
	return rows

	# start a sorted container to add to
	df_temp = pd.DataFrame(columns=rows.columns)

	# go through each unique date
	for date in list(rows.date.unique()):
	sub = rows[rows.date == date]
	# if our subset is only one row, the date is unique; append it
	if len(sub) == 1:
	df_temp = df_temp.append(sub)
	continue

	# if we have a multi-row chunk, the dates are all the same
	# if we've already got something in df_temp, there is a precediing constraint
	# include this when we pass to the ordering function
	first = df_temp['new'].iloc[-1] if len(df_temp)>0 else None

	# if the rows in our current chunk and those in df_temp do not equal the total rows passed
	# there's another row ahead; use this as a trailing constraint
	last = rows['old'].loc[sub.index.max()+1] if len(sub)+len(df_temp)<len(rows) else None

	# search for the right order, then append the sorted chunk
	sub = find_order(sub, first, last)
	df_temp = df_temp.append(sub)

	return df_temp

	df1 = df.copy()
	df1 = df1.groupby(['group'], as_index=False, sort=False).apply(order_rows).reset_index(drop=True)
	print(df1)

	### correct: the `old` value in each row equals the `new` value of the previous row
	# group date old new
	# 0 a 0 1 2
	# 1 a 1 2 5
	# 2 a 1 5 2
	# 3 a 1 2 8
	# 4 a 1 8 5
	# 5 a 2 5 4
	# 6 b 3 4 7
	# 7 b 4 7 10
	# 8 b 4 10 1