Last active
October 3, 2019 15:04
-
-
Save jwhendy/2b83320eb33792adabb4c119ae09fced to your computer and use it in GitHub Desktop.
Given a set of data with some sort value column, and columns for an old and new value (where each row's old value is the new value from the previous row), put rows in the correct order when they have the same sort key value.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import itertools | |
df = pd.DataFrame({'group': ['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b'], | |
'date': [0, 1, 1, 1, 1, 2, 3, 4, 4], | |
'old': [1, 8, 2, 2, 5, 5, 4, 10, 7], | |
'new': [2, 5, 5, 8, 2, 4, 7, 1, 10]}) | |
print(df) | |
### jumbled: the `new` value of a row is not the same as the next row's `old` value | |
# group date old new | |
# 0 a 0 1 2 | |
# 1 a 1 8 5 | |
# 2 a 1 2 5 | |
# 3 a 1 2 8 | |
# 4 a 1 5 2 | |
# 5 a 2 5 4 | |
# 6 b 3 4 7 | |
# 7 b 4 10 1 | |
# 8 b 4 7 10 | |
### this is hacky; brute force permutation search | |
def find_order(rows, first=None, last=None): | |
combos = itertools.permutations(range(len(rows)), len(rows)) | |
for combo in combos: | |
# we apply this test order, then extract the 1:end old values and :-1 new values | |
# in the correct order, each pair will be the same, so subtracting will equal 0 | |
old = rows.iloc[list(combo[1:])].old.tolist() | |
new = rows.iloc[list(combo[:-1])].new.tolist() | |
# if we have a preceeding unambiguous row, we need to include it's new value and | |
# ensure that in this permutation, the first old value is the same | |
if first is not None: | |
old.insert(0, rows['old'].iloc[list(combo)[0]]) | |
new.insert(0, first) | |
# if we have a trailing unambiguous row, we need to include it's old value and | |
# ensure that in this permutation, the last new value is the same | |
if last is not None: | |
old.append(last) | |
new.append(rows['new'].iloc[list(combo)[-1]]) | |
# subtract the zipped list and check for all zeros, if so, this is the order we want | |
if not any([a-b for (a, b) in zip(old, new)]): | |
return rows.iloc[list(combo)] | |
return rows | |
# high level processing of the group | |
def order_rows(rows): | |
# if there's only one row in this group, return it | |
if len(rows) == 1: | |
return rows | |
# start a sorted container to add to | |
df_temp = pd.DataFrame(columns=rows.columns) | |
# go through each unique date | |
for date in list(rows.date.unique()): | |
sub = rows[rows.date == date] | |
# if our subset is only one row, the date is unique; append it | |
if len(sub) == 1: | |
df_temp = df_temp.append(sub) | |
continue | |
# if we have a multi-row chunk, the dates are all the same | |
# if we've already got something in df_temp, there is a precediing constraint | |
# include this when we pass to the ordering function | |
first = df_temp['new'].iloc[-1] if len(df_temp)>0 else None | |
# if the rows in our current chunk and those in df_temp do not equal the total rows passed | |
# there's another row ahead; use this as a trailing constraint | |
last = rows['old'].loc[sub.index.max()+1] if len(sub)+len(df_temp)<len(rows) else None | |
# search for the right order, then append the sorted chunk | |
sub = find_order(sub, first, last) | |
df_temp = df_temp.append(sub) | |
return df_temp | |
df1 = df.copy() | |
df1 = df1.groupby(['group'], as_index=False, sort=False).apply(order_rows).reset_index(drop=True) | |
print(df1) | |
### correct: the `old` value in each row equals the `new` value of the previous row | |
# group date old new | |
# 0 a 0 1 2 | |
# 1 a 1 2 5 | |
# 2 a 1 5 2 | |
# 3 a 1 2 8 | |
# 4 a 1 8 5 | |
# 5 a 2 5 4 | |
# 6 b 3 4 7 | |
# 7 b 4 7 10 | |
# 8 b 4 10 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment