Created
October 10, 2020 21:22
-
-
Save u8sand/79a9d0b43c85bef8f0904529ab7f6633 to your computer and use it in GitHub Desktop.
A pandas-style merge that works efficiently when the join field holds sets of values (implemented with an inverted index) -- useful, for example, when joining records on lists of synonyms.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def build_inverse_dict(items):
    ''' Build an inverted index from `(key, values)` pairs.

    Every pair in `items` is unpacked as `(k, V)`; each element `v` of `V`
    is mapped to the set of all keys `k` whose `V` contained it.

    ```python
    assert build_inverse_dict([(1, ['a', 'b']), (2, ['a'])]) == {'a': {1, 2}, 'b': {1}}
    ```

    :param items: iterable of `(key, iterable_of_values)` pairs; keys and
        values must be hashable because they are stored in sets / used as
        dict keys
    :return: plain dict mapping each value to the set of keys it appeared under
    '''
    idict = {}
    for k, V in items:
        for v in V:
            # Grow the existing set in place. Rebuilding it with
            # `idict.get(v, set()) | {k}` copies the whole set on every hit,
            # which is quadratic for a value shared by many keys.
            idict.setdefault(v, set()).add(k)
    return idict
def pd_merge_on_set(left=None, left_on=None, right=None, right_on=None):
    ''' Merge on a one to many relationship.

    `left[left_on]` and `right[right_on]` must each expose `.items()`
    yielding `(key, values)` pairs (e.g. a pandas column, where the key is
    the row's index label). Both sides are turned into inverted indexes with
    `build_inverse_dict`, and one pair is yielded for every value that occurs
    on both sides.

    ```raw
    df_x (indexed by left_id):            df_y (indexed by right_id):
    left_id  left_literal  left_o2m       right_id  right_literal  right_o2m
    1        test1         ['a', 'b']     2         test2          ['a']
    3        test3         ['c']          4         test4          ['b']
    ```

    ```python
    merged = list(pd_merge_on_set(left=df_x, left_on='left_o2m', right=df_y, right_on='right_o2m'))
    # One pair per shared value ('a' and 'b'); the order is unspecified.
    assert len(merged) == 2
    assert ({1}, {2}) in merged
    assert ({1}, {4}) in merged
    ```

    :return: generator of `(left_keys, right_keys)` tuples, where each
        element is the set of keys on that side containing the shared value.
        Nothing runs until the generator is first iterated.
    '''
    left_index = build_inverse_dict(left[left_on].items())
    right_index = build_inverse_dict(right[right_on].items())
    # Key views support set intersection, so only values present on both
    # sides are visited.
    for shared in left_index.keys() & right_index.keys():
        yield (left_index[shared], right_index[shared])
def pd_automerge_on_set(df=None, df_on=None):
    ''' Find groups of keys within one table that share a value.

    `df[df_on]` must expose `.items()` yielding `(key, values)` pairs. The
    column is turned into an inverted index with `build_inverse_dict`, and
    the key set of every value that is held by more than one key is yielded.

    :return: generator of sets of keys, one set per value shared by at
        least two keys
    '''
    inverse_index = build_inverse_dict(df[df_on].items())
    # Only the key sets matter here; the shared value itself is not reported.
    yield from (
        key_group
        for key_group in inverse_index.values()
        if len(key_group) > 1
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment