Skip to content

Instantly share code, notes, and snippets.

@u8sand
Created October 10, 2020 21:22
Show Gist options
  • Save u8sand/79a9d0b43c85bef8f0904529ab7f6633 to your computer and use it in GitHub Desktop.
Save u8sand/79a9d0b43c85bef8f0904529ab7f6633 to your computer and use it in GitHub Desktop.
A pandas-style merge which works efficiently on a join field that consists of sets (via an inverted index) -- relevant for things like using synonyms
def build_inverse_dict(items):
    """Build an inverted index from ``(key, values)`` pairs.

    Args:
        items: Iterable of ``(key, values)`` pairs. ``key`` must be hashable
            and ``values`` must be an iterable of hashable members (for
            example a list or set of synonyms).

    Returns:
        A plain ``dict`` mapping every member found in any ``values`` to the
        ``set`` of keys whose ``values`` contained that member. Members
        appear in first-seen order; pairs with empty ``values`` contribute
        nothing.
    """
    inverse = {}
    for key, values in items:
        for member in values:
            # Grow the bucket in place: rebuilding it with ``|`` copies the
            # whole set on every single insertion.
            inverse.setdefault(member, set()).add(key)
    return inverse
def pd_merge_on_set(left=None, left_on=None, right=None, right_on=None):
    """Join two tables on set-valued columns via an inverted index.

    Two rows match when their join columns share at least one member. Both
    columns are inverted with ``build_inverse_dict`` so that matching is done
    per member rather than by comparing every pair of rows.

    Args:
        left: Table-like object; ``left[left_on].items()`` must yield
            ``(row key, iterable of members)`` pairs. For a pandas
            ``DataFrame`` the row key is the index label.
        left_on: Name of the set-valued column in ``left``.
        right: Table-like object, same requirements as ``left``.
        right_on: Name of the set-valued column in ``right``.

    Yields:
        One ``(left_keys, right_keys)`` tuple of sets per member present on
        both sides, ordered by the member's first appearance in ``left``.
        A pair of rows sharing several members is therefore reported once
        per shared member.

    Example (both frames indexed by their id column):

    ```raw
    df_x:                                 df_y:
    left_id  left_literal  left_o2m       right_id  right_literal  right_o2m
    1        test1         ['a', 'b']     2         test2          ['a']
    3        test3         ['c']          4         test4          ['b']
    ```

    ```python
    merged = list(pd_merge_on_set(left=df_x, left_on='left_o2m', right=df_y, right_on='right_o2m'))
    assert merged == [
        ({1}, {2}),  # shared member 'a'
        ({1}, {4}),  # shared member 'b'
    ]
    ```
    """
    left_index = build_inverse_dict(left[left_on].items())
    right_index = build_inverse_dict(right[right_on].items())
    # Walk the left index instead of a set intersection of the keys: dict
    # order is insertion order, so the output order is reproducible between
    # runs (set iteration order of strings is not).
    for member, left_keys in left_index.items():
        right_keys = right_index.get(member)
        if right_keys is not None:
            yield (left_keys, right_keys)
def pd_automerge_on_set(df=None, df_on=None):
    """Find rows of one table whose set-valued column shares a member.

    Args:
        df: Table-like object; ``df[df_on].items()`` must yield
            ``(row key, iterable of members)`` pairs. For a pandas
            ``DataFrame`` the row key is the index label.
        df_on: Name of the set-valued column in ``df``.

    Yields:
        For every member that occurs in more than one row, the ``set`` of
        row keys containing it. Rows that share several members are yielded
        once per shared member, so equal groups can appear more than once.
    """
    member_index = build_inverse_dict(df[df_on].items())
    # Only the buckets matter here; the member itself is not reported.
    for row_keys in member_index.values():
        if len(row_keys) > 1:
            yield row_keys
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment