Created
September 8, 2021 14:18
-
-
Save kingjr/4fb5f6ac0f04a71651e8d8f3f358044e to your computer and use it in GitHub Desktop.
match_list.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from Levenshtein import editops
def match_list(A, B, on_replace="delete"):
    """Match two lists of different sizes and return corresponding indices.

    The two sequences are aligned with the Levenshtein edit operations
    (insert / delete / replace) needed to turn ``A`` into ``B``; elements
    touched by an insertion or a deletion are dropped from the match.

    Parameters
    ----------
    A: list | array, shape (n,)
        The values of the first list
    B: list | array, shape (m,)
        The values of the second list
    on_replace: {"delete", "keep"}
        What to do with a pair of elements aligned by a "replace"
        operation (i.e. same position in the alignment, different values):
        "delete" drops the pair from both outputs, "keep" retains it.

    Returns
    -------
    A_idx : array
        The indices of the A list that match those of the B
    B_idx : array
        The indices of the B list that match those of the A

    Raises
    ------
    NotImplementedError
        If ``on_replace`` is neither "delete" nor "keep".
    """
    # Validate upfront so a bad option is reported even when the inputs
    # happen to need no replace operation.
    if on_replace not in ("delete", "keep"):
        raise NotImplementedError(
            f"on_replace must be 'delete' or 'keep', got {on_replace!r}"
        )

    n_a, n_b = len(A), len(B)

    # Map every distinct value to a small integer code. Using the inverse
    # indices (rather than a dict lookup on the raw values) stays correct
    # even when concatenation coerced the values to a common dtype.
    _, codes = np.unique(np.r_[A, B], return_inverse=True)

    def int_to_unicode(array: np.ndarray) -> str:
        # One unicode character per code, so that the string edit
        # operations operate element-wise on the original sequences.
        return "".join(chr(int(code)) for code in array)

    changes = editops(int_to_unicode(codes[:n_a]), int_to_unicode(codes[n_a:]))

    keep_a = np.ones(n_a, dtype=bool)
    keep_b = np.ones(n_b, dtype=bool)
    for type_, val_a, val_b in changes:
        if type_ == "insert":
            # Element exists only in B.
            keep_b[val_b] = False
        elif type_ == "delete":
            # Element exists only in A.
            keep_a[val_a] = False
        elif on_replace == "delete":
            # Replace operation: aligned but different, drop both sides.
            keep_a[val_a] = False
            keep_b[val_b] = False
        # on_replace == "keep": leave the aligned pair in place.

    A_sel = np.arange(n_a)[keep_a]
    B_sel = np.arange(n_b)[keep_b]
    # Internal invariant: every kept element of A is paired with one of B.
    assert len(B_sel) == len(A_sel)
    return A_sel.astype(int), B_sel.astype(int)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment