Created
September 19, 2022 16:31
-
-
Save kingjr/0a3eb140f2d2d123c9bd1c6901a11876 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from Levenshtein import editops
def match_list(A, B, on_replace="delete"):
    """Match two lists of different sizes and return corresponding indices.

    The two sequences are aligned with Levenshtein edit operations
    (``editops``). Items touched by an insertion or a deletion are dropped,
    so the two returned index arrays have the same length and pair up
    position by position.

    Parameters
    ----------
    A: list | array, shape (n,)
        The values of the first list
    B: list | array: shape (m, )
        The values of the second list
    on_replace: "delete" | "keep"
        What to do with two items aligned by a substitution: "delete"
        drops both of them, "keep" keeps them as a pair even though their
        values differ.

    Returns
    -------
    A_idx : array
        The indices of the A list that match those of the B
    B_idx : array
        The indices of the B list that match those of the A

    Raises
    ------
    NotImplementedError
        If ``on_replace`` is neither "delete" nor "keep".
    """
    # Validate up front: otherwise a bad value is only noticed when the
    # alignment happens to contain a substitution.
    if on_replace not in ("delete", "keep"):
        raise NotImplementedError(
            f"on_replace must be 'delete' or 'keep', got {on_replace!r}"
        )

    if not isinstance(A, str):
        # editops compares strings, so give every distinct value its own
        # character. The inverse indices returned by np.unique are exactly
        # the per-item codes (index of each item among the sorted uniques).
        _, codes = np.unique(np.r_[A, B], return_inverse=True)
        codes = codes.ravel().tolist()
        n_a = len(A)
        A = "".join(map(chr, codes[:n_a]))
        B = "".join(map(chr, codes[n_a:]))

    # Boolean masks of the positions that survive the alignment.
    keep_a = np.ones(len(A), dtype=bool)
    keep_b = np.ones(len(B), dtype=bool)
    drop_replaced = on_replace == "delete"

    for op, pos_a, pos_b in editops(A, B):
        if op == "insert":
            # Item exists only in B.
            keep_b[pos_b] = False
        elif op == "delete":
            # Item exists only in A.
            keep_a[pos_a] = False
        elif drop_replaced:
            # Substitution: the aligned items differ, drop both sides.
            keep_a[pos_a] = False
            keep_b[pos_b] = False
        # on_replace == "keep": leave the substituted pair in place.

    A_idx = np.flatnonzero(keep_a).astype(int)
    B_idx = np.flatnonzero(keep_b).astype(int)
    # Internal sanity check: every remaining item of A is paired with one of B.
    assert len(A_idx) == len(B_idx)
    return A_idx, B_idx
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment