Created
June 19, 2018 19:37
-
-
Save MaxHalford/f4fab4c56644e074e61dcbd1e0fd8fe9 to your computer and use it in GitHub Desktop.
Reverse engineer one-hot encoded categorical variable
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Reverse engineer one-hot encoded categorical variables.

Given a boolean frame of indicator columns, find groups of columns that are
pairwise mutually exclusive (no row has two of them set at once). Each such
group is a candidate for having been produced by one-hot encoding a single
categorical variable.
"""
import collections
import itertools

import pandas as pd

# Toy data: a-d look like one 4-level variable, e-f like one 2-level variable.
# Comparing with 1 turns the 0/1 integers into booleans so `&` is a logical AND.
df = pd.DataFrame({
    'a': [1, 0, 0, 0],
    'b': [0, 1, 0, 0],
    'c': [0, 0, 1, 0],
    'd': [0, 0, 0, 1],
    'e': [1, 1, 0, 0],
    'f': [0, 0, 1, 1],
}) == 1

columns = df.columns.tolist()

# candidates[x] is the set of columns that are never active on the same row
# as x, i.e. the columns x could share a one-hot group with.
candidates = collections.defaultdict(set)

for a, b in itertools.permutations(columns, 2):
    # Two columns that are both set on some row cannot come from the same
    # categorical variable.
    if (df[a] & df[b]).any():
        continue
    candidates[a].add(b)

combos = []
seen = set()

# Greedily grow one group per starting column. Columns already used as a
# starting point are skipped so each group only extends "forward".
for col in sorted(candidates):
    seen.add(col)
    combo = {col}

    for candidate in candidates:
        if candidate in seen:
            continue

        # A column joins the group only if it is mutually exclusive with
        # every column that is already a member.
        compatible = all(
            candidate in candidates[member] and member in candidates[candidate]
            for member in combo
        )
        if compatible:
            combo.add(candidate)

    combos.append(combo)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment