Last active
March 2, 2022 17:18
-
-
Save kwinkunks/d878bfcece739555b0c5dff04acc0a0a to your computer and use it in GitHub Desktop.
Fast, ordered unique items from sequences
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# We want to get unique items in a sequence, but to keep the order in which they appear. | |
# There are quite a few solutions here > http://www.peterbe.com/plog/uniqifiers-benchmark | |
# Good, up to date summary of methods > https://stackoverflow.com/a/17016257/3381305 | |
# Some test data: text...
# NOTE: no import of `np` / `pd` is visible elsewhere in this file, so they are
# imported here to keep the snippet runnable on its own.
import numpy as np
import pandas as pd

# 1200 short strings (a 12-item pattern repeated 100 times) as list, array, Series.
tdat = 100 * ['c', 'a', 'c', 'b', 'e', 'd', 'f', 'g', 'h', 'i', 'j', 'j']
tarr = np.array(tdat)
tser = pd.Series(tdat)

# ... and numbers.
# 1200 random integers drawn from [0, 10); unseeded, so values differ per run.
narr = np.random.randint(0, 10, size=1200)
ndat = narr.tolist()
nser = pd.Series(narr)
# For reference, a naive loop takes about 80 µs on my machine. | |
# Order preserving | |
# 🏆 This is the winner: ~18.5 µs on my data. | |
def unique_fast(seq):
    """Return the unique items of ``seq`` as a list, in first-seen order.

    The items are used as dictionary keys, so each one must be hashable;
    an unhashable item (e.g. a list) raises ``TypeError``. Ordering relies
    on ``dict`` preserving insertion order.

    >>> unique_fast(['c', 'a', 'c', 'b', 'a'])
    ['c', 'a', 'b']
    """
    return list(dict.fromkeys(seq))
# This is the most feature-rich. It is: | |
# - Fast (~21.5 µs on my data) | |
# - Lazy | |
# - Supports non-hashable items (eg list of lists) when given a key that maps them to hashable values (eg key=tuple)
# - Supports a key | |
from itertools import filterfalse | |
def unique_everseen(iterable, key=None):
    """Yield unique elements of ``iterable``, preserving first-seen order.

    Remembers every element (or key) ever seen. Adapted from
    https://docs.python.org/3/library/itertools.html#itertools-recipes

    This is a generator: elements are produced lazily as ``iterable`` is
    consumed.

    Args:
        iterable: Any iterable of elements.
        key: Optional one-argument callable. When given, uniqueness is
            decided on ``key(element)`` rather than on the element itself;
            the original element is what gets yielded.

    Yields:
        Each element whose value (or key) has not been seen before.

    Seen values are stored in a ``set``, so elements (or, when ``key`` is
    given, the key results) must be hashable.

    >>> list(unique_everseen('AAAABBBCCDAABBB'))
    ['A', 'B', 'C', 'D']
    >>> list(unique_everseen('ABBCcAD', str.lower))
    ['A', 'B', 'C', 'D']
    """
    seen = set()
    # Bind the method once so the loops below avoid repeated attribute lookups.
    seen_add = seen.add
    if key is None:
        # filterfalse performs the membership test without a Python-level `if`.
        for element in filterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element
# Order destroying | |
# Just use set(seq), it's about 4 times faster than np.unique(). |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment