Last active
December 22, 2022 08:49
-
-
Save f0k/2f8402e4dfb6974bfcf1 to your computer and use it in GitHub Desktop.
rolling median implementations benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Compares some algorithms for computing a sliding median of an array. | |
Results: | |
scipy.signal.medfilt2d is a bit faster than scipy.ndimage.filter.median_filter | |
and significantly faster than scipy.signal.medfilt. | |
Maintaining a sorted list of the window becomes faster than that for a filter | |
length on the order of 100 items or more. Using the builtin `list` | |
implementation for that is considerably faster than a `blist.blist`, | |
`blist.sortedlist` or `sortedcontainers.SortedList`. | |
In numbers (Python 2.7.6, numpy 1.8.2, scipy 0.13.3, i7-4770S CPU @ 3.10GHz): | |
Sequence length 100000, filter width 49: | |
scipy.signal.medfilt: 0.224 s | |
scipy.signal.medfilt2d: 0.0543 s | |
scipy.ndimage.filters.median_filter: 0.0588 s | |
using blist.blist: 0.0971 s | |
using list: 0.0774 s | |
using blist.sortedlist: 0.549 s | |
using sortedcontainers.SortedList: 0.284 s | |
Sequence length 100000, filter width 99: | |
scipy.signal.medfilt: 0.495 s | |
scipy.signal.medfilt2d: 0.103 s | |
scipy.ndimage.filters.median_filter: 0.108 s | |
using blist.blist: 0.105 s | |
using list: 0.0832 s | |
using blist.sortedlist: 0.595 s | |
using sortedcontainers.SortedList: 0.291 s | |
Sequence length 100000, filter width 599: | |
scipy.signal.medfilt: 3.85 s | |
scipy.signal.medfilt2d: 0.579 s | |
scipy.ndimage.filters.median_filter: 0.598 s | |
using blist.blist: 0.234 s | |
using list: 0.109 s | |
using blist.sortedlist: 0.894 s | |
using sortedcontainers.SortedList: 0.323 s | |
Sequence length 100000, filter width 9999: | |
scipy.signal.medfilt: 85 s | |
scipy.signal.medfilt2d: 8.87 s | |
scipy.ndimage.filters.median_filter: 10.1 s | |
using blist.blist: 0.612 s | |
using list: 0.372 s | |
using blist.sortedlist: 1.59 s | |
using sortedcontainers.SortedList: 0.505 s | |
Author: Jan Schlüter | |
""" | |
import sys | |
import os | |
import numpy as np | |
import timeit | |
NUM_REPEATS = 3 | |
NUM_ITER = 3 | |
N = 100000 | |
DATA = None | |
# Three scipy implementations | |
import scipy.signal | |
import scipy.ndimage.filters | |
def scipy_signal(data, width): | |
return scipy.signal.medfilt(data, width) | |
def scipy_signal2d(data, width): | |
return scipy.signal.medfilt2d(data.reshape(1, -1), (1, width))[0] | |
def scipy_ndimage(data, width): | |
return scipy.ndimage.filters.median_filter(data, width) | |
# An implementation using blist or list, with code adopted | |
# from http://code.activestate.com/recipes/576930/#c3 | |
# Note that this applies the median filter over the past values, | |
# while scipy applies it half to the past and half to the future. | |
# This is not critical to assess performance, though. | |
from collections import deque | |
from bisect import bisect_left, insort | |
from blist import blist | |
def custom_algorithm(data, width, listclass): | |
l = listclass(data[0].repeat(width)) | |
#l.sort() # not needed because all values are the same here | |
mididx = (width - 1) // 2 | |
result = np.empty_like(data) | |
for idx, new_elem in enumerate(data): | |
old_elem = data[max(0, idx - width)] | |
del l[bisect_left(l, old_elem)] | |
insort(l, new_elem) | |
result[idx] = l[mididx] | |
def using_blist(data, width): | |
return custom_algorithm(data, width, blist) | |
def using_list(data, width): | |
return custom_algorithm(data, width, list) | |
# A similar implementation using blist.sortedlist or sortedcontainers.SortedList | |
from blist import sortedlist as blist_sortedlist | |
from sortedcontainers import SortedList as sortcont_SortedList | |
def custom_algorithm2(data, width, sortedlistclass): | |
l = sortedlistclass(data[0].repeat(width)) | |
mididx = (width - 1) // 2 | |
result = np.empty_like(data) | |
for idx, new_elem in enumerate(data): | |
old_elem = data[max(0, idx - width)] | |
l.remove(old_elem) | |
l.add(new_elem) | |
result[idx] = l[mididx] | |
def using_blist_sortedlist(data, width): | |
return custom_algorithm2(data, width, blist_sortedlist) | |
def using_sortedcontainers(data, width): | |
return custom_algorithm2(data, width, sortcont_SortedList) | |
# The benchmark code | |
def run_benchmark(width): | |
print "Sequence length %d, filter width %d:" % (len(DATA), width) | |
print " scipy.signal.medfilt:", | |
sys.stdout.flush() | |
t = min(timeit.repeat( | |
setup="from __main__ import DATA, scipy_signal", | |
stmt="scipy_signal(DATA, %d)" % width, repeat=NUM_REPEATS, | |
number=NUM_ITER)) / float(NUM_ITER) | |
print "%.3g s" % t | |
print " scipy.signal.medfilt2d:", | |
sys.stdout.flush() | |
t = min(timeit.repeat( | |
setup="from __main__ import DATA, scipy_signal2d", | |
stmt="scipy_signal2d(DATA, %d)" % width, repeat=NUM_REPEATS, | |
number=NUM_ITER)) / float(NUM_ITER) | |
print "%.3g s" % t | |
print " scipy.ndimage.filters.median_filter:", | |
sys.stdout.flush() | |
t = min(timeit.repeat( | |
setup="from __main__ import DATA, scipy_ndimage", | |
stmt="scipy_ndimage(DATA, %d)" % width, repeat=NUM_REPEATS, | |
number=NUM_ITER)) / float(NUM_ITER) | |
print "%.3g s" % t | |
print " using blist.blist:", | |
sys.stdout.flush() | |
t = min(timeit.repeat( | |
setup="from __main__ import DATA, using_blist", | |
stmt="using_blist(DATA, %d)" % width, repeat=NUM_REPEATS, | |
number=NUM_ITER)) / float(NUM_ITER) | |
print "%.3g s" % t | |
print " using list:", | |
sys.stdout.flush() | |
t = min(timeit.repeat( | |
setup="from __main__ import DATA, using_list", | |
stmt="using_list(DATA, %d)" % width, repeat=NUM_REPEATS, | |
number=NUM_ITER)) / float(NUM_ITER) | |
print "%.3g s" % t | |
print " using blist.sortedlist:", | |
sys.stdout.flush() | |
t = min(timeit.repeat( | |
setup="from __main__ import DATA, using_blist_sortedlist", | |
stmt="using_blist_sortedlist(DATA, %d)" % width, repeat=NUM_REPEATS, | |
number=NUM_ITER)) / float(NUM_ITER) | |
print "%.3g s" % t | |
print " using sortedcontainers.SortedList:", | |
sys.stdout.flush() | |
t = min(timeit.repeat( | |
setup="from __main__ import DATA, using_sortedcontainers", | |
stmt="using_sortedcontainers(DATA, %d)" % width, repeat=NUM_REPEATS, | |
number=NUM_ITER)) / float(NUM_ITER) | |
print "%.3g s" % t | |
def main(): | |
global DATA | |
DATA = np.random.randn(N) | |
for width in 49, 99, 599, 9999: | |
run_benchmark(width) | |
if __name__=="__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Oh, only saw this now. Apparently github does not give notifications for gist comments, sorry.
Oops, that seems to have been a leftover. I'm targeting the case of having random access to the incoming data instead of a stream, so I can directly see which value left the window when moving on and don't have to remember the data in a queue or cyclic buffer.
That's exactly the value leaving the window! For the first
width
steps, it's always the first value of the array, that's what themax
is for.Good to know! That was outside my use case. I had window sizes between 50 and 200 and needed it to be fast (but not that fast that it was worth going into Cython or C).