I compared 7 functions, which solely purpose was to figure out the indices of every uppercase letter.
I used timeit
and plot the results using pyplot.
I wrote 6 testcases:
TEST_CASES = (
"Hello World!",
"".join(random.choice(string.ascii_letters) for _ in range(50)),
"".join(random.choice(string.ascii_letters) for _ in range(100)),
"".join(random.choice(string.ascii_letters) for _ in range(200)),
"".join(random.choice(string.ascii_letters) for _ in range(500)),
"".join(random.choice(string.ascii_letters) for _ in range(1000))
)
and number of iterations used for timeit are 10000
These are the following functions:
def findIndexesA(string):
indexes = []
for i in range(len(string)):
if string[i].isupper():
indexes.append(i)
return indexes
def findIndexesB(string):
return [i for i, c in enumerate(string) if c.isupper()]
def findIndexesC(string):
return list(map(lambda e: e[0], filter(lambda e: e[1].isupper(), enumerate(string))))
def findIndexesD(string):
for i, c in enumerate(string):
if c.isupper():
yield i
def findIndexesE(string):
return list((i for i, c in enumerate(string) if c.isupper()))
def findIndexesF(string):
indexes = []
for i, c in enumerate(string):
if c.isupper():
indexes.append(i)
return indexes
def findIndexesG(string):
return [i for i in range(len(string)) if string[i].isupper()]
And I used the following code to figure out their duration:
import timeit
import string
import random
import matplotlib.pyplot as plt
from main import findIndexesA, findIndexesB, findIndexesC, findIndexesD, findIndexesE, findIndexesF, findIndexesG
def runTests(string, num=1000000):
A = timeit.timeit(lambda: findIndexesA(string), number=num)
B = timeit.timeit(lambda: findIndexesB(string), number=num)
C = timeit.timeit(lambda: findIndexesC(string), number=num)
D = timeit.timeit(lambda: list(findIndexesD(string)), number=num)
E = timeit.timeit(lambda: findIndexesE(string), number=num)
F = timeit.timeit(lambda: findIndexesF(string), number=num)
G = timeit.timeit(lambda: findIndexesG(string), number=num)
return sorted([
(A, "range-for-loop"),
(B, "list-comprehension-enumerate"),
(C, "filter-and-map-enumerate"),
(D, "for-loop-generator"),
(E, "one-line-generator"),
(F, "enumerate-for-loop"),
(G, "list-comprehension-range")
], key=lambda x: x[1])
TEST_CASES = (
"Hello World!",
"".join(random.choice(string.ascii_letters) for _ in range(50)),
"".join(random.choice(string.ascii_letters) for _ in range(100)),
"".join(random.choice(string.ascii_letters) for _ in range(200)),
"".join(random.choice(string.ascii_letters) for _ in range(500)),
"".join(random.choice(string.ascii_letters) for _ in range(1000))
)
for case in TEST_CASES:
data = runTests(case, num=10000)
times = [x[0] for x in data]
labels = [x[1] for x in data]
plt.figure(figsize=(17, 12))
plt.bar(range(len(times)), times, tick_label=labels)
plt.xlabel("Name of algorithm")
plt.ylabel("Time to run (seconds)")
plt.title(f"Length of string: {len(case)}")
plt.savefig(f"{len(case)}.png", format="png")
You may notice that findIndexesD
has been converted to a list. This is because python's generator are, as the name says, generators which generate results when needed. The duration highly depends on how you use the generator. In this case we used list
to have some kind of comparison to others.
The same applies to map
. Because map
(and prob. other built-in functions too) is lazy, I have to convert it using list
. This may change depending on your context.
These are the results:
filter-and-map-enumerate
or for-loop-generator
can be a winner depending on your context. Because both map
and generators
are lazy, I had to convert them using list
.
If you work with long data (> 500), you may want to choose between range-for-loop
or list-comprehension-range
.
For small amount of data, you can see for yourself, what you like.
This whole comparsion taught me something new about python: laziness in functions like map
. Will keep that in mind in the future :)