Last active
March 17, 2023 17:56
-
-
Save hughdbrown/6d8bd9b5e7a6918751cc77180d50b272 to your computer and use it in GitHub Desktop.
Test of numpy.vectorized function versus pandas.apply
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Validation of code in Medium article: | |
# https://medium.com/the-modern-scientist/make-pandas-code-120x-faster-a-forbidden-mathematical-jutsu-87103030eb9c | |
import timeit | |
import pandas as pd | |
import numpy as np | |
# Row counts of the synthetic DataFrames that each benchmark loop iterates over.
sizes = [1_000, 10_000, 100_000, 1_000_000]
def generate_df(size):
    """Build a DataFrame with `size` rows of random synthetic person records.

    Columns, in order: 'age', 'avg_sleeping', 'gender', 'annual_income',
    'phone_number', 'favourite_food'. Values are drawn from NumPy's global
    random state, so call ``np.random.seed`` first for reproducible data.

    Args:
        size: Number of rows to generate.

    Returns:
        A new ``pandas.DataFrame`` with the six columns listed above.
    """
    df = pd.DataFrame()
    # randint's upper bound is exclusive in every call below.
    df['age'] = np.random.randint(1, 100, size)
    df['avg_sleeping'] = np.random.randint(1, 24, size)
    df['gender'] = np.random.choice(['Male', 'Female'], size)
    df['annual_income'] = np.random.randint(1000, 100000, size)
    # The bounds do not fit in a 32-bit integer. Request int64 explicitly so
    # the call does not fail where NumPy's default integer is 32-bit
    # (e.g. Windows with NumPy 1.x).
    df['phone_number'] = np.random.randint(
        1_111_111_111, 9_999_999_999, size, dtype=np.int64
    )
    df['favourite_food'] = np.random.choice(
        ['pizza', 'burger', 'chips', 'nachos'], size
    )
    return df
def reward_function(row):
    """Compute the reward for one record as a percentage of its annual income.

    The bonus percentage starts at 10 and grows by:
      * 10 when 'avg_sleeping' is at least 6 and 'annual_income' lies in
        [5000, 10000];
      * 20 (gender "Female") or 18 (any other gender) when 'age' lies in
        [60, 90].

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) indexable by the
            keys 'avg_sleeping', 'annual_income', 'age' and 'gender'.

    Returns:
        ``annual_income * (total_bonus / 100)``.
    """
    total_bonus = 10
    if (row['avg_sleeping'] >= 6) and (5000 <= row['annual_income'] <= 10000):
        total_bonus += 10
    if 60 <= row['age'] <= 90:
        total_bonus += 20 if (row["gender"] == "Female") else 18
    return row['annual_income'] * (total_bonus / 100)
def wrapper(func, *args, **kwargs):
    """Bind arguments to `func` and return a zero-argument callable.

    ``timeit.timeit`` invokes its statement without arguments, so the call
    to be timed has to be packaged up front.

    Args:
        func: The callable to invoke.
        *args: Positional arguments forwarded to `func` on every call.
        **kwargs: Keyword arguments forwarded to `func` on every call.

    Returns:
        A function taking no arguments that returns ``func(*args, **kwargs)``.
    """
    def wrapped():
        return func(*args, **kwargs)
    return wrapped
def apply_function(df):
    """Add a 'reward' column computed row by row with ``DataFrame.apply``.

    Args:
        df: DataFrame holding the columns that `reward_function` reads.

    Returns:
        The same DataFrame object; it is modified in place.
    """
    # axis=1 hands each row to reward_function as a Series.
    df['reward'] = df.apply(reward_function, axis=1)
    return df
def test_apply():
    """Time `apply_function` on a fresh random DataFrame of each size in `sizes`.

    For every size, prints the total seconds taken by 10 calls.
    """
    for size in sizes:
        df = generate_df(size)
        wrap = wrapper(apply_function, df)
        # timeit returns the cumulative time of all 10 runs, not the average.
        n = timeit.timeit(wrap, number=10)
        print(f'Size: {size} | Time: {n}')
def reward_function_part(avg_sleeping, annual_income, gender, age):
    """Compute the reward for one record from its individual scalar fields.

    Scalar counterpart of `reward_function`. The bonus percentage starts at
    10 and grows by:
      * 10 when `avg_sleeping` is at least 6 and `annual_income` lies in
        [5000, 10000];
      * 20 (gender "Female") or 18 (any other gender) when `age` lies in
        [60, 90].

    Args:
        avg_sleeping: Average hours of sleep.
        annual_income: Annual income; the reward is a percentage of it.
        gender: Gender label; only "Female" is treated specially.
        age: Age in years.

    Returns:
        ``annual_income * (total_bonus / 100)``.
    """
    total_bonus = 10
    if (avg_sleeping >= 6) and (5000 <= annual_income <= 10000):
        total_bonus += 10
    if 60 <= age <= 90:
        # Add to the running bonus rather than overwrite it, so the result
        # matches reward_function for the same record.
        total_bonus += 20 if gender == "Female" else 18
    return annual_income * (total_bonus / 100)
def vectorize_function(df):
    """Add a 'reward' column computed with ``np.vectorize``.

    Args:
        df: DataFrame with 'avg_sleeping', 'annual_income', 'gender' and
            'age' columns.

    Returns:
        The same DataFrame object; it is modified in place.
    """
    # np.vectorize calls reward_function_part once per element of the
    # columns, in the order given here.
    df["reward"] = np.vectorize(reward_function_part)(
        df['avg_sleeping'], df['annual_income'], df['gender'], df['age']
    )
    return df
def test_vectorize():
    """Time `vectorize_function` on a fresh random DataFrame of each size in `sizes`.

    For every size, prints the total seconds taken by 10 calls.
    """
    for size in sizes:
        df = generate_df(size)
        wrap = wrapper(vectorize_function, df)
        # timeit returns the cumulative time of all 10 runs, not the average.
        n = timeit.timeit(wrap, number=10)
        print(f'Size: {size} | Time: {n}')
if __name__ == '__main__':
    # Run the DataFrame.apply benchmark first, then the np.vectorize one.
    test_apply()
    test_vectorize()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The numbers I get with the fourth revision are:
apply
Size: 1000 | Time: 0.158108046
Size: 10000 | Time: 1.460299413
Size: 100000 | Time: 15.09842858
Size: 1000000 | Time: 152.397236201
vectorize
Size: 1000 | Time: 0.02020495899998309
Size: 10000 | Time: 0.1365422229999922
Size: 100000 | Time: 1.258238623000011
Size: 1000000 | Time: 12.476223318999985
7-13 times speedup