Last active
July 5, 2023 04:28
-
-
Save hughdbrown/2f125de201ef70e75c8ca6af95834e8b to your computer and use it in GitHub Desktop.
Fast way to do lookup conversion in pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from pprint import pprint | |
from random import randint | |
from datetime import datetime | |
from uuid import uuid4 | |
import pandas as pd | |
# Number of ten-wide ranges placed in the sparse lookup built by main().
NUMS = 10_000
def time_decorator(func):
    """Wrap *func* so that every call prints how long it took.

    The returned wrapper forwards all positional and keyword arguments to
    *func*, prints one line with the function's name and the elapsed time
    in seconds, and returns whatever *func* returned.
    """
    # Imported here so the decorator does not rely on names missing from
    # the module's import section.
    from functools import wraps
    from time import perf_counter

    @wraps(func)  # keep func's __name__/__doc__ visible on the wrapper
    def wrapper_function(*args, **kwargs):
        # perf_counter() is a monotonic, high-resolution clock, so the
        # measurement cannot be skewed by wall-clock adjustments.
        start = perf_counter()
        result = func(*args, **kwargs)
        elapsed = perf_counter() - start
        # Print a plain number of seconds so the value matches its unit.
        print(f"{'-' * 30} {func.__name__!r}: {elapsed:.6f} seconds")
        return result

    return wrapper_function
@time_decorator
def create_raw_lookup(n):
    """Build the sparse lookup: ``n`` ranges of ten, each with a random UUID.

    Keys are inclusive ``(lower, upper)`` tuples -- (0, 9), (10, 19), ... --
    and each value is the string form of a freshly generated ``uuid4``.
    """
    sparse = {}
    # Step through the lower bound of each range directly.
    for lower in range(0, 10 * n, 10):
        sparse[(lower, lower + 9)] = str(uuid4())
    return sparse
@time_decorator
def create_expanded_lookup(raw_lookup):
    """Expand a range-keyed lookup into a dense ``{integer: value}`` dict.

    Every integer from ``lower`` to ``upper`` inclusive of each
    ``(lower, upper)`` key is mapped to that key's value.
    """
    dense = {}
    for bounds, label in raw_lookup.items():
        first, last = bounds
        # The upper bound is inclusive, hence the + 1.
        dense.update(dict.fromkeys(range(first, last + 1), label))
    return dense
@time_decorator
def create_df(num_rows=1_000_000, num_zips=100_000):
    """Return a DataFrame with a single ``zip`` column of random integers.

    Args:
        num_rows: Number of rows to generate.
        num_zips: Size of the value range; each value is drawn from
            ``0`` to ``num_zips - 1`` inclusive.

    Returns:
        A ``pandas.DataFrame`` whose only column is ``"zip"``.
    """
    # random.randint(x, y) creates values between x and y inclusive,
    # so we subtract 1 from the end of the range.
    highest = num_zips - 1
    # Build one column list rather than one dict per row: the resulting
    # frame is the same, without allocating a throwaway dict for every row.
    return pd.DataFrame(
        {"zip": [randint(0, highest) for _ in range(num_rows)]}
    )
@time_decorator
def apply_conversion(df, lookup):
    """Return a copy of *df* with a ``value`` column looked up from ``zip``.

    Each entry of the ``zip`` column is mapped through *lookup* with
    ``Series.map`` and the result is attached with ``DataFrame.assign``.
    """
    # Background on DataFrame.assign and method chaining:
    #   https://tomaugspurger.net/posts/method-chaining/
    # Background on combining Series.map with DataFrame.assign:
    #   https://www.sharpsightlabs.com/blog/pandas-assign/
    zip_codes = df.zip
    converted = zip_codes.map(lookup)
    return df.assign(value=converted)
def main():
    """Run the demo: build both lookups, make random data, and convert it."""
    # Sparse representation: one entry per range of ten.
    sparse = create_raw_lookup(NUMS)
    sparse_preview = list(sparse.items())[:5]
    pprint(sparse_preview)

    # Dense representation: one entry per individual integer.
    dense = create_expanded_lookup(sparse)
    dense_preview = list(dense.items())[:20]
    pprint(dense_preview)

    # DataFrame of random zipcodes.
    frame = create_df()

    # Add a column to the DataFrame using the dense representation.
    enriched = apply_conversion(frame, dense)
    print(enriched.describe())
    print(enriched.head())


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment