Last active
May 20, 2020 22:31
-
-
Save Mlawrence95/f47496184b69993645a4c112c57fb1b4 to your computer and use it in GitHub Desktop.
A Python decorator that adds a column to your pandas DataFrame: the MD5 hash of the specified column.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from functools import wraps
from hashlib import md5

import pandas as pd
def text_to_hash(text: str) -> str:
    """Return the MD5 hex digest of *text*.

    The text is encoded as UTF-8 before hashing, so the digest is stable
    for any given string.

    Args:
        text: The string to hash. It must provide ``.encode``; anything
            else raises ``AttributeError``.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.
    """
    return md5(text.encode("utf8")).hexdigest()
def add_hash(column_name: str = "document"):
    """
    Decorator factory. Wraps a function that returns a dataframe, which must
    have column_name in its columns.

    Adds an additional column to the returned df: "hashid", the MD5 hash of
    column_name (each value is passed through text_to_hash). The column is
    assigned onto the dataframe object the wrapped function returned, and
    that same object is returned to the caller.

    Args:
        column_name: Name of the column whose values are hashed.

    Returns:
        A decorator that takes the dataframe-producing function and returns
        a wrapper with the same call signature.
    """
    def apply_hash(df_fn):
        # wraps() keeps the decorated function's __name__, __doc__ and
        # __wrapped__, so introspection and debugging still see the original.
        @wraps(df_fn)
        def new_df_fn(*args, **kwargs):
            df = df_fn(*args, **kwargs)
            df.loc[:, "hashid"] = df.loc[:, column_name].apply(text_to_hash)
            return df
        return new_df_fn
    return apply_hash
# Test text_to_hash #
# Known-answer check: pins the output to the published MD5 digest of "hello",
# so a change of algorithm or encoding is caught (the checks below only
# compare the function against itself).
assert text_to_hash("hello") == "5d41402abc4b2a76b9719d911017c592"
assert text_to_hash("hello") == text_to_hash("hello")
assert text_to_hash("hey hows it going") != text_to_hash("hey hey hows it going")  # extra hey
assert text_to_hash("hey hows it going") != text_to_hash("hey how's it going")  # hows != how's
# Test add_hash decorator #
@add_hash(column_name="document")
def test_df_fn(text_list):
    """Build a one-column dataframe ("document") from *text_list*.

    Used as a fixture for checking the add_hash decorator applied above.
    """
    return pd.DataFrame({"document": text_list})
test_df = test_df_fn(["hi", "hello"])
assert "hashid" in test_df.columns
# The column must hold the hash of the document in the same row, not merely exist.
assert list(test_df["hashid"]) == [text_to_hash(doc) for doc in test_df["document"]]
# Bare expression kept from the original: presumably there to display the
# frame as the last line of a notebook cell; it has no effect in a script.
test_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment