Skip to content

Instantly share code, notes, and snippets.

@Mlawrence95
Last active May 20, 2020 22:31
Show Gist options
  • Save Mlawrence95/f47496184b69993645a4c112c57fb1b4 to your computer and use it in GitHub Desktop.
Save Mlawrence95/f47496184b69993645a4c112c57fb1b4 to your computer and use it in GitHub Desktop.
A Python decorator that adds a column to your pandas DataFrame: the MD5 hash of the specified column.
from functools import wraps
from hashlib import md5

import pandas as pd
def text_to_hash(text: str) -> str:
    """Return the MD5 hex digest of *text*.

    Args:
        text: The string to hash. It is encoded as UTF-8 before hashing.

    Returns:
        The hexadecimal MD5 digest of the encoded text.

    Raises:
        AttributeError: If *text* has no ``encode`` method (e.g. ``None``
            or a float such as NaN).
    """
    # MD5 operates on bytes, so the text is encoded first.
    return md5(text.encode("utf8")).hexdigest()
def add_hash(column_name="document"):
    """Build a decorator that adds an MD5 ``"hashid"`` column to a DataFrame.

    The decorated function must return a DataFrame that has *column_name*
    among its columns. The returned frame gains (or has overwritten) a
    ``"hashid"`` column holding ``text_to_hash`` of each value in
    *column_name*. The frame returned by the wrapped function is modified
    in place and then returned.

    Args:
        column_name: Name of the column whose values are hashed.

    Returns:
        A decorator to apply to a DataFrame-returning function.

    Raises:
        KeyError: When the decorated function is called and the DataFrame
            it returns has no *column_name* column.
    """

    def apply_hash(df_fn):
        # wraps() keeps the decorated function's __name__ and __doc__ so
        # introspection and tracebacks still point at the real function.
        @wraps(df_fn)
        def new_df_fn(*args, **kwargs):
            df = df_fn(*args, **kwargs)
            if column_name not in df.columns:
                raise KeyError(
                    f"add_hash: column {column_name!r} not found in the "
                    f"DataFrame returned by {df_fn.__name__}"
                )
            df["hashid"] = df[column_name].apply(text_to_hash)
            return df

        return new_df_fn

    return apply_hash
# Sanity checks for text_to_hash #
# Hashing the same text twice must collapse to a single digest.
assert len({text_to_hash("hello"), text_to_hash("hello")}) == 1
# Small edits (an extra "hey", an added apostrophe) must change the digest.
assert all(
    text_to_hash("hey hows it going") != text_to_hash(variant)
    for variant in ("hey hey hows it going", "hey how's it going")
)
# Test add_hash decorator #
@add_hash(column_name="document")
def test_df_fn(text_list):
    """Return a DataFrame with one "document" column built from *text_list*."""
    return pd.DataFrame({"document": text_list})


test_df = test_df_fn(["hi", "hello"])
assert "hashid" in test_df.columns
# The hash in each row must be the digest of that row's document.
assert list(test_df["hashid"]) == [text_to_hash("hi"), text_to_hash("hello")]
test_df  # notebook-style display of the result; has no effect in a script
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment