This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from hashlib import md5 | |
def text_to_hash(text: str) -> str:
    """Return the hexadecimal MD5 digest of ``text``.

    The string is encoded as UTF-8 before hashing, so equal strings always
    produce the same digest.

    Args:
        text: The string to hash.

    Returns:
        The MD5 digest as a 32-character lowercase hexadecimal string.
    """
    # NOTE(review): MD5 is not collision-resistant; presumably this is only
    # used as a content fingerprint, not for security -- confirm with callers.
    return md5(text.encode("utf8")).hexdigest()
def add_hash(column_name="document"): | |
""" | |
Decorator. Wraps a function that returns a dataframe, must have column_name in columns. | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# checked against python 3.7.3, pandas 0.24.2, s3fs 0.4.2 | |
import tarfile | |
import io | |
import s3fs | |
import pandas as pd | |
# Path of the gzipped tar archive in S3. A plain string literal is used:
# there are no placeholders, so an f-string prefix would do nothing.
tar_path = "s3://my-bucket/debug.tar.gz"
# Path of the metadata CSV inside of the tar file.
metadata_path = "debug/metadata.csv"
Older | Newer