Why isn't inplace=True
a good thing?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import s3fs | |
import pandas as pd | |
def get_csv_from_s3(folder_path): | |
dfs = [] | |
s3 = s3fs.S3FileSystem() | |
for fp in s3.ls(folder_path): | |
if '.csv' in fp: | |
with s3.open(fp) as s3f: | |
with bz2file.open(s3f) as f: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def compare_two_dfs(input_df_1, input_df_2): | |
df_1, df_2 = input_df_1.copy(), input_df_2.copy() | |
ne_stacked = (df_1 != df_2).stack() | |
changed = ne_stacked[ne_stacked] | |
changed.index.names = ['id', 'col'] | |
difference_locations = np.where(df_1 != df_2) | |
changed_from = df_1.values[difference_locations] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This function is extracted from this file: https://github.com/dask/dask/blob/master/dask/diagnostics/progress.py | |
def format_time(t): | |
"""Format seconds into a human readable form. | |
>>> format_time(10.4) | |
'10.4s' | |
>>> format_time(1000.4) | |
'16min 40.4s' | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inspired by: https://airflow.incubator.apache.org/_modules/airflow/models.html#BaseOperator | |
import pickle | |
import logging | |
from datetime import datetime | |
import traceback | |
def pickle_info(obj, session=None): | |
d = {} | |
d['is_picklable'] = True |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from functools import wraps | |
from logs import logger | |
# Two decorators to log the shape and dtypes of a DataFrame | |
# Inspired by: https://tomaugspurger.github.io/method-chaining | |
def log_shape(func): | |
@wraps(func) | |
def wrapper(*args, **kwargs): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.stats import lognorm | |
import numpy as np | |
def prepare_lognorm(mean, var): | |
# Formula from https://en.wikipedia.org/wiki/Log-normal_distribution | |
sigma = np.sqrt(np.log(1 + (float(var) / mean ** 2))) | |
mu = np.log(mean / np.sqrt(1 + (float(var) / mean ** 2))) | |
# Compute the scale for scipy |
A good technique to set configuration variables (from Keras): https://github.com/fchollet/keras/blob/master/keras/backend/common.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pytz | |
def localize_datetime(input_df, timezone, tms_col): | |
""" | |
Convert datetime column from UTC to another timezone. | |
""" | |
tmz = pytz.timezone(timezone) | |
df = input_df.copy() | |
return (df.set_index(tms_col) | |
.tz_localize(pytz.utc) # UTC time |
import sys
!{sys.executable} -m pip install <package>
import sys
!conda install --yes --prefix {sys.prefix} <package>