Last active
June 17, 2020 10:18
-
-
Save echasnovski/2a130b0ac6d58b9200b44821970ebd85 to your computer and use it in GitHub Desktop.
Python snippets to mimic functionality of common R functions (mostly tidyverse)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
def nest(df, cols, nest_name="data", keep=False):
    """Nest non-grouping columns into a list-column of data frames.

    Parameters
    ----------
    df : Data frame.
    cols : List of column names.
        Columns specifying grouping. A single column name given as a string
        is treated as a one-element list.
    nest_name : String, optional.
        Name of list-column of data frames, by default "data".
    keep : Boolean, optional.
        Should the grouping columns be kept in the list column, by default
        `False`.

    Returns
    -------
    res : Data frame with nested columns.
        One row per group: the grouping columns followed by the `nest_name`
        column, each cell of which holds the data frame of that group.
    """
    cols = [cols] if isinstance(cols, str) else list(cols)

    group_keys = []
    group_frames = []
    # Iterate over the groups instead of calling `get_group()` per key: for a
    # single grouping column the key may be a scalar or a 1-tuple depending on
    # the pandas version, so it is normalized to a tuple here.
    for key, sub_df in df.groupby(cols):
        group_keys.append(key if isinstance(key, tuple) else (key,))
        group_frames.append(sub_df if keep else sub_df.drop(columns=cols))

    res = pd.DataFrame(group_keys, columns=cols)

    # Fill an object array element by element so that every data frame is
    # stored as a single cell of the new column.
    nested = np.empty(len(group_frames), dtype=object)
    for i, frame in enumerate(group_frames):
        nested[i] = frame
    res[nest_name] = nested

    return res
def unnest(df, col):
    """Unnest previously nested column.

    Parameters
    ----------
    df : Data frame with list-column of data frames.
        Preferably output of `nest()`.
    col : String.
        Name of column to unnest.

    Returns
    -------
    res : Data frame with unnested column.
        The non-nested columns, with each row repeated once per row of its
        nested data frame, followed by the columns of the nested data frames.
        If `df` has no rows, only the non-nested columns are returned.
    """
    nested_frames = list(df[col])
    rest = df.loc[:, ~df.columns.isin([col])]

    # `pd.concat()` raises on an empty sequence, so there is nothing to
    # unnest for a data frame without rows
    if not nested_frames:
        return rest.reset_index(drop=True)

    # Concatenate by row data frames from nested column
    nest_part = pd.concat(nested_frames, axis=0).reset_index(drop=True)

    # Repeat rows of the rest part of data frame by the lengths of
    # corresponding unnested data frames
    sub_df_lens = [len(sub_df) for sub_df in nested_frames]
    row_inds = np.repeat(np.arange(len(df)), sub_df_lens)
    rest_part = rest.iloc[row_inds].reset_index(drop=True)

    return pd.concat([rest_part, nest_part], axis=1)
def complete(df, cols, fill_val=None):
    """Complete a data frame with missing combinations of columns.

    Parameters
    ----------
    df : Data frame.
    cols : List of strings or dictionary.
        If list, should contain column names, combinations of which should be
        added. If dictionary, keys represent column names and values - unique
        values of corresponding columns (if `None`, they will be inferred from
        columns with names stored in keys).
    fill_val : Value appropriate for `value` in `fillna()` DataFrame method.
        Values to use in other columns inside added combinations.

    Returns
    -------
    res : Data frame.
        Has the same columns (in the same order) as `df`, with one row per
        combination of values and rows ordered by those combinations.

    Raises
    ------
    ValueError
        If `cols` is neither a list nor a dictionary.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"], "c": [False, True]})

    # Basic completion from values present in data frame
    >>> df1 = complete(df, ["a", "b"])

    # Order of columns matters to ordering
    >>> df2 = complete(df, ["b", "a"])

    # Using dict to mimic list input
    >>> df3 = complete(df, {"a": None, "b": None})

    # Extended functionality with dict `cols`
    >>> df4 = complete(df, {"a": [3, 2, 1], "b": ["x", "y", "z"]})

    # Using `fill_val`
    >>> df5 = complete(df, {"b": ["x", "y", "z"]}, fill_val={"a": 0, "c": False})
    """

    def col_sort_unique(col, val=None):
        # Infer sorted unique values from `df` unless they are supplied
        if val is None:
            val = np.sort(df[col].unique())
        return val

    if isinstance(cols, list):
        col_names = cols
        unique_col_vals = [col_sort_unique(col) for col in cols]
    elif isinstance(cols, dict):
        col_names = list(cols)
        unique_col_vals = [col_sort_unique(col, val) for col, val in cols.items()]
    else:
        raise ValueError("`cols` should be list or dict.")

    # All combinations of the requested values, left-joined with the data so
    # that combinations absent from `df` get missing values in other columns
    comb = pd.MultiIndex.from_product(unique_col_vals, names=col_names)
    res = pd.DataFrame(index=comb).reset_index().merge(df, how="left")

    if fill_val is not None:
        res = res.fillna(fill_val)

    # Restore the original column order
    return res[df.columns]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def corners(a):
    """Extract corners from ndarray.

    Extract first and last elements (or single if axis has shape one) along all
    dimensions. In other words, output elements are taken from `a` at
    positions where the index along every axis is "extreme" (first or last).

    Parameters
    ----------
    a : ndarray

    Returns
    -------
    corners : ndarray
        Has the same number of dimensions as `a`. Every axis has length 2,
        except axes of length 0 or 1 in `a`, which keep their length.

    Examples
    --------
    >>> a = np.arange(24).reshape((2, 3, 4))
    >>> corners(a)
    array([[[ 0,  3],
            [ 8, 11]],
    <BLANKLINE>
           [[12, 15],
            [20, 23]]])
    >>> corners(a[0])
    array([[ 0,  3],
           [ 8, 11]])
    >>> corners(a[0, 0])
    array([0, 3])
    >>> corners(a[[0], 0])
    array([[0, 3]])
    """
    # Imported locally so the function works without a module-level import
    import numpy as np

    # Define extreme indices: first and last element of every axis; axes of
    # length 0 or 1 contribute all of their (zero or one) positions
    corner_inds = [[0, -1] if d > 1 else list(range(d)) for d in a.shape]

    # `np.ix_` builds an open mesh, so indexing with it selects every
    # combination of the extreme indices with one output axis per input axis
    return a[np.ix_(*corner_inds)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment