Skip to content

Instantly share code, notes, and snippets.

@echasnovski
Last active June 17, 2020 10:18
Show Gist options
  • Save echasnovski/2a130b0ac6d58b9200b44821970ebd85 to your computer and use it in GitHub Desktop.
Save echasnovski/2a130b0ac6d58b9200b44821970ebd85 to your computer and use it in GitHub Desktop.
Python snippets to mimic functionality of common R functions (mostly tidyverse)
import numpy as np
import pandas as pd
def nest(df, cols, nest_name="data", keep=False):
    """Nest non-grouping columns into a list-column of data frames

    Parameters
    ----------
    df : Data frame.
    cols : List of column names or a single column name (string).
        Columns specifying grouping.
    nest_name : String, optional.
        Name of list-column of data frames, by default "data".
    keep : Boolean, optional.
        Should the grouping columns be kept in the list column, by default
        `False`.

    Returns
    -------
    res : Data frame with nested columns.
        One row per group: the grouping columns followed by `nest_name`,
        whose entries are the per-group sub data frames.
    """
    # Accept a bare column name as a convenience; always group by a list so
    # that output columns can be built uniformly.
    group_cols = [cols] if isinstance(cols, str) else list(cols)

    keys = []
    frames = []
    # Iterate the groupby once instead of looking every key up again with
    # `get_group()`.
    for key, sub_df in df.groupby(group_cols):
        # Depending on pandas version, a length-1 grouping yields either a
        # scalar or a 1-tuple key; normalize to tuples.
        keys.append(key if isinstance(key, tuple) else (key,))
        frames.append(sub_df if keep else sub_df.drop(columns=group_cols))

    res = pd.DataFrame(keys, columns=group_cols)
    res[nest_name] = frames
    return res
def unnest(df, col):
    """Expand a list-column of data frames back into regular columns

    Parameters
    ----------
    df : Data frame with list-column of data frames.
        Preferably output of `nest()`.
    col : String.
        Name of column to unnest.

    Returns
    -------
    res : Data frame with unnested column.
        The remaining columns of `df` come first (each row repeated once per
        row of its nested data frame), followed by the columns of the nested
        data frames.
    """
    # Stack all nested data frames vertically and discard their row labels
    stacked = pd.concat(df[col].values, axis=0).reset_index(drop=True)

    # Every outer row has to appear as many times as its nested frame has rows
    repeats = [len(frame) for frame in df[col]]
    outer_rows = np.arange(len(df)).repeat(repeats)

    # Positions of all columns except the nested one
    outer_cols = np.flatnonzero(~df.columns.isin([col]))
    outer = df.iloc[outer_rows, outer_cols].reset_index(drop=True)

    return pd.concat([outer, stacked], axis=1)
def complete(df, cols, fill_val=None):
    """Complete a data frame with missing combinations of columns

    Parameters
    ----------
    df : Data frame.
    cols : List of strings or dictionary.
        If list, should contain column names, combinations of which should be
        added. If dictionary, keys represent column names and values - unique
        values of corresponding columns (if `None`, they will be inferred from
        columns with names stored in keys).
    fill_val : Value appropriate for `value` in `fillna()` DataFrame method.
        Values to use in other columns inside added combinations.

    Returns
    -------
    res : Data frame.
        Has the same columns (in the same order) as `df`, with one row per
        matched combination; rows are ordered by the combinations.

    Raises
    ------
    ValueError
        If `cols` is neither a list nor a dictionary.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"], "c": [False, True]})
    # Basic completion from values present in data frame
    >>> df1 = complete(df, ["a", "b"])
    # Order of columns determines the ordering of rows
    >>> df2 = complete(df, ["b", "a"])
    # Using dict to mimic list input
    >>> df3 = complete(df, {"a": None, "b": None})
    # Extended functionality with dict `cols`
    >>> df4 = complete(df, {"a": [3, 2, 1], "b": ["x", "y", "z"]})
    # Using `fill_val`
    >>> df5 = complete(df, {"b": ["x", "y", "z"]}, fill_val={"a": 0, "c": False})
    """
    if isinstance(cols, list):
        col_names = list(cols)
        given_vals = [None] * len(col_names)
    elif isinstance(cols, dict):
        col_names = list(cols.keys())
        given_vals = list(cols.values())
    else:
        # Fail early with a clear message instead of a later name error
        raise ValueError("`cols` should be list or dict.")

    # Explicitly supplied values are used as is (keeping their order);
    # otherwise sorted unique values present in `df` are used.
    unique_col_vals = [
        np.sort(df[name].unique()) if vals is None else vals
        for name, vals in zip(col_names, given_vals)
    ]

    comb = pd.MultiIndex.from_product(unique_col_vals, names=col_names)
    # Left merge on the shared (combination) columns keeps every combination
    # and leaves missing values where `df` has no matching row.
    res = pd.DataFrame(index=comb).reset_index().merge(df, how="left")

    if fill_val is not None:
        res = res.fillna(fill_val)

    return res[df.columns]
def corners(a):
    """Extract corners from ndarray

    Extract first and last elements (or the single element if axis has length
    one) along all dimensions. In other words, output elements are taken from
    `a` at positions where every index is "extreme" (first or last along its
    axis).

    Parameters
    ----------
    a : ndarray (or array-like)

    Returns
    -------
    corners : ndarray
        Has the same number of dimensions as `a`. Each axis has length 2 if
        the corresponding axis of `a` is longer than one, otherwise it has
        the same length as in `a` (1 or 0).

    Examples
    --------
    >>> a = np.arange(24).reshape((2, 3, 4))
    >>> corners(a)
    array([[[ 0,  3],
            [ 8, 11]],
    <BLANKLINE>
           [[12, 15],
            [20, 23]]])
    >>> corners(a[0])
    array([[ 0,  3],
           [ 8, 11]])
    >>> corners(a[0, 0])
    array([0, 3])
    >>> corners(a[[0], 0])
    array([[0, 3]])
    """
    a = np.asanyarray(a)
    if a.ndim == 0:
        # A zero-dimensional array is its own single corner
        return a.copy()

    # Extreme indices per axis: first and last for long axes, the only index
    # for length-one axes and no index at all for empty axes.
    corner_inds = [[0, -1] if d > 1 else [0] * d for d in a.shape]

    # `np.ix_` builds an open mesh, so indexing selects the full cross
    # product of the per-axis extreme indices.
    return a[np.ix_(*corner_inds)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment