Last active
August 29, 2015 14:15
-
-
Save d1manson/c4dc7a901f63f0286883 to your computer and use it in GitHub Desktop.
numpy indexing/labeling tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def repeat_ind(n): | |
"""By example: | |
# 0 1 2 3 4 5 6 7 8 | |
n = [0, 0, 3, 0, 0, 2, 0, 2, 1] | |
res = [2, 2, 2, 5, 5, 7, 7, 8] | |
That is the input specifies how many times to repeat the given index. | |
It is equivalent to something like this : | |
hstack((zeros(n_i,dtype=int)+i for i, n_i in enumerate(n))) | |
But this version seems to be faster, and probably scales better, at | |
any rate it encapsulates a task in a function. | |
""" | |
if n.ndim != 1: | |
raise Exception("n is supposed to be 1d array.") | |
n_mask = n.astype(bool) | |
n_inds = np.nonzero(n_mask)[0] | |
n_inds[1:] = n_inds[1:]-n_inds[:-1] # take diff and leave 0th value in place | |
n_cumsum = np.empty(len(n)+1,dtype=int) | |
n_cumsum[0] = 0 | |
n_cumsum[1:] = np.cumsum(n) | |
ret = np.zeros(n_cumsum[-1],dtype=int) | |
ret[n_cumsum[n_mask]] = n_inds # note that n_mask is 1 element shorter than n_cumsum | |
return np.cumsum(ret) | |
def count_to(n): | |
"""By example: | |
# 0 1 2 3 4 5 6 7 8 | |
n = [0, 0, 3, 0, 0, 2, 0, 2, 1] | |
res = [0, 1, 2, 0, 1, 0, 1, 0] | |
That is it is equivalent to something like this : | |
hstack((arange(n_i) for n_i in n)) | |
This version seems quite a bit faster, at least for some | |
possible inputs, and at any rate it encapsulates a task | |
in a function. | |
""" | |
if n.ndim != 1: | |
raise Exception("n is supposed to be 1d array.") | |
n_mask = n.astype(bool) | |
n_cumsum = np.cumsum(n) | |
ret = np.ones(n_cumsum[-1]+1,dtype=int) | |
ret[n_cumsum[n_mask]] -= n[n_mask] | |
ret[0] -= 1 | |
return np.cumsum(ret)[:-1] | |
def label_mask_1D(M): | |
""" By example: | |
M = [F T T F F T F F F T T T] | |
result = [0 1 1 0 0 2 0 0 0 3 3 3] | |
M is boolean array, result is integer labels of contiguous True sections.""" | |
if M.ndim != 1: | |
raise Exception("this is for 1d masks only.") | |
is_start = np.empty(len(M),dtype=bool) | |
is_start[0] = M[0] | |
is_start[1:] = ~M[:-1] & M[1:] | |
L = np.cumsum(is_start) | |
L[~M] = 0 | |
return L | |
def labeled_cumsum(X,L,invalid=nan): | |
""" | |
By example: | |
X= [3 5 8 9 1 2 5 8 5 4 9 2] | |
L= [0 1 1 2 2 0 0 1 1 1 0 5] | |
result=[NaN 5 13 9 10 NaN NaN 8 13 17 NaN 2] | |
That is we calculate the cumsum for each section of `X` | |
where the sections are defined by contiguous blocks of | |
labels in `L`. Where `L==0`, the output is set to `invalid` | |
""" | |
L, X = L.ravel(), X.ravel() # TODO: be consistent with other functions in this module | |
if len(L) != len(X): | |
raise Exception('The two inputs should be vectors of the same length.') | |
# Do the full cumulative sum | |
X[np.isnan(X)] = 0 | |
S = np.cumsum(X) | |
mask = L.astype(bool) | |
# Lookup the cumulative value just before the start of each segment | |
isStart = mask.copy() | |
isStart[1:] &= (L[:-1] != L[1:]) | |
startInds, = isStart.nonzero() | |
S_starts = S[startInds-1] if startInds[0] != 0 else insert(S[startInds[1:]-1],0,0) | |
# Subtract off the excess values (i.e. the ones obtained above) | |
L_safe = np.cumsum(isStart) # we do this in case the labels in L were not sorted integers | |
S[mask] = S[mask] - S_starts[L_safe[mask]-1] | |
# Put NaNs in the false blocks | |
S[L==0] = invalid | |
return S | |
def duplicates_as_complex(x,already_sorted=False): | |
""" | |
By example: | |
x = [9.9 9.9 12.3 15.2 15.2 15.2 ] | |
ret=[9.9+0j 9.9+1j 12.3+0j 15.2+0j 15.2+1j 15.2+2j ] | |
That is we return `x` as a sorted complex number, where | |
the real part is simply the values in `x`, but the | |
imaginary part is a sub-index within a block of duplicates, | |
i.e. if value `v` occurs three times, it appears as | |
`v+0j, v+1j, v+2j`. | |
The advantage of this system is that it can be used in a | |
natural way with `np.sort` and `np.searchsorted`. | |
""" | |
if not already_sorted: | |
x = np.sort(x) | |
is_start = np.empty(len(x),dtype=bool) | |
is_start[0], is_start[1:] = True, x[:-1] != x[1:] | |
labels = np.cumsum(is_start)-1 | |
sub_idx = np.arange(len(x)) - is_start.nonzero()[0][labels] | |
return x + 1j*sub_idx |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment