Created
October 21, 2012 01:06
-
-
Save tautologico/3925372 to your computer and use it in GitHub Desktop.
Grouping a dataframe by binning a numeric column
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Return the 1-based index of the bin that `x` falls into.
#
# `limits` holds the bin edges; with n limits there are n + 1 bins. The result
# is the position of the first limit that is strictly greater than `x`, so a
# value equal to a limit lands in the bin above that limit. If no limit is
# greater than `x` (including when `limits` is empty) the result is
# length(limits) + 1, the overflow bin.
#
# Arguments:
#   x      -- the value to classify; any Real (Float64, Int, ...)
#   limits -- the bin edges; any vector-like collection, including ranges.
#             The limits are expected in ascending order but this is not
#             checked: for unordered limits the first one greater than `x` wins.
#
# Note: a NaN compares false against every limit and therefore ends up in the
# overflow bin.
function find_bin(x::Real, limits::AbstractVector)
    # enumerate counts positions from 1 regardless of how `limits` is indexed,
    # which keeps the result consistent with length(limits) + 1 below.
    for (i, limit) in enumerate(limits)
        if x < limit
            return i
        end
    end
    return length(limits) + 1
end
# Spot checks for find_bin against one fixed set of limits: a value below the
# first limit goes to bin 1, a value equal to a limit goes to the bin above
# it, and anything at or beyond the last limit goes to the final bin.
let limits = [1.0, 2.3, 4.5, 6.7, 9.0]
    @assert find_bin(0.2, limits) == 1
    @assert find_bin(1.0, limits) == 2
    @assert find_bin(1.2, limits) == 2
    @assert find_bin(22.1, limits) == 6
    @assert find_bin(9.0, limits) == 6
    @assert find_bin(6.7, limits) == 5
end
# Map every element of `dv` to its bin index via find_bin and return the
# indices as a Vector{Int64} with one entry per element of `dv`.
#
# NOTE(review): elements are handed to find_bin unchecked; how a missing (NA)
# entry in `dv` behaves here is not visible in this file -- confirm.
function find_bins(dv::DataVec{Float64}, limits::Vector{Float64})
    n = length(dv)
    bins = zeros(Int64, n)
    for row = 1:n
        bins[row] = find_bin(dv[row], limits)
    end
    return bins
end
# Build a GroupedDataFrame from `df` by binning the values of the numeric
# column `col` against `limits`. Each row is assigned a bin with find_bins,
# and length(limits) + 1 is passed to groupsort_indexer as the number of bins.
#
# NOTE(review): groupsort_indexer and _uniqueofsorted are not defined in this
# file; presumably they yield a row ordering plus group boundary positions --
# confirm against the DataFrames version in use.
function group_bins{T}(df::AbstractDataFrame, col::T, limits::Vector{Float64})
    row_bins = find_bins(df[col], limits)
    (sort_idx, bounds) = groupsort_indexer(row_bins, length(limits) + 1)
    # Remove zero-length groupings
    bounds = _uniqueofsorted(bounds)
    group_starts = bounds[1:end-1]
    group_ends = [bounds[2:end] - 1]
    GroupedDataFrame(df, [col], sort_idx, group_starts, group_ends)
end
# Smoke test for group_bins: with limits 10, 20 and 30 the y values below
# cover all four bins (5.4, 8.2 / 10.3, 16.2 / 21.7, 24.5, 29.9 / 32.9),
# so the grouped result is asserted to have length 4.
df1 = DataFrame(quote
    x = [1:8]
    y = [5.4, 10.3, 21.7, 24.5, 16.2, 32.9, 8.2, 29.9]
end)
@assert length(group_bins(df1, "y", [10.0, 20.0, 30.0])) == 4
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is fantastic. This should be part of DataFrames or offered as a supplemental package. I've made a version where limits can also be of type ::StepRangeLen{Float64}, and it works with things like range(0.0, 0.5, 20) with no code changes. I went looking for this type of binning in DataFrames and came across this gist. Thanks for making this available.