Created
August 29, 2020 17:25
-
-
Save tk3369/f87d1de0f2dda8480a370c47c4d85f8b to your computer and use it in GitHub Desktop.
Sample implementation of `separate` function for DataFrames.jl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using DataFrames
using Test

# Tidyr match-up
"""
    FillStrategy

Supertype of the strategies that [`flexindex`](@ref) uses to pad a sequence which has
fewer elements than the number of requested slots.
"""
abstract type FillStrategy end

"""
    FillMissing()

Fill strategy: positions past the end of the sequence yield `missing`.
"""
struct FillMissing <: FillStrategy end

"""
    FillRight()

Fill strategy: positions past the end of the sequence repeat its last element.
"""
struct FillRight <: FillStrategy end

"""
    FillLeft()

Fill strategy: the leading positions repeat the first element of the sequence and the
original elements are shifted to the end.
"""
struct FillLeft <: FillStrategy end

"""
    flexindex(x, i::Integer, slots::Integer, fill = FillMissing())

Return the `i`-th element of `x` as if `x` had been padded to `slots` elements.

When `length(x) >= slots` no padding is needed: the result is simply `x[i]` and `fill`
is not consulted. Otherwise `i` must lie in `1:slots`, and the `slots - length(x)`
missing positions are filled according to `fill`:

- [`FillMissing`](@ref): positions past the end of `x` yield `missing`.
- [`FillRight`](@ref): positions past the end of `x` repeat its last element.
- [`FillLeft`](@ref): the leading positions repeat the first element of `x`; the
  elements of `x` occupy the trailing positions.

`x` is indexed with `1:length(x)`.

# Throws
- `BoundsError` if `i` is outside the (padded) range, or if `x` is empty and the
  strategy needs an element of `x` to repeat.
- `ArgumentError` if padding is needed and `fill` is not one of the strategies above.

# Examples
```jldoctest
julia> Tuple(flexindex([1, 2], i, 3, FillMissing()) for i in 1:3)
(1, 2, missing)

julia> Tuple(flexindex([1, 2], i, 3, FillRight()) for i in 1:3)
(1, 2, 2)

julia> Tuple(flexindex([1, 2], i, 3, FillLeft()) for i in 1:3)
(1, 1, 2)
```
"""
function flexindex(x, i::Integer, slots::Integer, fill = FillMissing())
    shortfall = slots - length(x)
    # Enough elements already: plain indexing, the fill strategy is irrelevant.
    shortfall > 0 || return x[i]
    # Reject out-of-range positions uniformly. Without this check `FillLeft` would
    # answer `x[1]` for `i <= 0`, because `i <= shortfall` holds for those values too.
    1 <= i <= slots || throw(BoundsError(x, i))
    return _padded(fill, x, i, shortfall)
end

# One method per strategy; `shortfall` is the number of positions that must be filled.
_padded(::FillMissing, x, i, shortfall) = i > length(x) ? missing : x[i]
_padded(::FillRight, x, i, shortfall) = i > length(x) ? x[length(x)] : x[i]
_padded(::FillLeft, x, i, shortfall) = i <= shortfall ? x[1] : x[i - shortfall]
# Anything else (including a `FillStrategy` subtype this file does not know about) is
# reported exactly as before.
function _padded(fill, x, i, shortfall)
    throw(ArgumentError("Invalid `fill` argument: $fill"))
end
# Checks for `flexindex`. They live in a test set (like the `separate` checks) so that a
# failure is recorded and summarised instead of aborting the script at the first miss.
@testset "flexindex" begin
    # Two elements requested as three slots: exactly one position has to be filled.
    let x = [1, 2], slots = 3
        @test flexindex(x, 1, slots, FillMissing()) == 1
        @test flexindex(x, 2, slots, FillMissing()) == 2
        @test flexindex(x, 3, slots, FillMissing()) |> ismissing

        @test flexindex(x, 1, slots, FillRight()) == 1
        @test flexindex(x, 2, slots, FillRight()) == 2
        @test flexindex(x, 3, slots, FillRight()) == 2

        @test flexindex(x, 1, slots, FillLeft()) == 1
        @test flexindex(x, 2, slots, FillLeft()) == 1
        @test flexindex(x, 3, slots, FillLeft()) == 2

        # `fill` defaults to `FillMissing()`.
        @test flexindex(x, 3, slots) |> ismissing

        # Padding with something that is not a known strategy is an error.
        @test_throws ArgumentError flexindex(x, 1, slots, :not_a_strategy)
    end

    # When `x` already has at least `slots` elements the strategy plays no role.
    let x = [1, 2, 3]
        @test flexindex(x, 3, 3, FillLeft()) == 3
        @test flexindex(x, 1, 2, FillRight()) == 1
    end
end
""" | |
separate | |
Separate a single column into multiple columns. | |
""" | |
function separate(df::AbstractDataFrame, | |
col::Union{String,Symbol}, | |
into::AbstractVector{T}; | |
splitter = (col -> split(col, r"[^[:alnum:]]+")), | |
fill = FillMissing(), | |
apply = identity, | |
remove = false) where {T <: Union{String,Symbol}} | |
slots = length(into) | |
zipped = zip([col for i in 1:slots], | |
into, | |
[c -> [flexindex(splitter(s), i, slots, fill) for s in c] for i in 1:slots]) | |
transforms = [s => (x -> apply.(f(x))) => t for (s, t, f) in zipped] | |
result_df = transform(df, transforms) | |
if remove | |
result_df = select(result_df, Not(col)) | |
end | |
return result_df | |
end | |
@testset "separate" begin
    df = DataFrame(y = ["a.b", "a.c", "b.c", "d"])

    # Default fill: the row with a single piece gets `missing` in the second column.
    let x = separate(df, :y, [:y1, :y2])
        @test x.y1 == ["a", "a", "b", "d"]
        @test x.y2[1:3] == ["b", "c", "c"] && ismissing(x.y2[4])
    end

    # FillRight repeats the last available piece instead.
    let x = separate(df, :y, [:y1, :y2]; fill = FillRight())
        @test x.y1 == ["a", "a", "b", "d"]
        @test x.y2 == ["b", "c", "c", "d"]
    end

    df = DataFrame(dt = ["2020-01-01", "2020-02-01", "2020-03-01"])

    # `apply` converts every piece; the source column is kept by default.
    let x = separate(df, :dt, [:year, :month, :day]; apply = (x -> parse(Int, x)))
        @test x.year == [2020, 2020, 2020]
        @test x.month == [1, 2, 3]
        # The day column was never checked; the year assertion was repeated in its place.
        @test x.day == [1, 1, 1]
        @test hasproperty(x, :dt)
    end

    # `remove = true` drops the source column.
    let x = separate(df, :dt, [:year, :month, :day]; remove = true)
        @test !hasproperty(x, :dt)
    end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment