Skip to content

Instantly share code, notes, and snippets.

@tk3369
Created August 29, 2020 17:25
Show Gist options
  • Save tk3369/f87d1de0f2dda8480a370c47c4d85f8b to your computer and use it in GitHub Desktop.
Save tk3369/f87d1de0f2dda8480a370c47c4d85f8b to your computer and use it in GitHub Desktop.
Sample implementation of `separate` function for DataFrames.jl
using DataFrames
using Test
# Julia counterpart of tidyr's `separate`
"""
    FillStrategy

Supertype of the strategies [`flexindex`](@ref) uses to produce a value for every
slot when a collection has fewer elements than slots.
"""
abstract type FillStrategy end

"Keep the elements in place and yield `missing` for the trailing, unmatched slots."
struct FillMissing <: FillStrategy end

"Keep the elements in place and repeat the last element for the trailing slots."
struct FillRight <: FillStrategy end

"Repeat the first element for the leading slots and shift the elements to the right."
struct FillLeft <: FillStrategy end

"""
    flexindex(x, i::Integer, slots::Integer, fill = FillMissing())

Return the value for slot `i` (1-based) out of `slots` slots backed by `x`.

When `x` has at least `slots` elements this is simply `x[i]`. Otherwise the
shortfall is covered according to `fill` (see [`FillStrategy`](@ref)):

- `FillMissing()`: slots past the end of `x` are `missing`.
- `FillRight()`: slots past the end of `x` repeat the last element of `x`.
- `FillLeft()`: the leading slots repeat the first element of `x` and the
  remaining slots hold the elements of `x` in order.

With `FillRight()` or `FillLeft()` an empty `x` has nothing to repeat, so
`missing` is returned.

`x` is indexed with 1-based positions.

# Throws
- `ArgumentError` if filling is required and `fill` is not one of the strategies above.

# Examples
```jldoctest
julia> flexindex([1, 2], 3, 3, FillMissing())
missing

julia> flexindex([1, 2], 3, 3, FillRight())
2

julia> flexindex([1, 2], 2, 3, FillLeft())
1
```
"""
function flexindex(x, i::Integer, slots::Integer, fill = FillMissing())
    n = length(x)
    shortfall = slots - n
    # Enough elements for every slot: `fill` is not consulted at all.
    shortfall > 0 || return x[i]
    return _fillindex(fill, x, i, n, shortfall)
end

# One method per strategy; `n` is `length(x)` and `shortfall` is `slots - n` (> 0).
_fillindex(::FillMissing, x, i, n, shortfall) = i > n ? missing : x[i]

function _fillindex(::FillRight, x, i, n, shortfall)
    n == 0 && return missing  # nothing to repeat
    return i > n ? x[n] : x[i]
end

function _fillindex(::FillLeft, x, i, n, shortfall)
    n == 0 && return missing  # nothing to repeat
    return i <= shortfall ? x[1] : x[i - shortfall]
end

# Anything that is not a known strategy is rejected with the same error as before.
function _fillindex(fill, x, i, n, shortfall)
    throw(ArgumentError("Invalid `fill` argument: $fill"))
end
# Grouped in a testset (like the `separate` tests) so that a failing assertion is
# reported together with the others instead of aborting at the first failure.
@testset "flexindex" begin
    x, slots = [1, 2], 3

    # FillMissing: the unmatched trailing slot is `missing`.
    @test flexindex(x, 1, slots, FillMissing()) == 1
    @test flexindex(x, 2, slots, FillMissing()) == 2
    @test ismissing(flexindex(x, 3, slots, FillMissing()))

    # FillRight: the unmatched trailing slot repeats the last element.
    @test flexindex(x, 1, slots, FillRight()) == 1
    @test flexindex(x, 2, slots, FillRight()) == 2
    @test flexindex(x, 3, slots, FillRight()) == 2

    # FillLeft: the leading slot repeats the first element.
    @test flexindex(x, 1, slots, FillLeft()) == 1
    @test flexindex(x, 2, slots, FillLeft()) == 1
    @test flexindex(x, 3, slots, FillLeft()) == 2

    # FillMissing is the default strategy.
    @test ismissing(flexindex(x, 3, slots))

    # No filling happens when `x` already covers every slot.
    @test flexindex([1, 2, 3], 3, 3, FillRight()) == 3
    @test flexindex([1, 2, 3], 2, 2) == 2

    # An unknown strategy is rejected once filling is actually needed.
    @test_throws ArgumentError flexindex(x, 1, slots, :unknown)
end
"""
separate
Separate a single column into multiple columns.
"""
function separate(df::AbstractDataFrame,
col::Union{String,Symbol},
into::AbstractVector{T};
splitter = (col -> split(col, r"[^[:alnum:]]+")),
fill = FillMissing(),
apply = identity,
remove = false) where {T <: Union{String,Symbol}}
slots = length(into)
zipped = zip([col for i in 1:slots],
into,
[c -> [flexindex(splitter(s), i, slots, fill) for s in c] for i in 1:slots])
transforms = [s => (x -> apply.(f(x))) => t for (s, t, f) in zipped]
result_df = transform(df, transforms)
if remove
result_df = select(result_df, Not(col))
end
return result_df
end
@testset "separate" begin
    # Ragged input: the last row has a single piece, so the second column is filled.
    ragged = DataFrame(y = ["a.b", "a.c", "b.c", "d"])
    let x = separate(ragged, :y, [:y1, :y2])
        @test x.y1 == ["a", "a", "b", "d"]
        @test isequal(x.y2, ["b", "c", "c", missing])
    end
    let x = separate(ragged, :y, [:y1, :y2]; fill = FillRight())
        @test x.y1 == ["a", "a", "b", "d"]
        @test x.y2 == ["b", "c", "c", "d"]
    end
    let x = separate(ragged, :y, [:y1, :y2]; fill = FillLeft())
        @test x.y1 == ["a", "a", "b", "d"]
        @test x.y2 == ["b", "c", "c", "d"]
    end

    # Regular input: every row yields exactly three pieces.
    dates = DataFrame(dt = ["2020-01-01", "2020-02-01", "2020-03-01"])
    let x = separate(dates, :dt, [:year, :month, :day]; apply = (x -> parse(Int, x)))
        @test x.year == [2020, 2020, 2020]
        @test x.month == [1, 2, 3]
        @test x.day == [1, 1, 1]
        # The source column is kept by default and the input is left untouched.
        @test hasproperty(x, :dt)
        @test !hasproperty(dates, :year)
    end
    let x = separate(dates, :dt, [:year, :month, :day]; remove = true)
        @test !hasproperty(x, :dt)
        @test hasproperty(x, :year)
        @test hasproperty(x, :month)
        @test hasproperty(x, :day)
    end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment