Skip to content

Instantly share code, notes, and snippets.

@oxinabox
Last active May 14, 2018 15:36
Show Gist options
  • Save oxinabox/30ec8347e5073fbc003e105db263d39e to your computer and use it in GitHub Desktop.
Save oxinabox/30ec8347e5073fbc003e105db263d39e to your computer and use it in GitHub Desktop.
Interned Strings and Grouping
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"using InternedStrings\n",
"using SortingAlgorithms\n",
"using DataArrays, DataFrames"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"using BenchmarkTools"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# These algorithms may not actually be correct.\n",
"# They might have eg off-by-one indexing errors\n",
"# But for performance evaluations they are close enough"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"make_data (generic function with 5 methods)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function make_data(\n",
" nstrings = 100_000,\n",
" ninterned = 99_000,\n",
" maxlen = 50,\n",
" ncategories = 5)\n",
" \n",
" empty!(InternedStrings.pool) #for testing purposes\n",
"\n",
" categories = [randstring(rand(1:maxlen)) for _ in 1:ncategories]\n",
" data = rand(categories, nstrings)\n",
" intern.(@view data[1:ninterned])\n",
" shuffle!(data)\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"current (generic function with 1 method)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function current(data)\n",
" # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). \n",
" dv = PooledDataArray(data)\n",
" ngroups = length(dv.pool)\n",
" x = copy!(similar(dv.refs, UInt32), dv.refs)\n",
"\n",
"\n",
" # count group sizes, location 0 for NA\n",
" n = length(x)\n",
" # counts = x.pool\n",
" counts = fill(0, ngroups + 1)\n",
" for i = 1:n\n",
" @inbounds counts[x[i] + 1] += 1\n",
" end\n",
"\n",
" # mark the start of each contiguous group of like-indexed data\n",
" starts = fill(1, ngroups + 1)\n",
" for i = 2:ngroups+1\n",
" @inbounds starts[i] = starts[i - 1] + counts[i - 1]\n",
" end\n",
"\n",
"\n",
" # this is our indexer\n",
" idx = fill(0, n)\n",
" for i = 1:n\n",
" @inbounds label = x[i] + 1\n",
" @inbounds idx[starts[label]] = i\n",
" @inbounds starts[label] += 1\n",
" end\n",
" starts = DataFrames._uniqueofsorted(starts)\n",
" idx, starts\n",
"end\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 14.418 ms (99530 allocations: 3.84 MiB)\n"
]
},
{
"data": {
"text/plain": [
"([3, 4, 8, 13, 16, 24, 30, 31, 35, 37 … 99948, 99951, 99959, 99960, 99972, 99983, 99985, 99991, 99992, 99993], [1, 20147, 40258, 60145, 79967, 100001])"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: both DataFrames and DataArrays export \"coefnames\"; uses of it in module Main must be qualified\n",
"WARNING: both StatsBase and Compat export \"stderr\"; uses of it in module DataFrames must be qualified\n",
"WARNING: both DataArrays and BenchmarkTools export \"trim\"; uses of it in module Main must be qualified\n"
]
}
],
"source": [
"@btime current($(make_data()))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(Ptr{UInt8} @0x00007fa187591138, Ptr{UInt8} @0x00007fa187591178)"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extrema([pointer(\"a\"),pointer(\"b\")])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 4.657 ms (14 allocations: 4.43 MiB)\n"
]
},
{
"data": {
"text/plain": [
"([3, 6, 14, 18, 23, 24, 26, 33, 36, 41 … 99942, 99950, 99968, 99971, 99973, 99977, 99981, 99982, 99988, 99997], [1, 20009, 39669, 59770, 79884, 100001])"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function sneaky_assume_interned(data, ngroups_=nothing)\n",
" # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). \n",
" ptrs = Int.(pointer.(data))\n",
" \n",
" local ngroups\n",
" if ngroups_ isa Int\n",
" ngroups = ngroups_\n",
" else\n",
" ptr_min, ptr_max = extrema(ptrs)\n",
" ngroups = ptr_max - ptr_min + 1\n",
" end\n",
"\n",
"\n",
" # count group sizes, location 0 for NA\n",
" n = length(data)\n",
" counts = fill(0, ngroups + 1)\n",
" for i = 1:n\n",
" @inbounds lbl = ptrs[i]%ngroups\n",
" @inbounds counts[lbl + 1] += 1\n",
" end\n",
"\n",
" # mark the start of each contiguous group of like-indexed data\n",
" starts = fill(1, ngroups + 1)\n",
" for i = 2:ngroups+1\n",
" @inbounds starts[i] = starts[i - 1] + counts[i - 1]\n",
" end\n",
"\n",
"\n",
" # this is our indexer\n",
" idx = fill(0, n)\n",
" for i = 1:n\n",
" @inbounds lbl = ptrs[i]%ngroups\n",
" @inbounds label = lbl + 1\n",
" @inbounds idx[starts[label]] = i\n",
" @inbounds starts[label] += 1\n",
" end\n",
" starts = DataFrames._uniqueofsorted(starts)\n",
" idx, starts\n",
"end\n",
"@btime sneaky_assume_interned($(intern(make_data())))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 3.492 ms (11 allocations: 1.53 MiB)\n"
]
},
{
"data": {
"text/plain": [
"([1, 2, 3, 5, 6, 7, 8, 9, 10, 11 … 99945, 99946, 99949, 99953, 99955, 99956, 99967, 99978, 99983, 99988], [1, 59825, 79904, 100001])"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@btime sneaky_assume_interned($(intern(make_data())), 10)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"fancy (generic function with 1 method)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"SortingAlgorithms.uint_mapping(::Base.Order.ForwardOrdering, ptr::Ptr{UInt8}) = UInt64(ptr)\n",
"\n",
"function fancy(data::Vector{T}) where T\n",
" \n",
" # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). \n",
" n = length(data)\n",
" \n",
" \n",
" ptrs = Int.(pointer.(data))\n",
" inds = sortperm(ptrs, alg=RadixSort)\n",
" \n",
" groups2inds = Dict{Ptr{UInt8}, Vector{Int}}()\n",
" \n",
" local cur_group_inds::Vector{Int}\n",
" prev_ptr = Ptr{UInt8}(0)\n",
" for i in 1:n\n",
" @inbounds ind = inds[i]\n",
" ptr = ptrs[ind]\n",
" if ptr != prev_ptr\n",
" @inbounds datum = data[ind]\n",
" iptr = pointer(intern(datum))\n",
" prev_ptr = iptr\n",
" # may or maynot have been added\n",
" cur_group_inds = get!(Vector{Int}, groups2inds, iptr)\n",
" push!(cur_group_inds, ind)\n",
" \n",
" else #this element is same as previous elemnt, which means this is interned and was added\n",
" prev_ptr = ptr\n",
" push!(cur_group_inds, ind)\n",
" end \n",
" end\n",
" \n",
" ngroups = length(groups2inds)\n",
" starts = fill(0, ngroups)\n",
" idx = fill(0, n)\n",
" group_num = 0\n",
" item_num = 0\n",
" for group_inds in values(groups2inds)\n",
" group_num+=1\n",
" starts[group_num] = item_num+1\n",
" for ind in group_inds\n",
" item_num += 1\n",
" @inbounds idx[item_num] = ind\n",
" end\n",
" end\n",
" idx, starts\n",
"end\n",
"@btime fancy($(make_data()))"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 14.725 ms (298996 allocations: 6.93 MiB)\n"
]
},
{
"data": {
"text/plain": [
"([15, 20, 26, 36, 42, 56, 60, 62, 64, 65 … 99944, 99955, 99959, 99962, 99964, 99965, 99984, 99988, 99991, 99992], [20050, 40038, 59972, 80305, 1])"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"SortingAlgorithms.uint_mapping(::Base.Order.ForwardOrdering, ptr::Ptr{UInt8}) = UInt64(ptr)\n",
"\n",
"function assume_interned(data::Vector{T}) where T\n",
" n = length(data)\n",
" \n",
" ptrs = Int.(pointer.(data))\n",
" inds = sortperm(ptrs, alg=RadixSort)\n",
" ngroups = min(length(InternedStrings.get_pool(T)), n)\n",
" starts = fill(1, ngroups)\n",
" \n",
" prev_ptr = Int(0) #will never occur as this is NULL pointer\n",
" group_num = 0\n",
" for i in 1:n\n",
" @inbounds ind = inds[i]\n",
" @inbounds ptr = ptrs[ind]\n",
" if ptr != prev_ptr\n",
" @inbounds starts[group_num] = i\n",
" group_num+=1\n",
" end\n",
" prev_ptr = ptr\n",
" end\n",
" inds, starts\n",
"end\n",
"@btime assume_interned($(intern(make_data())))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 4.472 ms (16 allocations: 2.37 MiB)\n"
]
}
],
"source": [
"function just_pointer_and_sort(data) \n",
" ptrs = Int.(pointer.(data))\n",
" inds = sortperm(ptrs, alg=RadixSort)\n",
"end\n",
"@btime just_pointer_and_sort($(intern(make_data())));"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 5.260 ms (19 allocations: 2.38 MiB)\n"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"just_intern_and_sort (generic function with 1 method)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function just_intern_and_sort(data)\n",
" ptrs = pointer.(intern.(data))\n",
" inds = sortperm(ptrs, alg=RadixSort)\n",
"end\n",
"@btime just_intern_and_sort($(make_data()));"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 33.186 ms (19 allocations: 2.38 MiB)\n"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"just_pool (generic function with 1 method)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function just_pool(data)\n",
" dv = PooledDataArray(data)\n",
" ngroups = length(dv.pool)\n",
" x = copy!(similar(dv.refs, UInt32), dv.refs)\n",
"end\n",
"@btime just_pool($(make_data()));"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 14.524 ms (99521 allocations: 3.07 MiB)\n"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 0.6.2",
"language": "julia",
"name": "julia-0.6"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "0.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment