Last active
May 14, 2018 15:36
-
-
Save oxinabox/30ec8347e5073fbc003e105db263d39e to your computer and use it in GitHub Desktop.
Interned Strings and Grouping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "using InternedStrings\n", | |
| "using SortingAlgorithms\n", | |
| "using DataArrays, DataFrames" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "using BenchmarkTools" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# These algorithms may not actually be correct.\n", | |
| "# They might have e.g. off-by-one indexing errors\n", | |
| "# But for performance evaluations they are close enough" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "make_data (generic function with 5 methods)" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "function make_data(\n", | |
| " nstrings = 100_000,\n", | |
| " ninterned = 99_000,\n", | |
| " maxlen = 50,\n", | |
| " ncategories = 5)\n", | |
| " \n", | |
| " empty!(InternedStrings.pool) #for testing purposes\n", | |
| "\n", | |
| " categories = [randstring(rand(1:maxlen)) for _ in 1:ncategories]\n", | |
| " data = rand(categories, nstrings)\n", | |
| " intern.(@view data[1:ninterned])\n", | |
| " shuffle!(data)\n", | |
| "end" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "current (generic function with 1 method)" | |
| ] | |
| }, | |
| "execution_count": 39, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "function current(data)\n", | |
| " # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). \n", | |
| " dv = PooledDataArray(data)\n", | |
| " ngroups = length(dv.pool)\n", | |
| " x = copy!(similar(dv.refs, UInt32), dv.refs)\n", | |
| "\n", | |
| "\n", | |
| " # count group sizes, location 0 for NA\n", | |
| " n = length(x)\n", | |
| " # counts = x.pool\n", | |
| " counts = fill(0, ngroups + 1)\n", | |
| " for i = 1:n\n", | |
| " @inbounds counts[x[i] + 1] += 1\n", | |
| " end\n", | |
| "\n", | |
| " # mark the start of each contiguous group of like-indexed data\n", | |
| " starts = fill(1, ngroups + 1)\n", | |
| " for i = 2:ngroups+1\n", | |
| " @inbounds starts[i] = starts[i - 1] + counts[i - 1]\n", | |
| " end\n", | |
| "\n", | |
| "\n", | |
| " # this is our indexer\n", | |
| " idx = fill(0, n)\n", | |
| " for i = 1:n\n", | |
| " @inbounds label = x[i] + 1\n", | |
| " @inbounds idx[starts[label]] = i\n", | |
| " @inbounds starts[label] += 1\n", | |
| " end\n", | |
| " starts = DataFrames._uniqueofsorted(starts)\n", | |
| " idx, starts\n", | |
| "end\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 40, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 14.418 ms (99530 allocations: 3.84 MiB)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "([3, 4, 8, 13, 16, 24, 30, 31, 35, 37 … 99948, 99951, 99959, 99960, 99972, 99983, 99985, 99991, 99992, 99993], [1, 20147, 40258, 60145, 79967, 100001])" | |
| ] | |
| }, | |
| "execution_count": 40, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "WARNING: both DataFrames and DataArrays export \"coefnames\"; uses of it in module Main must be qualified\n", | |
| "WARNING: both StatsBase and Compat export \"stderr\"; uses of it in module DataFrames must be qualified\n", | |
| "WARNING: both DataArrays and BenchmarkTools export \"trim\"; uses of it in module Main must be qualified\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "@btime current($(make_data()))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(Ptr{UInt8} @0x00007fa187591138, Ptr{UInt8} @0x00007fa187591178)" | |
| ] | |
| }, | |
| "execution_count": 32, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "extrema([pointer(\"a\"),pointer(\"b\")])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 4.657 ms (14 allocations: 4.43 MiB)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "([3, 6, 14, 18, 23, 24, 26, 33, 36, 41 … 99942, 99950, 99968, 99971, 99973, 99977, 99981, 99982, 99988, 99997], [1, 20009, 39669, 59770, 79884, 100001])" | |
| ] | |
| }, | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "function sneaky_assume_interned(data, ngroups_=nothing)\n", | |
| " # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). \n", | |
| " ptrs = Int.(pointer.(data))\n", | |
| " \n", | |
| " local ngroups\n", | |
| " if ngroups_ isa Int\n", | |
| " ngroups = ngroups_\n", | |
| " else\n", | |
| " ptr_min, ptr_max = extrema(ptrs)\n", | |
| " ngroups = ptr_max - ptr_min + 1\n", | |
| " end\n", | |
| "\n", | |
| "\n", | |
| " # count group sizes, location 0 for NA\n", | |
| " n = length(data)\n", | |
| " counts = fill(0, ngroups + 1)\n", | |
| " for i = 1:n\n", | |
| " @inbounds lbl = ptrs[i]%ngroups\n", | |
| " @inbounds counts[lbl + 1] += 1\n", | |
| " end\n", | |
| "\n", | |
| " # mark the start of each contiguous group of like-indexed data\n", | |
| " starts = fill(1, ngroups + 1)\n", | |
| " for i = 2:ngroups+1\n", | |
| " @inbounds starts[i] = starts[i - 1] + counts[i - 1]\n", | |
| " end\n", | |
| "\n", | |
| "\n", | |
| " # this is our indexer\n", | |
| " idx = fill(0, n)\n", | |
| " for i = 1:n\n", | |
| " @inbounds lbl = ptrs[i]%ngroups\n", | |
| " @inbounds label = lbl + 1\n", | |
| " @inbounds idx[starts[label]] = i\n", | |
| " @inbounds starts[label] += 1\n", | |
| " end\n", | |
| " starts = DataFrames._uniqueofsorted(starts)\n", | |
| " idx, starts\n", | |
| "end\n", | |
| "@btime sneaky_assume_interned($(intern(make_data())))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 46, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 3.492 ms (11 allocations: 1.53 MiB)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "([1, 2, 3, 5, 6, 7, 8, 9, 10, 11 … 99945, 99946, 99949, 99953, 99955, 99956, 99967, 99978, 99983, 99988], [1, 59825, 79904, 100001])" | |
| ] | |
| }, | |
| "execution_count": 46, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "@btime sneaky_assume_interned($(intern(make_data())), 10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "fancy (generic function with 1 method)" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "SortingAlgorithms.uint_mapping(::Base.Order.ForwardOrdering, ptr::Ptr{UInt8}) = UInt64(ptr)\n", | |
| "\n", | |
| "function fancy(data::Vector{T}) where T\n", | |
| " \n", | |
| " # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). \n", | |
| " n = length(data)\n", | |
| " \n", | |
| " \n", | |
| " ptrs = Int.(pointer.(data))\n", | |
| " inds = sortperm(ptrs, alg=RadixSort)\n", | |
| " \n", | |
| " groups2inds = Dict{Ptr{UInt8}, Vector{Int}}()\n", | |
| " \n", | |
| " local cur_group_inds::Vector{Int}\n", | |
| " prev_ptr = Ptr{UInt8}(0)\n", | |
| " for i in 1:n\n", | |
| " @inbounds ind = inds[i]\n", | |
| " ptr = ptrs[ind]\n", | |
| " if ptr != prev_ptr\n", | |
| " @inbounds datum = data[ind]\n", | |
| " iptr = pointer(intern(datum))\n", | |
| " prev_ptr = iptr\n", | |
| " # may or may not have been added\n", | |
| " cur_group_inds = get!(Vector{Int}, groups2inds, iptr)\n", | |
| " push!(cur_group_inds, ind)\n", | |
| " \n", | |
| " else #this element is same as previous element, which means this is interned and was added\n", | |
| " prev_ptr = ptr\n", | |
| " push!(cur_group_inds, ind)\n", | |
| " end \n", | |
| " end\n", | |
| " \n", | |
| " ngroups = length(groups2inds)\n", | |
| " starts = fill(0, ngroups)\n", | |
| " idx = fill(0, n)\n", | |
| " group_num = 0\n", | |
| " item_num = 0\n", | |
| " for group_inds in values(groups2inds)\n", | |
| " group_num+=1\n", | |
| " starts[group_num] = item_num+1\n", | |
| " for ind in group_inds\n", | |
| " item_num += 1\n", | |
| " @inbounds idx[item_num] = ind\n", | |
| " end\n", | |
| " end\n", | |
| " idx, starts\n", | |
| "end\n", | |
| "@btime fancy($(make_data()))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 47, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 14.725 ms (298996 allocations: 6.93 MiB)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "([15, 20, 26, 36, 42, 56, 60, 62, 64, 65 … 99944, 99955, 99959, 99962, 99964, 99965, 99984, 99988, 99991, 99992], [20050, 40038, 59972, 80305, 1])" | |
| ] | |
| }, | |
| "execution_count": 47, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "SortingAlgorithms.uint_mapping(::Base.Order.ForwardOrdering, ptr::Ptr{UInt8}) = UInt64(ptr)\n", | |
| "\n", | |
| "function assume_interned(data::Vector{T}) where T\n", | |
| " n = length(data)\n", | |
| " \n", | |
| " ptrs = Int.(pointer.(data))\n", | |
| " inds = sortperm(ptrs, alg=RadixSort)\n", | |
| " ngroups = min(length(InternedStrings.get_pool(T)), n)\n", | |
| " starts = fill(1, ngroups)\n", | |
| " \n", | |
| " prev_ptr = Int(0) #will never occur as this is NULL pointer\n", | |
| " group_num = 0\n", | |
| " for i in 1:n\n", | |
| " @inbounds ind = inds[i]\n", | |
| " @inbounds ptr = ptrs[ind]\n", | |
| " if ptr != prev_ptr\n", | |
| " @inbounds starts[group_num] = i\n", | |
| " group_num+=1\n", | |
| " end\n", | |
| " prev_ptr = ptr\n", | |
| " end\n", | |
| " inds, starts\n", | |
| "end\n", | |
| "@btime assume_interned($(intern(make_data())))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 48, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 4.472 ms (16 allocations: 2.37 MiB)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "function just_pointer_and_sort(data) \n", | |
| " ptrs = Int.(pointer.(data))\n", | |
| " inds = sortperm(ptrs, alg=RadixSort)\n", | |
| "end\n", | |
| "@btime just_pointer_and_sort($(intern(make_data())));" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 5.260 ms (19 allocations: 2.38 MiB)\n" | |
| ] | |
| } | |
| ], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "just_intern_and_sort (generic function with 1 method)" | |
| ] | |
| }, | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "function just_intern_and_sort(data)\n", | |
| " ptrs = pointer.(intern.(data))\n", | |
| " inds = sortperm(ptrs, alg=RadixSort)\n", | |
| "end\n", | |
| "@btime just_intern_and_sort($(make_data()));" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 35, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 33.186 ms (19 allocations: 2.38 MiB)\n" | |
| ] | |
| } | |
| ], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "just_pool (generic function with 1 method)" | |
| ] | |
| }, | |
| "execution_count": 36, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "function just_pool(data)\n", | |
| " dv = PooledDataArray(data)\n", | |
| " ngroups = length(dv.pool)\n", | |
| " x = copy!(similar(dv.refs, UInt32), dv.refs)\n", | |
| "end\n", | |
| "@btime just_pool($(make_data()));" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 14.524 ms (99521 allocations: 3.07 MiB)\n" | |
| ] | |
| } | |
| ], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Julia 0.6.2", | |
| "language": "julia", | |
| "name": "julia-0.6" | |
| }, | |
| "language_info": { | |
| "file_extension": ".jl", | |
| "mimetype": "application/julia", | |
| "name": "julia", | |
| "version": "0.6.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment