oxinabox · May 14, 2018 15:36
diff --git a/demo.ipynb b/demo.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "using InternedStrings\n",
    "using SortingAlgorithms\n",
    "using DataArrays, DataFrames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "using BenchmarkTools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# These algorithms may not actually be correct.\n",
    "# They might have eg off-by-one indexing errors\n",
    "# But for performance evaluations they are close enough"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "make_data (generic function with 5 methods)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "function make_data(\n",
    "    nstrings = 100_000,\n",
    "    ninterned = 99_000,\n",
    "    maxlen = 50,\n",
    "    ncategories = 5)\n",
    "    \n",
    "    empty!(InternedStrings.pool) #for testing purposes\n",
    "\n",
    "    categories = [randstring(rand(1:maxlen)) for _ in 1:ncategories]\n",
    "    data = rand(categories, nstrings)\n",
    "    intern.(@view data[1:ninterned])\n",
    "    shuffle!(data)\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "current (generic function with 1 method)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "function current(data)\n",
    "    # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).   \n",
    "    dv = PooledDataArray(data)\n",
    "    ngroups = length(dv.pool)\n",
    "    x = copy!(similar(dv.refs, UInt32), dv.refs)\n",
    "\n",
    "\n",
    "    # count group sizes, location 0 for NA\n",
    "    n = length(x)\n",
    "    # counts = x.pool\n",
    "    counts = fill(0, ngroups + 1)\n",
    "    for i = 1:n\n",
    "        @inbounds counts[x[i] + 1] += 1\n",
    "    end\n",
    "\n",
    "    # mark the start of each contiguous group of like-indexed data\n",
    "    starts = fill(1, ngroups + 1)\n",
    "    for i = 2:ngroups+1\n",
    "        @inbounds starts[i] = starts[i - 1] + counts[i - 1]\n",
    "    end\n",
    "\n",
    "\n",
    "    # this is our indexer\n",
    "    idx = fill(0, n)\n",
    "    for i = 1:n\n",
    "        @inbounds label = x[i] + 1\n",
    "        @inbounds idx[starts[label]] = i\n",
    "        @inbounds starts[label] += 1\n",
    "    end\n",
    "    starts = DataFrames._uniqueofsorted(starts)\n",
    "    idx, starts\n",
    "end\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  14.418 ms (99530 allocations: 3.84 MiB)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "([3, 4, 8, 13, 16, 24, 30, 31, 35, 37  …  99948, 99951, 99959, 99960, 99972, 99983, 99985, 99991, 99992, 99993], [1, 20147, 40258, 60145, 79967, 100001])"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: both DataFrames and DataArrays export \"coefnames\"; uses of it in module Main must be qualified\n",
      "WARNING: both StatsBase and Compat export \"stderr\"; uses of it in module DataFrames must be qualified\n",
      "WARNING: both DataArrays and BenchmarkTools export \"trim\"; uses of it in module Main must be qualified\n"
     ]
    }
   ],
   "source": [
    "@btime current($(make_data()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Ptr{UInt8} @0x00007fa187591138, Ptr{UInt8} @0x00007fa187591178)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "extrema([pointer(\"a\"),pointer(\"b\")])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  4.657 ms (14 allocations: 4.43 MiB)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "([3, 6, 14, 18, 23, 24, 26, 33, 36, 41  …  99942, 99950, 99968, 99971, 99973, 99977, 99981, 99982, 99988, 99997], [1, 20009, 39669, 59770, 79884, 100001])"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "function sneaky_assume_interned(data, ngroups_=nothing)\n",
    "    # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).   \n",
    "    ptrs = Int.(pointer.(data))\n",
    "    \n",
    "    local ngroups\n",
    "    if ngroups_ isa Int\n",
    "        ngroups = ngroups_\n",
    "    else\n",
    "        ptr_min, ptr_max = extrema(ptrs)\n",
    "        ngroups = ptr_max - ptr_min + 1\n",
    "    end\n",
    "\n",
    "\n",
    "    # count group sizes, location 0 for NA\n",
    "    n = length(data)\n",
    "    counts = fill(0, ngroups + 1)\n",
    "    for i = 1:n\n",
    "        @inbounds lbl = ptrs[i]%ngroups\n",
    "        @inbounds counts[lbl + 1] += 1\n",
    "    end\n",
    "\n",
    "    # mark the start of each contiguous group of like-indexed data\n",
    "    starts = fill(1, ngroups + 1)\n",
    "    for i = 2:ngroups+1\n",
    "        @inbounds starts[i] = starts[i - 1] + counts[i - 1]\n",
    "    end\n",
    "\n",
    "\n",
    "    # this is our indexer\n",
    "    idx = fill(0, n)\n",
    "    for i = 1:n\n",
    "        @inbounds lbl = ptrs[i]%ngroups\n",
    "        @inbounds label = lbl + 1\n",
    "        @inbounds idx[starts[label]] = i\n",
    "        @inbounds starts[label] += 1\n",
    "    end\n",
    "    starts = DataFrames._uniqueofsorted(starts)\n",
    "    idx, starts\n",
    "end\n",
    "@btime sneaky_assume_interned($(intern(make_data())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  3.492 ms (11 allocations: 1.53 MiB)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "([1, 2, 3, 5, 6, 7, 8, 9, 10, 11  …  99945, 99946, 99949, 99953, 99955, 99956, 99967, 99978, 99983, 99988], [1, 59825, 79904, 100001])"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "@btime sneaky_assume_interned($(intern(make_data())), 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "fancy (generic function with 1 method)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SortingAlgorithms.uint_mapping(::Base.Order.ForwardOrdering, ptr::Ptr{UInt8}) = UInt64(ptr)\n",
    "\n",
    "function fancy(data::Vector{T}) where T\n",
    "    \n",
    "    # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).   \n",
    "    n = length(data)\n",
    "    \n",
    "    \n",
    "    ptrs = Int.(pointer.(data))\n",
    "    inds = sortperm(ptrs, alg=RadixSort)\n",
    "    \n",
    "    groups2inds = Dict{Ptr{UInt8}, Vector{Int}}()\n",
    "    \n",
    "    local cur_group_inds::Vector{Int}\n",
    "    prev_ptr = Ptr{UInt8}(0)\n",
    "    for i in 1:n\n",
    "        @inbounds ind = inds[i]\n",
    "        ptr = ptrs[ind]\n",
    "        if ptr != prev_ptr\n",
    "            @inbounds datum = data[ind]\n",
    "            iptr = pointer(intern(datum))\n",
    "            prev_ptr = iptr\n",
    "            # may or maynot have been added\n",
    "            cur_group_inds = get!(Vector{Int}, groups2inds, iptr)\n",
    "            push!(cur_group_inds, ind)\n",
    "            \n",
    "        else #this element is same as previous elemnt, which means this is interned and was added\n",
    "            prev_ptr = ptr\n",
    "            push!(cur_group_inds, ind)\n",
    "        end        \n",
    "    end\n",
    "    \n",
    "    ngroups = length(groups2inds)\n",
    "    starts = fill(0, ngroups)\n",
    "    idx = fill(0, n)\n",
    "    group_num = 0\n",
    "    item_num = 0\n",
    "    for group_inds in values(groups2inds)\n",
    "        group_num+=1\n",
    "        starts[group_num] = item_num+1\n",
    "        for ind in group_inds\n",
    "            item_num += 1\n",
    "            @inbounds idx[item_num] = ind\n",
    "        end\n",
    "    end\n",
    "    idx, starts\n",
    "end\n",
    "@btime fancy($(make_data()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  14.725 ms (298996 allocations: 6.93 MiB)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "([15, 20, 26, 36, 42, 56, 60, 62, 64, 65  …  99944, 99955, 99959, 99962, 99964, 99965, 99984, 99988, 99991, 99992], [20050, 40038, 59972, 80305, 1])"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SortingAlgorithms.uint_mapping(::Base.Order.ForwardOrdering, ptr::Ptr{UInt8}) = UInt64(ptr)\n",
    "\n",
    "function assume_interned(data::Vector{T}) where T\n",
    "    n = length(data)\n",
    "    \n",
    "    ptrs = Int.(pointer.(data))\n",
    "    inds = sortperm(ptrs, alg=RadixSort)\n",
    "    ngroups = min(length(InternedStrings.get_pool(T)), n)\n",
    "    starts = fill(1, ngroups)\n",
    "    \n",
    "    prev_ptr = Int(0) #will never occur as this is NULL pointer\n",
    "    group_num = 0\n",
    "    for i in 1:n\n",
    "        @inbounds ind = inds[i]\n",
    "        @inbounds ptr = ptrs[ind]\n",
    "        if ptr != prev_ptr\n",
    "            @inbounds starts[group_num] = i\n",
    "            group_num+=1\n",
    "        end\n",
    "        prev_ptr = ptr\n",
    "    end\n",
    "    inds, starts\n",
    "end\n",
    "@btime assume_interned($(intern(make_data())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  4.472 ms (16 allocations: 2.37 MiB)\n"
     ]
    }
   ],
   "source": [
    "function just_pointer_and_sort(data)       \n",
    "    ptrs = Int.(pointer.(data))\n",
    "    inds = sortperm(ptrs, alg=RadixSort)\n",
    "end\n",
    "@btime just_pointer_and_sort($(intern(make_data())));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  5.260 ms (19 allocations: 2.38 MiB)\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "just_intern_and_sort (generic function with 1 method)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "function just_intern_and_sort(data)\n",
    "    ptrs = pointer.(intern.(data))\n",
    "    inds = sortperm(ptrs, alg=RadixSort)\n",
    "end\n",
    "@btime just_intern_and_sort($(make_data()));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  33.186 ms (19 allocations: 2.38 MiB)\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "just_pool (generic function with 1 method)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "function just_pool(data)\n",
    "    dv = PooledDataArray(data)\n",
    "    ngroups = length(dv.pool)\n",
    "    x = copy!(similar(dv.refs, UInt32), dv.refs)\n",
    "end\n",
    "@btime just_pool($(make_data()));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  14.524 ms (99521 allocations: 3.07 MiB)\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.2",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"using InternedStrings\n",
	"using SortingAlgorithms\n",
	"using DataArrays, DataFrames"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"using BenchmarkTools"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 49,
	"metadata": {},
	"outputs": [],
	"source": [
	"# These algorithms may not actually be correct.\n",
	"# They might have eg off-by-one indexing errors\n",
	"# But for performance evaluations they are close enough"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"make_data (generic function with 5 methods)"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"function make_data(\n",
	" nstrings = 100_000,\n",
	" ninterned = 99_000,\n",
	" maxlen = 50,\n",
	" ncategories = 5)\n",
	" \n",
	" empty!(InternedStrings.pool) #for testing purposes\n",
	"\n",
	" categories = [randstring(rand(1:maxlen)) for _ in 1:ncategories]\n",
	" data = rand(categories, nstrings)\n",
	" intern.(@view data[1:ninterned])\n",
	" shuffle!(data)\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"current (generic function with 1 method)"
	]
	},
	"execution_count": 39,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"function current(data)\n",
	" # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). \n",
	" dv = PooledDataArray(data)\n",
	" ngroups = length(dv.pool)\n",
	" x = copy!(similar(dv.refs, UInt32), dv.refs)\n",
	"\n",
	"\n",
	" # count group sizes, location 0 for NA\n",
	" n = length(x)\n",
	" # counts = x.pool\n",
	" counts = fill(0, ngroups + 1)\n",
	" for i = 1:n\n",
	" @inbounds counts[x[i] + 1] += 1\n",
	" end\n",
	"\n",
	" # mark the start of each contiguous group of like-indexed data\n",
	" starts = fill(1, ngroups + 1)\n",
	" for i = 2:ngroups+1\n",
	" @inbounds starts[i] = starts[i - 1] + counts[i - 1]\n",
	" end\n",
	"\n",
	"\n",
	" # this is our indexer\n",
	" idx = fill(0, n)\n",
	" for i = 1:n\n",
	" @inbounds label = x[i] + 1\n",
	" @inbounds idx[starts[label]] = i\n",
	" @inbounds starts[label] += 1\n",
	" end\n",
	" starts = DataFrames._uniqueofsorted(starts)\n",
	" idx, starts\n",
	"end\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 14.418 ms (99530 allocations: 3.84 MiB)\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"([3, 4, 8, 13, 16, 24, 30, 31, 35, 37 … 99948, 99951, 99959, 99960, 99972, 99983, 99985, 99991, 99992, 99993], [1, 20147, 40258, 60145, 79967, 100001])"
	]
	},
	"execution_count": 40,
	"metadata": {},
	"output_type": "execute_result"
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"WARNING: both DataFrames and DataArrays export \"coefnames\"; uses of it in module Main must be qualified\n",
	"WARNING: both StatsBase and Compat export \"stderr\"; uses of it in module DataFrames must be qualified\n",
	"WARNING: both DataArrays and BenchmarkTools export \"trim\"; uses of it in module Main must be qualified\n"
	]
	}
	],
	"source": [
	"@btime current($(make_data()))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(Ptr{UInt8} @0x00007fa187591138, Ptr{UInt8} @0x00007fa187591178)"
	]
	},
	"execution_count": 32,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"extrema([pointer(\"a\"),pointer(\"b\")])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 4.657 ms (14 allocations: 4.43 MiB)\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"([3, 6, 14, 18, 23, 24, 26, 33, 36, 41 … 99942, 99950, 99968, 99971, 99973, 99977, 99981, 99982, 99988, 99997], [1, 20009, 39669, 59770, 79884, 100001])"
	]
	},
	"execution_count": 45,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"function sneaky_assume_interned(data, ngroups_=nothing)\n",
	" # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). \n",
	" ptrs = Int.(pointer.(data))\n",
	" \n",
	" local ngroups\n",
	" if ngroups_ isa Int\n",
	" ngroups = ngroups_\n",
	" else\n",
	" ptr_min, ptr_max = extrema(ptrs)\n",
	" ngroups = ptr_max - ptr_min + 1\n",
	" end\n",
	"\n",
	"\n",
	" # count group sizes, location 0 for NA\n",
	" n = length(data)\n",
	" counts = fill(0, ngroups + 1)\n",
	" for i = 1:n\n",
	" @inbounds lbl = ptrs[i]%ngroups\n",
	" @inbounds counts[lbl + 1] += 1\n",
	" end\n",
	"\n",
	" # mark the start of each contiguous group of like-indexed data\n",
	" starts = fill(1, ngroups + 1)\n",
	" for i = 2:ngroups+1\n",
	" @inbounds starts[i] = starts[i - 1] + counts[i - 1]\n",
	" end\n",
	"\n",
	"\n",
	" # this is our indexer\n",
	" idx = fill(0, n)\n",
	" for i = 1:n\n",
	" @inbounds lbl = ptrs[i]%ngroups\n",
	" @inbounds label = lbl + 1\n",
	" @inbounds idx[starts[label]] = i\n",
	" @inbounds starts[label] += 1\n",
	" end\n",
	" starts = DataFrames._uniqueofsorted(starts)\n",
	" idx, starts\n",
	"end\n",
	"@btime sneaky_assume_interned($(intern(make_data())))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 3.492 ms (11 allocations: 1.53 MiB)\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"([1, 2, 3, 5, 6, 7, 8, 9, 10, 11 … 99945, 99946, 99949, 99953, 99955, 99956, 99967, 99978, 99983, 99988], [1, 59825, 79904, 100001])"
	]
	},
	"execution_count": 46,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"@btime sneaky_assume_interned($(intern(make_data())), 10)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"fancy (generic function with 1 method)"
	]
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"SortingAlgorithms.uint_mapping(::Base.Order.ForwardOrdering, ptr::Ptr{UInt8}) = UInt64(ptr)\n",
	"\n",
	"function fancy(data::Vector{T}) where T\n",
	" \n",
	" # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). \n",
	" n = length(data)\n",
	" \n",
	" \n",
	" ptrs = Int.(pointer.(data))\n",
	" inds = sortperm(ptrs, alg=RadixSort)\n",
	" \n",
	" groups2inds = Dict{Ptr{UInt8}, Vector{Int}}()\n",
	" \n",
	" local cur_group_inds::Vector{Int}\n",
	" prev_ptr = Ptr{UInt8}(0)\n",
	" for i in 1:n\n",
	" @inbounds ind = inds[i]\n",
	" ptr = ptrs[ind]\n",
	" if ptr != prev_ptr\n",
	" @inbounds datum = data[ind]\n",
	" iptr = pointer(intern(datum))\n",
	" prev_ptr = iptr\n",
	" # may or maynot have been added\n",
	" cur_group_inds = get!(Vector{Int}, groups2inds, iptr)\n",
	" push!(cur_group_inds, ind)\n",
	" \n",
	" else #this element is same as previous elemnt, which means this is interned and was added\n",
	" prev_ptr = ptr\n",
	" push!(cur_group_inds, ind)\n",
	" end \n",
	" end\n",
	" \n",
	" ngroups = length(groups2inds)\n",
	" starts = fill(0, ngroups)\n",
	" idx = fill(0, n)\n",
	" group_num = 0\n",
	" item_num = 0\n",
	" for group_inds in values(groups2inds)\n",
	" group_num+=1\n",
	" starts[group_num] = item_num+1\n",
	" for ind in group_inds\n",
	" item_num += 1\n",
	" @inbounds idx[item_num] = ind\n",
	" end\n",
	" end\n",
	" idx, starts\n",
	"end\n",
	"@btime fancy($(make_data()))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 47,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 14.725 ms (298996 allocations: 6.93 MiB)\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"([15, 20, 26, 36, 42, 56, 60, 62, 64, 65 … 99944, 99955, 99959, 99962, 99964, 99965, 99984, 99988, 99991, 99992], [20050, 40038, 59972, 80305, 1])"
	]
	},
	"execution_count": 47,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"SortingAlgorithms.uint_mapping(::Base.Order.ForwardOrdering, ptr::Ptr{UInt8}) = UInt64(ptr)\n",
	"\n",
	"function assume_interned(data::Vector{T}) where T\n",
	" n = length(data)\n",
	" \n",
	" ptrs = Int.(pointer.(data))\n",
	" inds = sortperm(ptrs, alg=RadixSort)\n",
	" ngroups = min(length(InternedStrings.get_pool(T)), n)\n",
	" starts = fill(1, ngroups)\n",
	" \n",
	" prev_ptr = Int(0) #will never occur as this is NULL pointer\n",
	" group_num = 0\n",
	" for i in 1:n\n",
	" @inbounds ind = inds[i]\n",
	" @inbounds ptr = ptrs[ind]\n",
	" if ptr != prev_ptr\n",
	" @inbounds starts[group_num] = i\n",
	" group_num+=1\n",
	" end\n",
	" prev_ptr = ptr\n",
	" end\n",
	" inds, starts\n",
	"end\n",
	"@btime assume_interned($(intern(make_data())))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 48,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 4.472 ms (16 allocations: 2.37 MiB)\n"
	]
	}
	],
	"source": [
	"function just_pointer_and_sort(data) \n",
	" ptrs = Int.(pointer.(data))\n",
	" inds = sortperm(ptrs, alg=RadixSort)\n",
	"end\n",
	"@btime just_pointer_and_sort($(intern(make_data())));"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 5.260 ms (19 allocations: 2.38 MiB)\n"
	]
	}
	],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"just_intern_and_sort (generic function with 1 method)"
	]
	},
	"execution_count": 34,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"function just_intern_and_sort(data)\n",
	" ptrs = pointer.(intern.(data))\n",
	" inds = sortperm(ptrs, alg=RadixSort)\n",
	"end\n",
	"@btime just_intern_and_sort($(make_data()));"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 33.186 ms (19 allocations: 2.38 MiB)\n"
	]
	}
	],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"just_pool (generic function with 1 method)"
	]
	},
	"execution_count": 36,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"function just_pool(data)\n",
	" dv = PooledDataArray(data)\n",
	" ngroups = length(dv.pool)\n",
	" x = copy!(similar(dv.refs, UInt32), dv.refs)\n",
	"end\n",
	"@btime just_pool($(make_data()));"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 14.524 ms (99521 allocations: 3.07 MiB)\n"
	]
	}
	],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Julia 0.6.2",
	"language": "julia",
	"name": "julia-0.6"
	},
	"language_info": {
	"file_extension": ".jl",
	"mimetype": "application/julia",
	"name": "julia",
	"version": "0.6.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}