Skip to content

Instantly share code, notes, and snippets.

@simonbyrne
Last active August 29, 2015 14:11
Show Gist options
  • Save simonbyrne/3ebdcc6298b1661be19c to your computer and use it in GitHub Desktop.
Save simonbyrne/3ebdcc6298b1661be19c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"language": "Julia",
"name": "",
"signature": "sha256:1bf557764c9a9123ef164a88185357a19f3f240d35202789cd31f2f6ba0046bc"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Modified version of https://gist.github.com/tshort/9b872f0cd12760d9563d\n",
"\n",
"* Tuple approach uses `Field{:a}()` instead of `Field(:a)` (including in the length of the loop).\n",
"* Column `b` is changed to `Int`."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"module SimonDataFrame\n",
"\n",
"type DataFrame{N,D}\n",
" data::D\n",
"end\n",
"\n",
"immutable Field{s}\n",
"end\n",
"\n",
"Field(s::Symbol) = Field{s}()\n",
"\n",
"function DataFrame(;kwds...)\n",
" names = Any[]\n",
" data = Any[]\n",
" types = Any[]\n",
" for (n, d) in kwds\n",
" push!(names,n)\n",
" push!(data,d)\n",
" push!(types,typeof(d))\n",
" end\n",
" N = tuple(names...)\n",
" T = tuple(types...)\n",
" DataFrame{N,T}(tuple(data...))\n",
"end\n",
"\n",
"stagedfunction getindex{N,D,s}(d::DataFrame{N,D},f::Field{s})\n",
" m = Dict(zip(N,1:length(N)))\n",
" j = m[s]\n",
" :(d.data[$j])\n",
"end\n",
"stagedfunction getindex{N,D,s}(d::DataFrame{N,D},i::Integer,f::Field{s})\n",
" m = Dict(zip(N,1:length(N)))\n",
" j = m[s]\n",
" :(d.data[$j][i])\n",
"end\n",
"stagedfunction getindex{N,D}(d::DataFrame{N,D},i::Integer)\n",
" Expr(:tuple,[:(d.data[$j][i]) for j in 1:length(D)]...)\n",
"end\n",
"\n",
"\n",
"\n",
"getindex(d::DataFrame,s::Symbol) = d[Field(s)]\n",
"getindex(d::DataFrame,i::Int,s::Symbol) = d[i,Field(s)]\n",
"\n",
"srand(1)\n",
"const n = 5_000_000\n",
"a = rand(n)\n",
"b = round(Int,rand(n))\n",
"sdf = DataFrame(a = a, b = b)\n",
"function dot1(df::DataFrame)\n",
" x = 0.0\n",
" for i in 1:length(df[:a])\n",
" x += df[:a][i] * df[:a][i]\n",
" end\n",
" return x\n",
"end\n",
"\n",
"function dot2(df::DataFrame)\n",
" x = 0.0\n",
" for i in 1:length(df[:a])\n",
" x += df[i,:a] * df[i,:a]\n",
" end\n",
" return x\n",
"end\n",
"\n",
"function dot3(df::DataFrame)\n",
" x = 0.0\n",
" for i in 1:length(df[Field{:a}()])\n",
" x += df[Field{:a}()][i] * df[Field{:a}()][i]\n",
" end\n",
" return x\n",
"end\n",
"\n",
"function dot4(df::DataFrame)\n",
" x = 0.0\n",
" for i in 1:length(df[Field{:a}()])\n",
" x += df[i,Field{:a}()] * df[i,Field{:a}()]\n",
" end\n",
" return x\n",
"end\n",
"sdf[1,Field(:a)]\n",
"@show t1 = @elapsed dot1(sdf)\n",
"@show t2 = @elapsed dot2(sdf)\n",
"@show t3 = @elapsed dot3(sdf)\n",
"@show t4 = @elapsed dot4(sdf)\n",
"\n",
"end"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"Warning: replacing module SimonDataFrame\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"t1 = @elapsed dot1(sdf) = 3.73443803"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"t2 = @elapsed dot2(sdf) = "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"5.29104101\n",
"t3 = @elapsed dot3(sdf) = 0.026739018\n",
"t4 = @elapsed dot4(sdf) = 0.022498017\n"
]
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"module CompositeDataFramesTimings\n",
"\n",
"using DataArrays, DataFrames\n",
"using DataFramesMeta ## NOTE: MUST BE THE DEVELOPMENT VERSION\n",
"\n",
"srand(1)\n",
"const n = 5_000_000\n",
"a = rand(n)\n",
"b = round(Int,rand(n))\n",
"cdf = CompositeDataFrame(a = a, b = b)\n",
"df = DataFrame(cdf)\n",
"\n",
"function dot1(df::AbstractDataFrame)\n",
" x = 0.0\n",
" for i in 1:size(df, 1)\n",
" x += df[:a][i] * df[:a][i]\n",
" end\n",
" return x\n",
"end\n",
"\n",
"function dot2(df::AbstractDataFrame)\n",
" x = 0.0\n",
" for i in 1:size(df, 1)\n",
" x += df[i,:a] * df[i,:a]\n",
" end\n",
" return x\n",
"end\n",
"\n",
"@show c1 = @elapsed dot1(df)\n",
"@show c2 = @elapsed dot2(df)\n",
"@show c1c = @elapsed dot1(cdf)\n",
"@show c2c = @elapsed dot2(cdf)\n",
"end"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"Warning: replacing module CompositeDataFramesTimings\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"c1 = @elapsed dot1(df) = 2.254317254"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"c2 = @elapsed dot2(df) = "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"2.329794122\n",
"c1c = @elapsed dot1(cdf) = "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1.543457532\n",
"c2c = @elapsed dot2(cdf) = "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1.634219199\n"
]
}
],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
# Modified version of https://gist.github.com/tshort/9b872f0cd12760d9563d
# * The tuple approach uses `Field{:a}()` instead of `Field(:a)`, including inside the `length(...)` call that sets the loop bound.
# * Column `b` is changed to `Int`.
module SimonDataFrame

# Import `Base.getindex` so the methods defined below extend it. Without the
# import they would form a separate, module-local `getindex`; importing keeps
# `d[...]` on a DataFrame and the ordinary indexing of tuples, arrays and dicts
# used throughout this module on one and the same function.
import Base: getindex

# Column-oriented table whose layout is encoded in its type parameters:
#   N - tuple of the column names (Symbols)
#   D - tuple of the column types
# `data` holds the columns themselves, in the same order as the names in `N`.
type DataFrame{N,D}
    data::D
end

# Singleton tag carrying a column name `s` as a type parameter, so the column
# position can be worked out from types alone in the staged methods below.
immutable Field{s}
end

# Convenience constructor: `Field(:a)` builds `Field{:a}()`.
Field(s::Symbol) = Field{s}()

# Build a DataFrame from keyword arguments, one keyword per column
# (e.g. `DataFrame(a = avec, b = bvec)`). Column order follows keyword order.
function DataFrame(;kwds...)
    names = Any[]
    data = Any[]
    types = Any[]
    for (n, d) in kwds
        push!(names, n)
        push!(data, d)
        push!(types, typeof(d))
    end
    N = tuple(names...)
    T = tuple(types...)
    DataFrame{N,T}(tuple(data...))
end

# `d[Field{:a}()]` -> the whole column named `:a`.
# The body runs at code-generation time: it maps the name `s` to its position
# `j` in `N` and returns an expression with `j` spliced in as a literal.
# An `s` that is not in `N` raises a KeyError from the Dict lookup.
stagedfunction getindex{N,D,s}(d::DataFrame{N,D}, f::Field{s})
    m = Dict(zip(N, 1:length(N)))
    j = m[s]
    :(d.data[$j])
end

# `d[i, Field{:a}()]` -> element `i` of the column named `:a`.
stagedfunction getindex{N,D,s}(d::DataFrame{N,D}, i::Integer, f::Field{s})
    m = Dict(zip(N, 1:length(N)))
    j = m[s]
    :(d.data[$j][i])
end

# `d[i]` -> row `i` as a tuple holding one element from every column.
stagedfunction getindex{N,D}(d::DataFrame{N,D}, i::Integer)
    Expr(:tuple, [:(d.data[$j][i]) for j in 1:length(D)]...)
end

# Symbol-based access. The name is only known at run time here, so each call
# constructs a `Field` from the Symbol and dispatches on it.
getindex(d::DataFrame, s::Symbol) = d[Field(s)]
# `i::Integer` (previously `i::Int`) matches the `Field`-based methods above.
getindex(d::DataFrame, i::Integer, s::Symbol) = d[i, Field(s)]

# Benchmark data: fixed seed so every run sees the same values.
srand(1)
const n = 5_000_000
a = rand(n)
b = round(Int, rand(n))
sdf = DataFrame(a = a, b = b)

# Sum of squares of column `a`: fetch the column by Symbol, then index it.
# The repeated lookups are deliberate -- they are what is being timed.
function dot1(df::DataFrame)
    x = 0.0
    for i in 1:length(df[:a])
        x += df[:a][i] * df[:a][i]
    end
    return x
end

# Sum of squares of column `a`: element access by (row, Symbol).
function dot2(df::DataFrame)
    x = 0.0
    for i in 1:length(df[:a])
        x += df[i,:a] * df[i,:a]
    end
    return x
end

# Sum of squares of column `a`: fetch the column by `Field{:a}()`, then index.
function dot3(df::DataFrame)
    x = 0.0
    for i in 1:length(df[Field{:a}()])
        x += df[Field{:a}()][i] * df[Field{:a}()][i]
    end
    return x
end

# Sum of squares of column `a`: element access by (row, `Field{:a}()`).
function dot4(df::DataFrame)
    x = 0.0
    for i in 1:length(df[Field{:a}()])
        x += df[i,Field{:a}()] * df[i,Field{:a}()]
    end
    return x
end

# Warm-up: the first call of a function includes its compilation, which would
# otherwise be counted by `@elapsed` below. Run every benchmark once on a
# two-row frame built the same way as `sdf` (same column names and column
# types, hence the same DataFrame type) so the timed calls measure only the loops.
warmup = DataFrame(a = a[1:2], b = b[1:2])
dot1(warmup)
dot2(warmup)
dot3(warmup)
dot4(warmup)
sdf[1,Field(:a)]

@show t1 = @elapsed dot1(sdf)
@show t2 = @elapsed dot2(sdf)
@show t3 = @elapsed dot3(sdf)
@show t4 = @elapsed dot4(sdf)

end
# Companion benchmark: times the same two access patterns on a DataFrame and
# on a CompositeDataFrame.
module CompositeDataFramesTimings

using DataArrays, DataFrames
using DataFramesMeta ## NOTE: MUST BE THE DEVELOPMENT VERSION

# Benchmark data: fixed seed so every run sees the same values.
srand(1)
const n = 5_000_000
a = rand(n)
b = round(Int, rand(n))
# NOTE(review): `CompositeDataFrame` is not defined in this file; presumably it
# comes from the development version of DataFramesMeta -- confirm.
cdf = CompositeDataFrame(a = a, b = b)
df = DataFrame(cdf)

# Sum of squares of column `a`: fetch the column by Symbol, then index it.
# The repeated lookups are deliberate -- they are what is being timed.
function dot1(df::AbstractDataFrame)
    x = 0.0
    for i in 1:size(df, 1)
        x += df[:a][i] * df[:a][i]
    end
    return x
end

# Sum of squares of column `a`: element access by (row, Symbol).
function dot2(df::AbstractDataFrame)
    x = 0.0
    for i in 1:size(df, 1)
        x += df[i,:a] * df[i,:a]
    end
    return x
end

# Warm-up: the first call of a function for a given argument type includes its
# compilation, which would otherwise be counted by `@elapsed` below. The real
# inputs are used so the warmed-up argument types are exactly the ones that
# get timed; this roughly doubles the total run time of the script.
dot1(df)
dot2(df)
dot1(cdf)
dot2(cdf)

@show c1 = @elapsed dot1(df)
@show c2 = @elapsed dot2(df)
@show c1c = @elapsed dot1(cdf)
@show c2c = @elapsed dot2(cdf)

end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment