Created
January 9, 2018 00:25
-
-
Save tkf/8a7ce6177ba16db2514d0f4652a03b71 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Julia Version 0.6.0\n", | |
"Commit 903644385b* (2017-06-19 13:05 UTC)\n", | |
"Platform Info:\n", | |
" OS: Linux (x86_64-pc-linux-gnu)\n", | |
" CPU: Intel(R) Core(TM) i7-4500U CPU @ 1.80GHz\n", | |
" WORD_SIZE: 64\n", | |
" BLAS: libblas\n", | |
" LAPACK: liblapack\n", | |
" LIBM: libm\n", | |
" LLVM: libLLVM-4.0.0 (ORCJIT, haswell)\n" | |
] | |
} | |
], | |
"source": [ | |
"versioninfo()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Base.JLOptions().check_bounds # 0: default (respect @inbounds); 1: --check-bounds=yes (always check); 2: --check-bounds=no (never check)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"using BenchmarkTools" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"x = ones(100000)\n", | |
"y = ones(100000);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dot_fastmath_inbounds (generic function with 1 method)" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"function dot_vanilla(a, b)\n", | |
" s = zero(promote_type(eltype(a), eltype(b)))\n", | |
" for i in 1:endof(a)\n", | |
" s += a[i] * b[i]\n", | |
" end\n", | |
" return s\n", | |
"end\n", | |
"\n", | |
"function dot_only_inbounds(a, b)\n", | |
" s = zero(promote_type(eltype(a), eltype(b)))\n", | |
" for i in 1:endof(a)\n", | |
" @inbounds s += a[i] * b[i]\n", | |
" end\n", | |
" return s\n", | |
"end\n", | |
"\n", | |
"function dot_only_fastmath(a, b)\n", | |
" s = zero(promote_type(eltype(a), eltype(b)))\n", | |
" for i in 1:endof(a)\n", | |
" @fastmath s += a[i] * b[i]\n", | |
" end\n", | |
" return s\n", | |
"end\n", | |
"\n", | |
"function dot_fastmath_inbounds(a, b)\n", | |
" s = zero(promote_type(eltype(a), eltype(b)))\n", | |
" for i in 1:endof(a)\n", | |
" @fastmath @inbounds s += a[i] * b[i]\n", | |
" end\n", | |
" return s\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"BenchmarkTools.Trial: \n", | |
" memory estimate: 16 bytes\n", | |
" allocs estimate: 1\n", | |
" --------------\n", | |
" minimum time: 100.339 μs (0.00% GC)\n", | |
" median time: 100.353 μs (0.00% GC)\n", | |
" mean time: 102.806 μs (0.00% GC)\n", | |
" maximum time: 187.489 μs (0.00% GC)\n", | |
" --------------\n", | |
" samples: 10000\n", | |
" evals/sample: 1" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"@benchmark dot_vanilla(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"BenchmarkTools.Trial: \n", | |
" memory estimate: 16 bytes\n", | |
" allocs estimate: 1\n", | |
" --------------\n", | |
" minimum time: 100.320 μs (0.00% GC)\n", | |
" median time: 100.334 μs (0.00% GC)\n", | |
" mean time: 102.757 μs (0.00% GC)\n", | |
" maximum time: 215.776 μs (0.00% GC)\n", | |
" --------------\n", | |
" samples: 10000\n", | |
" evals/sample: 1" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"@benchmark dot_only_inbounds(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"BenchmarkTools.Trial: \n", | |
" memory estimate: 16 bytes\n", | |
" allocs estimate: 1\n", | |
" --------------\n", | |
" minimum time: 100.348 μs (0.00% GC)\n", | |
" median time: 100.368 μs (0.00% GC)\n", | |
" mean time: 102.633 μs (0.00% GC)\n", | |
" maximum time: 154.923 μs (0.00% GC)\n", | |
" --------------\n", | |
" samples: 10000\n", | |
" evals/sample: 1" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"@benchmark dot_only_fastmath(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"BenchmarkTools.Trial: \n", | |
" memory estimate: 16 bytes\n", | |
" allocs estimate: 1\n", | |
" --------------\n", | |
" minimum time: 45.539 μs (0.00% GC)\n", | |
" median time: 45.848 μs (0.00% GC)\n", | |
" mean time: 47.630 μs (0.00% GC)\n", | |
" maximum time: 226.709 μs (0.00% GC)\n", | |
" --------------\n", | |
" samples: 10000\n", | |
" evals/sample: 1" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"@benchmark dot_fastmath_inbounds(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dot_simd_inbounds (generic function with 1 method)" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"function dot_simd_inbounds(a, b)\n", | |
" s = zero(promote_type(eltype(a), eltype(b)))\n", | |
" @simd for i in 1:endof(a)\n", | |
" @inbounds s += a[i] * b[i]\n", | |
" end\n", | |
" return s\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"BenchmarkTools.Trial: \n", | |
" memory estimate: 16 bytes\n", | |
" allocs estimate: 1\n", | |
" --------------\n", | |
" minimum time: 45.520 μs (0.00% GC)\n", | |
" median time: 45.876 μs (0.00% GC)\n", | |
" mean time: 49.059 μs (0.00% GC)\n", | |
" maximum time: 685.778 μs (0.00% GC)\n", | |
" --------------\n", | |
" samples: 10000\n", | |
" evals/sample: 1" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"@benchmark dot_simd_inbounds(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dot_simd_fastmath_inbounds (generic function with 1 method)" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"function dot_simd_fastmath_inbounds(a, b)\n", | |
" s = zero(promote_type(eltype(a), eltype(b)))\n", | |
" @simd for i in 1:endof(a)\n", | |
" @fastmath @inbounds s += a[i] * b[i]\n", | |
" end\n", | |
" return s\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"BenchmarkTools.Trial: \n", | |
" memory estimate: 16 bytes\n", | |
" allocs estimate: 1\n", | |
" --------------\n", | |
" minimum time: 45.571 μs (0.00% GC)\n", | |
" median time: 50.872 μs (0.00% GC)\n", | |
" mean time: 50.527 μs (0.00% GC)\n", | |
" maximum time: 106.176 μs (0.00% GC)\n", | |
" --------------\n", | |
" samples: 10000\n", | |
" evals/sample: 1" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"@benchmark dot_simd_fastmath_inbounds(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"define double @julia_dot_vanilla_61218(i8** dereferenceable(40), i8** dereferenceable(40)) #0 !dbg !5 {\n", | |
"top:\n", | |
" %2 = getelementptr i8*, i8** %0, i64 3\n", | |
" %3 = bitcast i8** %2 to i64*\n", | |
" %4 = load i64, i64* %3, align 8\n", | |
" %not. = icmp slt i64 %4, 1\n", | |
" br i1 %not., label %L27, label %if.lr.ph\n", | |
"\n", | |
"if.lr.ph: ; preds = %top\n", | |
" %5 = getelementptr i8*, i8** %1, i64 3\n", | |
" %6 = bitcast i8** %5 to i64*\n", | |
" %7 = load i64, i64* %6, align 8\n", | |
" %8 = bitcast i8** %0 to double**\n", | |
" %9 = load double*, double** %8, align 8\n", | |
" %10 = bitcast i8** %1 to double**\n", | |
" %11 = load double*, double** %10, align 8\n", | |
" br label %if\n", | |
"\n", | |
"if: ; preds = %if.lr.ph, %idxend2\n", | |
" %s.09 = phi double [ 0.000000e+00, %if.lr.ph ], [ %23, %idxend2 ]\n", | |
" %\"#temp#.08\" = phi i64 [ 1, %if.lr.ph ], [ %17, %idxend2 ]\n", | |
" %12 = add i64 %\"#temp#.08\", -1\n", | |
" %13 = icmp ult i64 %12, %4\n", | |
" br i1 %13, label %idxend, label %oob\n", | |
"\n", | |
"L27.loopexit: ; preds = %idxend2\n", | |
" br label %L27\n", | |
"\n", | |
"L27: ; preds = %L27.loopexit, %top\n", | |
" %s.0.lcssa = phi double [ 0.000000e+00, %top ], [ %23, %L27.loopexit ]\n", | |
" ret double %s.0.lcssa\n", | |
"\n", | |
"oob: ; preds = %if\n", | |
" %14 = alloca i64, align 8\n", | |
" store i64 %\"#temp#.08\", i64* %14, align 8\n", | |
" call void @jl_bounds_error_ints(i8** nonnull %0, i64* nonnull %14, i64 1)\n", | |
" unreachable\n", | |
"\n", | |
"idxend: ; preds = %if\n", | |
" %15 = icmp ult i64 %12, %7\n", | |
" br i1 %15, label %idxend2, label %oob1\n", | |
"\n", | |
"oob1: ; preds = %idxend\n", | |
" %16 = alloca i64, align 8\n", | |
" store i64 %\"#temp#.08\", i64* %16, align 8\n", | |
" call void @jl_bounds_error_ints(i8** nonnull %1, i64* nonnull %16, i64 1)\n", | |
" unreachable\n", | |
"\n", | |
"idxend2: ; preds = %idxend\n", | |
" %17 = add i64 %\"#temp#.08\", 1\n", | |
" %18 = getelementptr double, double* %9, i64 %12\n", | |
" %19 = load double, double* %18, align 8\n", | |
" %20 = getelementptr double, double* %11, i64 %12\n", | |
" %21 = load double, double* %20, align 8\n", | |
" %22 = fmul double %19, %21\n", | |
" %23 = fadd double %s.09, %22\n", | |
" %24 = icmp eq i64 %\"#temp#.08\", %4\n", | |
" br i1 %24, label %L27.loopexit, label %if\n", | |
"}\n" | |
] | |
} | |
], | |
"source": [ | |
"@code_llvm dot_vanilla(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"define double @julia_dot_only_inbounds_61296(i8** dereferenceable(40), i8** dereferenceable(40)) #0 !dbg !5 {\n", | |
"top:\n", | |
" %2 = getelementptr i8*, i8** %0, i64 3\n", | |
" %3 = bitcast i8** %2 to i64*\n", | |
" %4 = load i64, i64* %3, align 8\n", | |
" %not. = icmp slt i64 %4, 1\n", | |
" br i1 %not., label %L29, label %if.lr.ph\n", | |
"\n", | |
"if.lr.ph: ; preds = %top\n", | |
" %5 = bitcast i8** %0 to double**\n", | |
" %6 = load double*, double** %5, align 8\n", | |
" %7 = bitcast i8** %1 to double**\n", | |
" %8 = load double*, double** %7, align 8\n", | |
" br label %if\n", | |
"\n", | |
"if: ; preds = %if.lr.ph, %if\n", | |
" %s.03 = phi double [ 0.000000e+00, %if.lr.ph ], [ %16, %if ]\n", | |
" %\"#temp#.02\" = phi i64 [ 1, %if.lr.ph ], [ %9, %if ]\n", | |
" %9 = add i64 %\"#temp#.02\", 1\n", | |
" %10 = add i64 %\"#temp#.02\", -1\n", | |
" %11 = getelementptr double, double* %6, i64 %10\n", | |
" %12 = load double, double* %11, align 8\n", | |
" %13 = getelementptr double, double* %8, i64 %10\n", | |
" %14 = load double, double* %13, align 8\n", | |
" %15 = fmul double %12, %14\n", | |
" %16 = fadd double %s.03, %15\n", | |
" %17 = icmp eq i64 %\"#temp#.02\", %4\n", | |
" br i1 %17, label %L29.loopexit, label %if\n", | |
"\n", | |
"L29.loopexit: ; preds = %if\n", | |
" br label %L29\n", | |
"\n", | |
"L29: ; preds = %L29.loopexit, %top\n", | |
" %s.0.lcssa = phi double [ 0.000000e+00, %top ], [ %16, %L29.loopexit ]\n", | |
" ret double %s.0.lcssa\n", | |
"}\n" | |
] | |
} | |
], | |
"source": [ | |
"@code_llvm dot_only_inbounds(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"define double @julia_dot_fastmath_inbounds_61338(i8** dereferenceable(40), i8** dereferenceable(40)) #0 !dbg !5 {\n", | |
"top:\n", | |
" %2 = getelementptr i8*, i8** %0, i64 3\n", | |
" %3 = bitcast i8** %2 to i64*\n", | |
" %4 = load i64, i64* %3, align 8\n", | |
" %not. = icmp slt i64 %4, 1\n", | |
" br i1 %not., label %L29, label %if.lr.ph\n", | |
"\n", | |
"if.lr.ph: ; preds = %top\n", | |
" %5 = bitcast i8** %0 to double**\n", | |
" %6 = load double*, double** %5, align 8\n", | |
" %7 = bitcast i8** %1 to double**\n", | |
" %8 = load double*, double** %7, align 8\n", | |
" %min.iters.check = icmp ult i64 %4, 4\n", | |
" br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked\n", | |
"\n", | |
"min.iters.checked: ; preds = %if.lr.ph\n", | |
" %n.vec = and i64 %4, -4\n", | |
" %cmp.zero = icmp eq i64 %n.vec, 0\n", | |
" %ind.end = or i64 %n.vec, 1\n", | |
" br i1 %cmp.zero, label %scalar.ph, label %vector.ph\n", | |
"\n", | |
"vector.ph: ; preds = %min.iters.checked\n", | |
" br label %vector.body\n", | |
"\n", | |
"vector.body: ; preds = %vector.body, %vector.ph\n", | |
" %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]\n", | |
" %vec.phi = phi <2 x double> [ zeroinitializer, %vector.ph ], [ %20, %vector.body ]\n", | |
" %vec.phi4 = phi <2 x double> [ zeroinitializer, %vector.ph ], [ %21, %vector.body ]\n", | |
" %offset.idx = or i64 %index, 1\n", | |
" %9 = add i64 %offset.idx, -1\n", | |
" %10 = getelementptr double, double* %6, i64 %9\n", | |
" %11 = bitcast double* %10 to <2 x double>*\n", | |
" %wide.load = load <2 x double>, <2 x double>* %11, align 8\n", | |
" %12 = getelementptr double, double* %10, i64 2\n", | |
" %13 = bitcast double* %12 to <2 x double>*\n", | |
" %wide.load6 = load <2 x double>, <2 x double>* %13, align 8\n", | |
" %14 = getelementptr double, double* %8, i64 %9\n", | |
" %15 = bitcast double* %14 to <2 x double>*\n", | |
" %wide.load7 = load <2 x double>, <2 x double>* %15, align 8\n", | |
" %16 = getelementptr double, double* %14, i64 2\n", | |
" %17 = bitcast double* %16 to <2 x double>*\n", | |
" %wide.load8 = load <2 x double>, <2 x double>* %17, align 8\n", | |
" %18 = fmul fast <2 x double> %wide.load7, %wide.load\n", | |
" %19 = fmul fast <2 x double> %wide.load8, %wide.load6\n", | |
" %20 = fadd fast <2 x double> %18, %vec.phi\n", | |
" %21 = fadd fast <2 x double> %19, %vec.phi4\n", | |
" %index.next = add i64 %index, 4\n", | |
" %22 = icmp eq i64 %index.next, %n.vec\n", | |
" br i1 %22, label %middle.block, label %vector.body\n", | |
"\n", | |
"middle.block: ; preds = %vector.body\n", | |
" %bin.rdx = fadd fast <2 x double> %21, %20\n", | |
" %rdx.shuf = shufflevector <2 x double> %bin.rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>\n", | |
" %bin.rdx9 = fadd fast <2 x double> %bin.rdx, %rdx.shuf\n", | |
" %23 = extractelement <2 x double> %bin.rdx9, i32 0\n", | |
" %cmp.n = icmp eq i64 %4, %n.vec\n", | |
" br i1 %cmp.n, label %L29.loopexit, label %scalar.ph\n", | |
"\n", | |
"scalar.ph: ; preds = %middle.block, %min.iters.checked, %if.lr.ph\n", | |
" %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %if.lr.ph ], [ 1, %min.iters.checked ]\n", | |
" %bc.merge.rdx = phi double [ %23, %middle.block ], [ 0.000000e+00, %if.lr.ph ], [ 0.000000e+00, %min.iters.checked ]\n", | |
" br label %if\n", | |
"\n", | |
"if: ; preds = %scalar.ph, %if\n", | |
" %s.03 = phi double [ %bc.merge.rdx, %scalar.ph ], [ %31, %if ]\n", | |
" %\"#temp#.02\" = phi i64 [ %bc.resume.val, %scalar.ph ], [ %24, %if ]\n", | |
" %24 = add i64 %\"#temp#.02\", 1\n", | |
" %25 = add i64 %\"#temp#.02\", -1\n", | |
" %26 = getelementptr double, double* %6, i64 %25\n", | |
" %27 = load double, double* %26, align 8\n", | |
" %28 = getelementptr double, double* %8, i64 %25\n", | |
" %29 = load double, double* %28, align 8\n", | |
" %30 = fmul fast double %29, %27\n", | |
" %31 = fadd fast double %30, %s.03\n", | |
" %32 = icmp eq i64 %\"#temp#.02\", %4\n", | |
" br i1 %32, label %L29.loopexit, label %if\n", | |
"\n", | |
"L29.loopexit: ; preds = %middle.block, %if\n", | |
" %.lcssa = phi double [ %31, %if ], [ %23, %middle.block ]\n", | |
" br label %L29\n", | |
"\n", | |
"L29: ; preds = %L29.loopexit, %top\n", | |
" %s.0.lcssa = phi double [ 0.000000e+00, %top ], [ %.lcssa, %L29.loopexit ]\n", | |
" ret double %s.0.lcssa\n", | |
"}\n" | |
] | |
} | |
], | |
"source": [ | |
"@code_llvm dot_fastmath_inbounds(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"define double @julia_dot_simd_inbounds_61359(i8** dereferenceable(40), i8** dereferenceable(40)) #0 !dbg !5 {\n", | |
"top:\n", | |
" %2 = getelementptr i8*, i8** %0, i64 3\n", | |
" %3 = bitcast i8** %2 to i64*\n", | |
" %4 = load i64, i64* %3, align 8\n", | |
" %5 = icmp sgt i64 %4, 0\n", | |
" %.op = add i64 %4, -1\n", | |
" %6 = select i1 %5, i64 %.op, i64 -1\n", | |
" %7 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %6, i64 1)\n", | |
" %8 = extractvalue { i64, i1 } %7, 0\n", | |
" %9 = extractvalue { i64, i1 } %7, 1\n", | |
" br i1 %9, label %if9, label %top.top.split_crit_edge\n", | |
"\n", | |
"top.top.split_crit_edge: ; preds = %top\n", | |
" %10 = icmp slt i64 %8, 1\n", | |
" br i1 %10, label %L91, label %top.split.top.split.split_crit_edge\n", | |
"\n", | |
"top.split.top.split.split_crit_edge: ; preds = %top.top.split_crit_edge\n", | |
" %11 = bitcast i8** %0 to double**\n", | |
" %12 = load double*, double** %11, align 8\n", | |
" %13 = bitcast i8** %1 to double**\n", | |
" %14 = load double*, double** %13, align 8\n", | |
" %15 = add i64 %6, 1\n", | |
" br label %L21.outer.split.L21.outer.split.split_crit_edge.outer\n", | |
"\n", | |
"L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit: ; preds = %middle.block, %if13\n", | |
" %.lcssa93 = phi double [ %39, %if13 ], [ %33, %middle.block ]\n", | |
" br label %L21.outer.split.L21.outer.split.split_crit_edge.outer\n", | |
"\n", | |
"L21.outer.split.L21.outer.split.split_crit_edge.outer: ; preds = %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit, %top.split.top.split.split_crit_edge\n", | |
" %\"#temp#.0.ph.ph\" = phi i64 [ 0, %top.split.top.split.split_crit_edge ], [ %17, %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit ]\n", | |
" %s.0.ph.ph = phi double [ 0.000000e+00, %top.split.top.split.split_crit_edge ], [ %.lcssa93, %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit ]\n", | |
" br label %L21.outer.split.L21.outer.split.split_crit_edge\n", | |
"\n", | |
"L21.outer.split.L21.outer.split.split_crit_edge: ; preds = %L21.outer.split.L21.outer.split.split_crit_edge.outer, %L56.preheader\n", | |
" %\"#temp#.0.ph\" = phi i64 [ %17, %L56.preheader ], [ %\"#temp#.0.ph.ph\", %L21.outer.split.L21.outer.split.split_crit_edge.outer ]\n", | |
" %16 = icmp eq i64 %\"#temp#.0.ph\", 1\n", | |
" br i1 %16, label %L91.loopexit, label %L56.preheader\n", | |
"\n", | |
"L91.loopexit: ; preds = %L21.outer.split.L21.outer.split.split_crit_edge\n", | |
" br label %L91\n", | |
"\n", | |
"L91: ; preds = %L91.loopexit, %top.top.split_crit_edge\n", | |
" %s.0.ph.lcssa22.ph = phi double [ 0.000000e+00, %top.top.split_crit_edge ], [ %s.0.ph.ph, %L91.loopexit ]\n", | |
" ret double %s.0.ph.lcssa22.ph\n", | |
"\n", | |
"if9: ; preds = %top\n", | |
" call void @jl_throw(i8** inttoptr (i64 140122402279312 to i8**))\n", | |
" unreachable\n", | |
"\n", | |
"L56.preheader: ; preds = %L21.outer.split.L21.outer.split.split_crit_edge\n", | |
" %17 = add i64 %\"#temp#.0.ph\", 1\n", | |
" %18 = icmp sgt i64 %8, 0\n", | |
" br i1 %18, label %if13.preheader, label %L21.outer.split.L21.outer.split.split_crit_edge\n", | |
"\n", | |
"if13.preheader: ; preds = %L56.preheader\n", | |
" %min.iters.check = icmp ult i64 %15, 4\n", | |
" br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked\n", | |
"\n", | |
"min.iters.checked: ; preds = %if13.preheader\n", | |
" %n.vec = and i64 %15, -4\n", | |
" %cmp.zero = icmp eq i64 %n.vec, 0\n", | |
" br i1 %cmp.zero, label %scalar.ph, label %vector.ph\n", | |
"\n", | |
"vector.ph: ; preds = %min.iters.checked\n", | |
" %19 = insertelement <2 x double> <double undef, double 0.000000e+00>, double %s.0.ph.ph, i32 0\n", | |
" br label %vector.body\n", | |
"\n", | |
"vector.body: ; preds = %vector.body, %vector.ph\n", | |
" %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]\n", | |
" %vec.phi = phi <2 x double> [ %19, %vector.ph ], [ %30, %vector.body ]\n", | |
" %vec.phi94 = phi <2 x double> [ zeroinitializer, %vector.ph ], [ %31, %vector.body ]\n", | |
" %20 = getelementptr double, double* %12, i64 %index\n", | |
" %21 = bitcast double* %20 to <2 x double>*\n", | |
" %wide.load = load <2 x double>, <2 x double>* %21, align 8\n", | |
" %22 = getelementptr double, double* %20, i64 2\n", | |
" %23 = bitcast double* %22 to <2 x double>*\n", | |
" %wide.load96 = load <2 x double>, <2 x double>* %23, align 8\n", | |
" %24 = getelementptr double, double* %14, i64 %index\n", | |
" %25 = bitcast double* %24 to <2 x double>*\n", | |
" %wide.load97 = load <2 x double>, <2 x double>* %25, align 8\n", | |
" %26 = getelementptr double, double* %24, i64 2\n", | |
" %27 = bitcast double* %26 to <2 x double>*\n", | |
" %wide.load98 = load <2 x double>, <2 x double>* %27, align 8\n", | |
" %28 = fmul <2 x double> %wide.load, %wide.load97\n", | |
" %29 = fmul <2 x double> %wide.load96, %wide.load98\n", | |
" %30 = fadd fast <2 x double> %vec.phi, %28\n", | |
" %31 = fadd fast <2 x double> %vec.phi94, %29\n", | |
" %index.next = add i64 %index, 4\n", | |
" %32 = icmp eq i64 %index.next, %n.vec\n", | |
" br i1 %32, label %middle.block, label %vector.body\n", | |
"\n", | |
"middle.block: ; preds = %vector.body\n", | |
" %bin.rdx = fadd fast <2 x double> %31, %30\n", | |
" %rdx.shuf = shufflevector <2 x double> %bin.rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>\n", | |
" %bin.rdx99 = fadd fast <2 x double> %bin.rdx, %rdx.shuf\n", | |
" %33 = extractelement <2 x double> %bin.rdx99, i32 0\n", | |
" %cmp.n = icmp eq i64 %15, %n.vec\n", | |
" br i1 %cmp.n, label %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit, label %scalar.ph\n", | |
"\n", | |
"scalar.ph: ; preds = %middle.block, %min.iters.checked, %if13.preheader\n", | |
" %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %if13.preheader ], [ 0, %min.iters.checked ]\n", | |
" %bc.merge.rdx = phi double [ %33, %middle.block ], [ %s.0.ph.ph, %if13.preheader ], [ %s.0.ph.ph, %min.iters.checked ]\n", | |
" br label %if13\n", | |
"\n", | |
"if13: ; preds = %scalar.ph, %if13\n", | |
" %s.124 = phi double [ %39, %if13 ], [ %bc.merge.rdx, %scalar.ph ]\n", | |
" %\"i#680.023\" = phi i64 [ %40, %if13 ], [ %bc.resume.val, %scalar.ph ]\n", | |
" %34 = getelementptr double, double* %12, i64 %\"i#680.023\"\n", | |
" %35 = load double, double* %34, align 8\n", | |
" %36 = getelementptr double, double* %14, i64 %\"i#680.023\"\n", | |
" %37 = load double, double* %36, align 8\n", | |
" %38 = fmul double %35, %37\n", | |
" %39 = fadd fast double %s.124, %38\n", | |
" %40 = add nuw nsw i64 %\"i#680.023\", 1\n", | |
" %41 = icmp slt i64 %40, %8\n", | |
" br i1 %41, label %if13, label %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit\n", | |
"}\n" | |
] | |
} | |
], | |
"source": [ | |
"@code_llvm dot_simd_inbounds(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Julia 0.6.0", | |
"language": "julia", | |
"name": "julia-0.6" | |
}, | |
"language_info": { | |
"file_extension": ".jl", | |
"mimetype": "application/julia", | |
"name": "julia", | |
"version": "0.6.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment