Skip to content

Instantly share code, notes, and snippets.

@tkf
Created January 9, 2018 00:25
Show Gist options
  • Save tkf/8a7ce6177ba16db2514d0f4652a03b71 to your computer and use it in GitHub Desktop.
Save tkf/8a7ce6177ba16db2514d0f4652a03b71 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Julia Version 0.6.0\n",
"Commit 903644385b* (2017-06-19 13:05 UTC)\n",
"Platform Info:\n",
" OS: Linux (x86_64-pc-linux-gnu)\n",
" CPU: Intel(R) Core(TM) i7-4500U CPU @ 1.80GHz\n",
" WORD_SIZE: 64\n",
" BLAS: libblas\n",
" LAPACK: liblapack\n",
" LIBM: libm\n",
" LLVM: libLLVM-4.0.0 (ORCJIT, haswell)\n"
]
}
],
"source": [
"versioninfo()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Base.JLOptions().check_bounds # 0: unspecified; 1: yes; 2: no"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"using BenchmarkTools"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"x = ones(100000)\n",
"y = ones(100000);"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dot_fastmath_inbounds (generic function with 1 method)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function dot_vanilla(a, b)\n",
" s = zero(promote_type(eltype(a), eltype(b)))\n",
" for i in 1:endof(a)\n",
" s += a[i] * b[i]\n",
" end\n",
" return s\n",
"end\n",
"\n",
"function dot_only_inbounds(a, b)\n",
" s = zero(promote_type(eltype(a), eltype(b)))\n",
" for i in 1:endof(a)\n",
" @inbounds s += a[i] * b[i]\n",
" end\n",
" return s\n",
"end\n",
"\n",
"function dot_only_fastmath(a, b)\n",
" s = zero(promote_type(eltype(a), eltype(b)))\n",
" for i in 1:endof(a)\n",
" @fastmath s += a[i] * b[i]\n",
" end\n",
" return s\n",
"end\n",
"\n",
"function dot_fastmath_inbounds(a, b)\n",
" s = zero(promote_type(eltype(a), eltype(b)))\n",
" for i in 1:endof(a)\n",
" @fastmath @inbounds s += a[i] * b[i]\n",
" end\n",
" return s\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 16 bytes\n",
" allocs estimate: 1\n",
" --------------\n",
" minimum time: 100.339 μs (0.00% GC)\n",
" median time: 100.353 μs (0.00% GC)\n",
" mean time: 102.806 μs (0.00% GC)\n",
" maximum time: 187.489 μs (0.00% GC)\n",
" --------------\n",
" samples: 10000\n",
" evals/sample: 1"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@benchmark dot_vanilla(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 16 bytes\n",
" allocs estimate: 1\n",
" --------------\n",
" minimum time: 100.320 μs (0.00% GC)\n",
" median time: 100.334 μs (0.00% GC)\n",
" mean time: 102.757 μs (0.00% GC)\n",
" maximum time: 215.776 μs (0.00% GC)\n",
" --------------\n",
" samples: 10000\n",
" evals/sample: 1"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@benchmark dot_only_inbounds(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 16 bytes\n",
" allocs estimate: 1\n",
" --------------\n",
" minimum time: 100.348 μs (0.00% GC)\n",
" median time: 100.368 μs (0.00% GC)\n",
" mean time: 102.633 μs (0.00% GC)\n",
" maximum time: 154.923 μs (0.00% GC)\n",
" --------------\n",
" samples: 10000\n",
" evals/sample: 1"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@benchmark dot_only_fastmath(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 16 bytes\n",
" allocs estimate: 1\n",
" --------------\n",
" minimum time: 45.539 μs (0.00% GC)\n",
" median time: 45.848 μs (0.00% GC)\n",
" mean time: 47.630 μs (0.00% GC)\n",
" maximum time: 226.709 μs (0.00% GC)\n",
" --------------\n",
" samples: 10000\n",
" evals/sample: 1"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@benchmark dot_fastmath_inbounds(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dot_simd_inbounds (generic function with 1 method)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function dot_simd_inbounds(a, b)\n",
" s = zero(promote_type(eltype(a), eltype(b)))\n",
" @simd for i in 1:endof(a)\n",
" @inbounds s += a[i] * b[i]\n",
" end\n",
" return s\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 16 bytes\n",
" allocs estimate: 1\n",
" --------------\n",
" minimum time: 45.520 μs (0.00% GC)\n",
" median time: 45.876 μs (0.00% GC)\n",
" mean time: 49.059 μs (0.00% GC)\n",
" maximum time: 685.778 μs (0.00% GC)\n",
" --------------\n",
" samples: 10000\n",
" evals/sample: 1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@benchmark dot_simd_inbounds(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dot_simd_fastmath_inbounds (generic function with 1 method)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function dot_simd_fastmath_inbounds(a, b)\n",
" s = zero(promote_type(eltype(a), eltype(b)))\n",
" @simd for i in 1:endof(a)\n",
" @fastmath @inbounds s += a[i] * b[i]\n",
" end\n",
" return s\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 16 bytes\n",
" allocs estimate: 1\n",
" --------------\n",
" minimum time: 45.571 μs (0.00% GC)\n",
" median time: 50.872 μs (0.00% GC)\n",
" mean time: 50.527 μs (0.00% GC)\n",
" maximum time: 106.176 μs (0.00% GC)\n",
" --------------\n",
" samples: 10000\n",
" evals/sample: 1"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@benchmark dot_simd_fastmath_inbounds(x, y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"define double @julia_dot_vanilla_61218(i8** dereferenceable(40), i8** dereferenceable(40)) #0 !dbg !5 {\n",
"top:\n",
" %2 = getelementptr i8*, i8** %0, i64 3\n",
" %3 = bitcast i8** %2 to i64*\n",
" %4 = load i64, i64* %3, align 8\n",
" %not. = icmp slt i64 %4, 1\n",
" br i1 %not., label %L27, label %if.lr.ph\n",
"\n",
"if.lr.ph: ; preds = %top\n",
" %5 = getelementptr i8*, i8** %1, i64 3\n",
" %6 = bitcast i8** %5 to i64*\n",
" %7 = load i64, i64* %6, align 8\n",
" %8 = bitcast i8** %0 to double**\n",
" %9 = load double*, double** %8, align 8\n",
" %10 = bitcast i8** %1 to double**\n",
" %11 = load double*, double** %10, align 8\n",
" br label %if\n",
"\n",
"if: ; preds = %if.lr.ph, %idxend2\n",
" %s.09 = phi double [ 0.000000e+00, %if.lr.ph ], [ %23, %idxend2 ]\n",
" %\"#temp#.08\" = phi i64 [ 1, %if.lr.ph ], [ %17, %idxend2 ]\n",
" %12 = add i64 %\"#temp#.08\", -1\n",
" %13 = icmp ult i64 %12, %4\n",
" br i1 %13, label %idxend, label %oob\n",
"\n",
"L27.loopexit: ; preds = %idxend2\n",
" br label %L27\n",
"\n",
"L27: ; preds = %L27.loopexit, %top\n",
" %s.0.lcssa = phi double [ 0.000000e+00, %top ], [ %23, %L27.loopexit ]\n",
" ret double %s.0.lcssa\n",
"\n",
"oob: ; preds = %if\n",
" %14 = alloca i64, align 8\n",
" store i64 %\"#temp#.08\", i64* %14, align 8\n",
" call void @jl_bounds_error_ints(i8** nonnull %0, i64* nonnull %14, i64 1)\n",
" unreachable\n",
"\n",
"idxend: ; preds = %if\n",
" %15 = icmp ult i64 %12, %7\n",
" br i1 %15, label %idxend2, label %oob1\n",
"\n",
"oob1: ; preds = %idxend\n",
" %16 = alloca i64, align 8\n",
" store i64 %\"#temp#.08\", i64* %16, align 8\n",
" call void @jl_bounds_error_ints(i8** nonnull %1, i64* nonnull %16, i64 1)\n",
" unreachable\n",
"\n",
"idxend2: ; preds = %idxend\n",
" %17 = add i64 %\"#temp#.08\", 1\n",
" %18 = getelementptr double, double* %9, i64 %12\n",
" %19 = load double, double* %18, align 8\n",
" %20 = getelementptr double, double* %11, i64 %12\n",
" %21 = load double, double* %20, align 8\n",
" %22 = fmul double %19, %21\n",
" %23 = fadd double %s.09, %22\n",
" %24 = icmp eq i64 %\"#temp#.08\", %4\n",
" br i1 %24, label %L27.loopexit, label %if\n",
"}\n"
]
}
],
"source": [
"@code_llvm dot_vanilla(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"define double @julia_dot_only_inbounds_61296(i8** dereferenceable(40), i8** dereferenceable(40)) #0 !dbg !5 {\n",
"top:\n",
" %2 = getelementptr i8*, i8** %0, i64 3\n",
" %3 = bitcast i8** %2 to i64*\n",
" %4 = load i64, i64* %3, align 8\n",
" %not. = icmp slt i64 %4, 1\n",
" br i1 %not., label %L29, label %if.lr.ph\n",
"\n",
"if.lr.ph: ; preds = %top\n",
" %5 = bitcast i8** %0 to double**\n",
" %6 = load double*, double** %5, align 8\n",
" %7 = bitcast i8** %1 to double**\n",
" %8 = load double*, double** %7, align 8\n",
" br label %if\n",
"\n",
"if: ; preds = %if.lr.ph, %if\n",
" %s.03 = phi double [ 0.000000e+00, %if.lr.ph ], [ %16, %if ]\n",
" %\"#temp#.02\" = phi i64 [ 1, %if.lr.ph ], [ %9, %if ]\n",
" %9 = add i64 %\"#temp#.02\", 1\n",
" %10 = add i64 %\"#temp#.02\", -1\n",
" %11 = getelementptr double, double* %6, i64 %10\n",
" %12 = load double, double* %11, align 8\n",
" %13 = getelementptr double, double* %8, i64 %10\n",
" %14 = load double, double* %13, align 8\n",
" %15 = fmul double %12, %14\n",
" %16 = fadd double %s.03, %15\n",
" %17 = icmp eq i64 %\"#temp#.02\", %4\n",
" br i1 %17, label %L29.loopexit, label %if\n",
"\n",
"L29.loopexit: ; preds = %if\n",
" br label %L29\n",
"\n",
"L29: ; preds = %L29.loopexit, %top\n",
" %s.0.lcssa = phi double [ 0.000000e+00, %top ], [ %16, %L29.loopexit ]\n",
" ret double %s.0.lcssa\n",
"}\n"
]
}
],
"source": [
"@code_llvm dot_only_inbounds(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"define double @julia_dot_fastmath_inbounds_61338(i8** dereferenceable(40), i8** dereferenceable(40)) #0 !dbg !5 {\n",
"top:\n",
" %2 = getelementptr i8*, i8** %0, i64 3\n",
" %3 = bitcast i8** %2 to i64*\n",
" %4 = load i64, i64* %3, align 8\n",
" %not. = icmp slt i64 %4, 1\n",
" br i1 %not., label %L29, label %if.lr.ph\n",
"\n",
"if.lr.ph: ; preds = %top\n",
" %5 = bitcast i8** %0 to double**\n",
" %6 = load double*, double** %5, align 8\n",
" %7 = bitcast i8** %1 to double**\n",
" %8 = load double*, double** %7, align 8\n",
" %min.iters.check = icmp ult i64 %4, 4\n",
" br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked\n",
"\n",
"min.iters.checked: ; preds = %if.lr.ph\n",
" %n.vec = and i64 %4, -4\n",
" %cmp.zero = icmp eq i64 %n.vec, 0\n",
" %ind.end = or i64 %n.vec, 1\n",
" br i1 %cmp.zero, label %scalar.ph, label %vector.ph\n",
"\n",
"vector.ph: ; preds = %min.iters.checked\n",
" br label %vector.body\n",
"\n",
"vector.body: ; preds = %vector.body, %vector.ph\n",
" %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]\n",
" %vec.phi = phi <2 x double> [ zeroinitializer, %vector.ph ], [ %20, %vector.body ]\n",
" %vec.phi4 = phi <2 x double> [ zeroinitializer, %vector.ph ], [ %21, %vector.body ]\n",
" %offset.idx = or i64 %index, 1\n",
" %9 = add i64 %offset.idx, -1\n",
" %10 = getelementptr double, double* %6, i64 %9\n",
" %11 = bitcast double* %10 to <2 x double>*\n",
" %wide.load = load <2 x double>, <2 x double>* %11, align 8\n",
" %12 = getelementptr double, double* %10, i64 2\n",
" %13 = bitcast double* %12 to <2 x double>*\n",
" %wide.load6 = load <2 x double>, <2 x double>* %13, align 8\n",
" %14 = getelementptr double, double* %8, i64 %9\n",
" %15 = bitcast double* %14 to <2 x double>*\n",
" %wide.load7 = load <2 x double>, <2 x double>* %15, align 8\n",
" %16 = getelementptr double, double* %14, i64 2\n",
" %17 = bitcast double* %16 to <2 x double>*\n",
" %wide.load8 = load <2 x double>, <2 x double>* %17, align 8\n",
" %18 = fmul fast <2 x double> %wide.load7, %wide.load\n",
" %19 = fmul fast <2 x double> %wide.load8, %wide.load6\n",
" %20 = fadd fast <2 x double> %18, %vec.phi\n",
" %21 = fadd fast <2 x double> %19, %vec.phi4\n",
" %index.next = add i64 %index, 4\n",
" %22 = icmp eq i64 %index.next, %n.vec\n",
" br i1 %22, label %middle.block, label %vector.body\n",
"\n",
"middle.block: ; preds = %vector.body\n",
" %bin.rdx = fadd fast <2 x double> %21, %20\n",
" %rdx.shuf = shufflevector <2 x double> %bin.rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>\n",
" %bin.rdx9 = fadd fast <2 x double> %bin.rdx, %rdx.shuf\n",
" %23 = extractelement <2 x double> %bin.rdx9, i32 0\n",
" %cmp.n = icmp eq i64 %4, %n.vec\n",
" br i1 %cmp.n, label %L29.loopexit, label %scalar.ph\n",
"\n",
"scalar.ph: ; preds = %middle.block, %min.iters.checked, %if.lr.ph\n",
" %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %if.lr.ph ], [ 1, %min.iters.checked ]\n",
" %bc.merge.rdx = phi double [ %23, %middle.block ], [ 0.000000e+00, %if.lr.ph ], [ 0.000000e+00, %min.iters.checked ]\n",
" br label %if\n",
"\n",
"if: ; preds = %scalar.ph, %if\n",
" %s.03 = phi double [ %bc.merge.rdx, %scalar.ph ], [ %31, %if ]\n",
" %\"#temp#.02\" = phi i64 [ %bc.resume.val, %scalar.ph ], [ %24, %if ]\n",
" %24 = add i64 %\"#temp#.02\", 1\n",
" %25 = add i64 %\"#temp#.02\", -1\n",
" %26 = getelementptr double, double* %6, i64 %25\n",
" %27 = load double, double* %26, align 8\n",
" %28 = getelementptr double, double* %8, i64 %25\n",
" %29 = load double, double* %28, align 8\n",
" %30 = fmul fast double %29, %27\n",
" %31 = fadd fast double %30, %s.03\n",
" %32 = icmp eq i64 %\"#temp#.02\", %4\n",
" br i1 %32, label %L29.loopexit, label %if\n",
"\n",
"L29.loopexit: ; preds = %middle.block, %if\n",
" %.lcssa = phi double [ %31, %if ], [ %23, %middle.block ]\n",
" br label %L29\n",
"\n",
"L29: ; preds = %L29.loopexit, %top\n",
" %s.0.lcssa = phi double [ 0.000000e+00, %top ], [ %.lcssa, %L29.loopexit ]\n",
" ret double %s.0.lcssa\n",
"}\n"
]
}
],
"source": [
"@code_llvm dot_fastmath_inbounds(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"define double @julia_dot_simd_inbounds_61359(i8** dereferenceable(40), i8** dereferenceable(40)) #0 !dbg !5 {\n",
"top:\n",
" %2 = getelementptr i8*, i8** %0, i64 3\n",
" %3 = bitcast i8** %2 to i64*\n",
" %4 = load i64, i64* %3, align 8\n",
" %5 = icmp sgt i64 %4, 0\n",
" %.op = add i64 %4, -1\n",
" %6 = select i1 %5, i64 %.op, i64 -1\n",
" %7 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %6, i64 1)\n",
" %8 = extractvalue { i64, i1 } %7, 0\n",
" %9 = extractvalue { i64, i1 } %7, 1\n",
" br i1 %9, label %if9, label %top.top.split_crit_edge\n",
"\n",
"top.top.split_crit_edge: ; preds = %top\n",
" %10 = icmp slt i64 %8, 1\n",
" br i1 %10, label %L91, label %top.split.top.split.split_crit_edge\n",
"\n",
"top.split.top.split.split_crit_edge: ; preds = %top.top.split_crit_edge\n",
" %11 = bitcast i8** %0 to double**\n",
" %12 = load double*, double** %11, align 8\n",
" %13 = bitcast i8** %1 to double**\n",
" %14 = load double*, double** %13, align 8\n",
" %15 = add i64 %6, 1\n",
" br label %L21.outer.split.L21.outer.split.split_crit_edge.outer\n",
"\n",
"L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit: ; preds = %middle.block, %if13\n",
" %.lcssa93 = phi double [ %39, %if13 ], [ %33, %middle.block ]\n",
" br label %L21.outer.split.L21.outer.split.split_crit_edge.outer\n",
"\n",
"L21.outer.split.L21.outer.split.split_crit_edge.outer: ; preds = %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit, %top.split.top.split.split_crit_edge\n",
" %\"#temp#.0.ph.ph\" = phi i64 [ 0, %top.split.top.split.split_crit_edge ], [ %17, %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit ]\n",
" %s.0.ph.ph = phi double [ 0.000000e+00, %top.split.top.split.split_crit_edge ], [ %.lcssa93, %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit ]\n",
" br label %L21.outer.split.L21.outer.split.split_crit_edge\n",
"\n",
"L21.outer.split.L21.outer.split.split_crit_edge: ; preds = %L21.outer.split.L21.outer.split.split_crit_edge.outer, %L56.preheader\n",
" %\"#temp#.0.ph\" = phi i64 [ %17, %L56.preheader ], [ %\"#temp#.0.ph.ph\", %L21.outer.split.L21.outer.split.split_crit_edge.outer ]\n",
" %16 = icmp eq i64 %\"#temp#.0.ph\", 1\n",
" br i1 %16, label %L91.loopexit, label %L56.preheader\n",
"\n",
"L91.loopexit: ; preds = %L21.outer.split.L21.outer.split.split_crit_edge\n",
" br label %L91\n",
"\n",
"L91: ; preds = %L91.loopexit, %top.top.split_crit_edge\n",
" %s.0.ph.lcssa22.ph = phi double [ 0.000000e+00, %top.top.split_crit_edge ], [ %s.0.ph.ph, %L91.loopexit ]\n",
" ret double %s.0.ph.lcssa22.ph\n",
"\n",
"if9: ; preds = %top\n",
" call void @jl_throw(i8** inttoptr (i64 140122402279312 to i8**))\n",
" unreachable\n",
"\n",
"L56.preheader: ; preds = %L21.outer.split.L21.outer.split.split_crit_edge\n",
" %17 = add i64 %\"#temp#.0.ph\", 1\n",
" %18 = icmp sgt i64 %8, 0\n",
" br i1 %18, label %if13.preheader, label %L21.outer.split.L21.outer.split.split_crit_edge\n",
"\n",
"if13.preheader: ; preds = %L56.preheader\n",
" %min.iters.check = icmp ult i64 %15, 4\n",
" br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked\n",
"\n",
"min.iters.checked: ; preds = %if13.preheader\n",
" %n.vec = and i64 %15, -4\n",
" %cmp.zero = icmp eq i64 %n.vec, 0\n",
" br i1 %cmp.zero, label %scalar.ph, label %vector.ph\n",
"\n",
"vector.ph: ; preds = %min.iters.checked\n",
" %19 = insertelement <2 x double> <double undef, double 0.000000e+00>, double %s.0.ph.ph, i32 0\n",
" br label %vector.body\n",
"\n",
"vector.body: ; preds = %vector.body, %vector.ph\n",
" %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]\n",
" %vec.phi = phi <2 x double> [ %19, %vector.ph ], [ %30, %vector.body ]\n",
" %vec.phi94 = phi <2 x double> [ zeroinitializer, %vector.ph ], [ %31, %vector.body ]\n",
" %20 = getelementptr double, double* %12, i64 %index\n",
" %21 = bitcast double* %20 to <2 x double>*\n",
" %wide.load = load <2 x double>, <2 x double>* %21, align 8\n",
" %22 = getelementptr double, double* %20, i64 2\n",
" %23 = bitcast double* %22 to <2 x double>*\n",
" %wide.load96 = load <2 x double>, <2 x double>* %23, align 8\n",
" %24 = getelementptr double, double* %14, i64 %index\n",
" %25 = bitcast double* %24 to <2 x double>*\n",
" %wide.load97 = load <2 x double>, <2 x double>* %25, align 8\n",
" %26 = getelementptr double, double* %24, i64 2\n",
" %27 = bitcast double* %26 to <2 x double>*\n",
" %wide.load98 = load <2 x double>, <2 x double>* %27, align 8\n",
" %28 = fmul <2 x double> %wide.load, %wide.load97\n",
" %29 = fmul <2 x double> %wide.load96, %wide.load98\n",
" %30 = fadd fast <2 x double> %vec.phi, %28\n",
" %31 = fadd fast <2 x double> %vec.phi94, %29\n",
" %index.next = add i64 %index, 4\n",
" %32 = icmp eq i64 %index.next, %n.vec\n",
" br i1 %32, label %middle.block, label %vector.body\n",
"\n",
"middle.block: ; preds = %vector.body\n",
" %bin.rdx = fadd fast <2 x double> %31, %30\n",
" %rdx.shuf = shufflevector <2 x double> %bin.rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>\n",
" %bin.rdx99 = fadd fast <2 x double> %bin.rdx, %rdx.shuf\n",
" %33 = extractelement <2 x double> %bin.rdx99, i32 0\n",
" %cmp.n = icmp eq i64 %15, %n.vec\n",
" br i1 %cmp.n, label %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit, label %scalar.ph\n",
"\n",
"scalar.ph: ; preds = %middle.block, %min.iters.checked, %if13.preheader\n",
" %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %if13.preheader ], [ 0, %min.iters.checked ]\n",
" %bc.merge.rdx = phi double [ %33, %middle.block ], [ %s.0.ph.ph, %if13.preheader ], [ %s.0.ph.ph, %min.iters.checked ]\n",
" br label %if13\n",
"\n",
"if13: ; preds = %scalar.ph, %if13\n",
" %s.124 = phi double [ %39, %if13 ], [ %bc.merge.rdx, %scalar.ph ]\n",
" %\"i#680.023\" = phi i64 [ %40, %if13 ], [ %bc.resume.val, %scalar.ph ]\n",
" %34 = getelementptr double, double* %12, i64 %\"i#680.023\"\n",
" %35 = load double, double* %34, align 8\n",
" %36 = getelementptr double, double* %14, i64 %\"i#680.023\"\n",
" %37 = load double, double* %36, align 8\n",
" %38 = fmul double %35, %37\n",
" %39 = fadd fast double %s.124, %38\n",
" %40 = add nuw nsw i64 %\"i#680.023\", 1\n",
" %41 = icmp slt i64 %40, %8\n",
" br i1 %41, label %if13, label %L21.outer.split.L21.outer.split.split_crit_edge.outer.loopexit\n",
"}\n"
]
}
],
"source": [
"@code_llvm dot_simd_inbounds(x, y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 0.6.0",
"language": "julia",
"name": "julia-0.6"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "0.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment