Skip to content

Instantly share code, notes, and snippets.

@ArchRobison
Created May 26, 2015 22:13
Show Gist options
  • Select an option

  • Save ArchRobison/654a6573a248fd46b71d to your computer and use it in GitHub Desktop.

Select an option

Save ArchRobison/654a6573a248fd46b71d to your computer and use it in GitHub Desktop.
Example input and output from Julia using LLVM 3.6.1 and JULIA_LLVM_ARGS=-debug-only=loop-vectorize
# mysum!(F, A): add each element A[i,j] into F[i,j] in place and return F.
# The loop bounds come from size(A,1) and size(A,2), so both arguments are indexed
# with two subscripts. The type parameter T is declared but not used in the body.
# NOTE(review): nothing here checks that F covers the index range of A; because the
# update runs under @inbounds, that is presumably the caller's responsibility -- confirm.
function mysum!{T}(F, A::AbstractArray{T})
# Outer loop walks the second dimension of A.
for j = 1:size(A,2)
# Inner loop walks the first dimension; @simd marks it as a candidate for
# vectorization (the log below reports it as "annotated parallel").
@simd for i = 1:size(A,1)
# @inbounds skips bounds checks on both the read of A and the update of F.
@inbounds F[i,j] += A[i,j]
end
end
return F
end
LV: Checking a loop in "julia_mysum!_20989" from /localdisk/adrobiso/julia-trunk/b.cpp:75:1
LV: Loop hints: force=? width=0 unroll=0
LV: Found a loop: L14
LV: Found an induction variable.
LV: A loop annotated parallel, ignore memory dependency checks.
LV: We can vectorize this loop!
LV: Found trip count: 0
LV: The Widest type: 64 bits.
LV: The Widest register is: 256 bits.
LV: Found an estimated cost of 0 for VF 1 For instruction: %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
LV: Found an estimated cost of 1 for VF 1 For instruction: %37 = add i64 %"##i#1682.0", %35, !dbg !35
LV: Found an estimated cost of 0 for VF 1 For instruction: %38 = getelementptr float* %11, i64 %37, !dbg !35
LV: Found an estimated cost of 1 for VF 1 For instruction: %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 1 for VF 1 For instruction: %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 1 for VF 1 For instruction: %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 1 for VF 1 For instruction: %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 0 for VF 1 For instruction: %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35
LV: Found an estimated cost of 0 for VF 1 For instruction: %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35
LV: Found an estimated cost of 1 for VF 1 For instruction: %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 1 for VF 1 For instruction: %46 = add i64 %36, %42, !dbg !35
LV: Found an estimated cost of 1 for VF 1 For instruction: %47 = mul i64 %46, %45, !dbg !35
LV: Found an estimated cost of 1 for VF 1 For instruction: %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35
LV: Found an estimated cost of 1 for VF 1 For instruction: %49 = add i64 %48, %41, !dbg !35
LV: Found an estimated cost of 1 for VF 1 For instruction: %50 = add i64 %49, %47, !dbg !35
LV: Found an estimated cost of 0 for VF 1 For instruction: %51 = bitcast %jl_value_t* %40 to float**, !dbg !35
LV: Found an estimated cost of 1 for VF 1 For instruction: %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 0 for VF 1 For instruction: %53 = getelementptr float* %52, i64 %50, !dbg !35
LV: Found an estimated cost of 1 for VF 1 For instruction: %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 2 for VF 1 For instruction: %55 = fadd float %39, %54, !dbg !35
LV: Found an estimated cost of 1 for VF 1 For instruction: store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 1 for VF 1 For instruction: %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4
LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond = icmp eq i64 %56, %32, !dbg !49
LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46
LV: Scalar loop costs: 18.
LV: Found an estimated cost of 0 for VF 2 For instruction: %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
LV: Found an estimated cost of 1 for VF 2 For instruction: %37 = add i64 %"##i#1682.0", %35, !dbg !35
LV: Found an estimated cost of 0 for VF 2 For instruction: %38 = getelementptr float* %11, i64 %37, !dbg !35
LV: Found an estimated cost of 1 for VF 2 For instruction: %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 26 for VF 2 For instruction: %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 26 for VF 2 For instruction: %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 26 for VF 2 For instruction: %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 0 for VF 2 For instruction: %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35
LV: Found an estimated cost of 0 for VF 2 For instruction: %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35
LV: Found an estimated cost of 26 for VF 2 For instruction: %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 1 for VF 2 For instruction: %46 = add i64 %36, %42, !dbg !35
LV: Found an estimated cost of 9 for VF 2 For instruction: %47 = mul i64 %46, %45, !dbg !35
LV: Found an estimated cost of 1 for VF 2 For instruction: %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35
LV: Found an estimated cost of 1 for VF 2 For instruction: %49 = add i64 %48, %41, !dbg !35
LV: Found an estimated cost of 1 for VF 2 For instruction: %50 = add i64 %49, %47, !dbg !35
LV: Found an estimated cost of 0 for VF 2 For instruction: %51 = bitcast %jl_value_t* %40 to float**, !dbg !35
LV: Found an estimated cost of 26 for VF 2 For instruction: %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 0 for VF 2 For instruction: %53 = getelementptr float* %52, i64 %50, !dbg !35
LV: Found an estimated cost of 25 for VF 2 For instruction: %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 2 for VF 2 For instruction: %55 = fadd float %39, %54, !dbg !35
LV: Found an estimated cost of 1 for VF 2 For instruction: store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 1 for VF 2 For instruction: %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4
LV: Found an estimated cost of 1 for VF 2 For instruction: %exitcond = icmp eq i64 %56, %32, !dbg !49
LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46
LV: Vector loop of width 2 costs: 87.
LV: Found an estimated cost of 0 for VF 4 For instruction: %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
LV: Found an estimated cost of 1 for VF 4 For instruction: %37 = add i64 %"##i#1682.0", %35, !dbg !35
LV: Found an estimated cost of 0 for VF 4 For instruction: %38 = getelementptr float* %11, i64 %37, !dbg !35
LV: Found an estimated cost of 1 for VF 4 For instruction: %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 52 for VF 4 For instruction: %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 52 for VF 4 For instruction: %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 52 for VF 4 For instruction: %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 0 for VF 4 For instruction: %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35
LV: Found an estimated cost of 0 for VF 4 For instruction: %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35
LV: Found an estimated cost of 52 for VF 4 For instruction: %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 1 for VF 4 For instruction: %46 = add i64 %36, %42, !dbg !35
LV: Found an estimated cost of 9 for VF 4 For instruction: %47 = mul i64 %46, %45, !dbg !35
LV: Found an estimated cost of 1 for VF 4 For instruction: %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35
LV: Found an estimated cost of 1 for VF 4 For instruction: %49 = add i64 %48, %41, !dbg !35
LV: Found an estimated cost of 1 for VF 4 For instruction: %50 = add i64 %49, %47, !dbg !35
LV: Found an estimated cost of 0 for VF 4 For instruction: %51 = bitcast %jl_value_t* %40 to float**, !dbg !35
LV: Found an estimated cost of 52 for VF 4 For instruction: %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 0 for VF 4 For instruction: %53 = getelementptr float* %52, i64 %50, !dbg !35
LV: Found an estimated cost of 51 for VF 4 For instruction: %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 2 for VF 4 For instruction: %55 = fadd float %39, %54, !dbg !35
LV: Found an estimated cost of 1 for VF 4 For instruction: store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
LV: Found an estimated cost of 1 for VF 4 For instruction: %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4
LV: Found an estimated cost of 1 for VF 4 For instruction: %exitcond = icmp eq i64 %56, %32, !dbg !49
LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46
LV: Vector loop of width 4 costs: 82.
LV: Selecting VF: 1.
LV: The target has 16 registers
LV(REG): Calculating max register usage:
LV(REG): At #0 Interval # 0
LV(REG): At #1 Interval # 1
LV(REG): At #2 Interval # 2
LV(REG): At #3 Interval # 2
LV(REG): At #4 Interval # 3
LV(REG): At #5 Interval # 4
LV(REG): At #6 Interval # 5
LV(REG): At #7 Interval # 6
LV(REG): At #8 Interval # 7
LV(REG): At #9 Interval # 7
LV(REG): At #10 Interval # 7
LV(REG): At #11 Interval # 7
LV(REG): At #12 Interval # 6
LV(REG): At #13 Interval # 7
LV(REG): At #14 Interval # 6
LV(REG): At #15 Interval # 5
LV(REG): At #16 Interval # 5
LV(REG): At #17 Interval # 5
LV(REG): At #18 Interval # 4
LV(REG): At #19 Interval # 4
LV(REG): At #21 Interval # 3
LV(REG): At #23 Interval # 4
LV(REG): Found max usage: 7
LV(REG): Found invariant usage: 7
LV(REG): LoopSize: 25
LV: Loop cost is 18
LV: Unrolling to reduce branch cost.
LV: Found a vectorizable loop (1) in /localdisk/adrobiso/julia-trunk/b.cpp:75:1
LV: Unroll Factor is 1
LV: Vectorization is possible but not beneficial
; LLVM IR emitted for the Julia function mysum! shown above, specialized on float
; element data (see the float loads, fadd and store in %L14).
; Structure: an outer loop (%L3 .. %L21) around an inner loop (%L14). %L14 is the loop
; the vectorizer log above analyses and leaves scalar ("Selecting VF: 1").
; NOTE(review): %1 looks like the boxed argument array; %3 (its first slot) is the value
; returned at %L23, so it is presumably F, and %5 (second slot) presumably A -- confirm.
; Function Attrs: sspreq
define %jl_value_t* @"julia_mysum!_20989"(%jl_value_t*, %jl_value_t**, i32) #-1 {
top:
%3 = load %jl_value_t** %1, align 8
%4 = getelementptr %jl_value_t** %1, i64 1
%5 = load %jl_value_t** %4, align 8
; %8 is read once from a field of %5 and serves as the outer loop's final counter value
; (compared against %"#s1.0" in %L21); presumably size(A,2).
%6 = getelementptr inbounds %jl_value_t* %5, i64 6
%7 = bitcast %jl_value_t* %6 to i64*
%8 = load i64* %7, align 8
; Skip both loops entirely when %8 <= 0.
%9 = icmp sgt i64 %8, 0
br i1 %9, label %L3.preheader, label %L23
L3.preheader: ; preds = %top
; Values read from %3 once, before the loops: %11 is a float* base pointer and %14 an
; i64 that scales the outer index in %if12 (presumably F's data pointer and column stride).
%10 = bitcast %jl_value_t* %3 to float**
%11 = load float** %10, align 8
%12 = getelementptr inbounds %jl_value_t* %3, i64 3, i32 0
%13 = bitcast %jl_value_t** %12 to i64*
%14 = load i64* %13, align 8
; Only the ADDRESSES of several fields of %5 are computed here. The loads through
; %17, %19 and %21 happen inside the inner loop %L14; the load through %16 happens once
; per outer iteration in %L3.
%15 = getelementptr %jl_value_t* %5, i64 5
%16 = bitcast %jl_value_t* %15 to i64*
%17 = getelementptr inbounds %jl_value_t* %5, i64 0, i32 0
%18 = getelementptr %jl_value_t* %5, i64 1
%19 = bitcast %jl_value_t* %18 to i64*
%20 = getelementptr inbounds %jl_value_t* %5, i64 3
%21 = bitcast %jl_value_t* %20 to i64*
br label %L3
L3: ; preds = %L21, %L3.preheader
; Outer loop header: %"#s1.0" is the outer counter, starting at 1.
%"#s1.0" = phi i64 [ %57, %L21 ], [ 1, %L3.preheader ]
; Inner trip count: load a length from %5, clamp it to >= 0, then compute (len - 1) + 1
; with overflow-checked arithmetic; either overflow branches to a throwing block.
%22 = load i64* %16, align 8
%23 = icmp sgt i64 %22, 0
%24 = select i1 %23, i64 %22, i64 0
%25 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %24, i64 1)
%26 = extractvalue { i64, i1 } %25, 1
br i1 %26, label %fail.split, label %L3.L3.split_crit_edge
L3.L3.split_crit_edge: ; preds = %L3
%27 = extractvalue { i64, i1 } %25, 0
%28 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %27, i64 1)
%29 = extractvalue { i64, i1 } %28, 1
br i1 %29, label %fail10, label %pass11
fail.split: ; preds = %L3
; Subtraction overflowed: throw jl_overflow_exception.
%30 = load %jl_value_t** @jl_overflow_exception, align 8
call void @jl_throw_with_superfluous_argument(%jl_value_t* %30, i32 67)
unreachable
fail10: ; preds = %L3.L3.split_crit_edge
; Addition overflowed: throw jl_overflow_exception.
%31 = load %jl_value_t** @jl_overflow_exception, align 8
call void @jl_throw_with_superfluous_argument(%jl_value_t* %31, i32 67)
unreachable
pass11: ; preds = %L3.L3.split_crit_edge
; %32 is the inner loop trip count; skip the inner loop when it is < 1.
%32 = extractvalue { i64, i1 } %28, 0
%33 = icmp slt i64 %32, 1
br i1 %33, label %L21, label %if12
if12: ; preds = %pass11
; Per-outer-iteration offsets, computed once before entering the inner loop:
; %35 = (outer - 1) * %14 for the access through %11, %36 = outer - 2 for the other access.
%34 = add i64 %"#s1.0", -1
%35 = mul i64 %34, %14
%36 = add i64 %"#s1.0", -2
br label %L14
L14: ; preds = %L14, %if12
; Inner loop body; %"##i#1682.0" is the zero-based inner counter.
%"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
; Element updated in place: address %38 = %11 + (inner + %35), contiguous in the counter.
%37 = add i64 %"##i#1682.0", %35
%38 = getelementptr float* %11, i64 %37
%39 = load float* %38, align 4
; These three loads go through the addresses prepared in %L3.preheader and are repeated
; on every inner iteration. In the log above they are the instructions costed at 26
; (VF 2) and 52 (VF 4), which is what makes the vector loop more expensive than scalar.
%40 = load %jl_value_t** %17, align 8
%41 = load i64* %19, align 8
%42 = load i64* %21, align 8
%43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0
%44 = bitcast %jl_value_t** %43 to i64*
%45 = load i64* %44, align 8
; Source element index: %50 = (inner - 1 + %41) + (%36 + %42) * %45.
%46 = add i64 %36, %42
%47 = mul i64 %46, %45
%48 = add nsw i64 %"##i#1682.0", -1
%49 = add i64 %48, %41
%50 = add i64 %49, %47
; Base pointer of the source data is also reloaded from %40 on each iteration.
%51 = bitcast %jl_value_t* %40 to float**
%52 = load float** %51, align 8
%53 = getelementptr float* %52, i64 %50
%54 = load float* %53, align 4
; The += itself: add the two floats and store the result back to %38.
%55 = fadd float %39, %54
store float %55, float* %38, align 4
%56 = add nuw nsw i64 %"##i#1682.0", 1
%exitcond = icmp eq i64 %56, %32
br i1 %exitcond, label %L21.loopexit, label %L14
L21.loopexit: ; preds = %L14
br label %L21
L21: ; preds = %L21.loopexit, %pass11
; Outer loop latch: exit once the current counter equals %8, otherwise continue with %57.
%57 = add i64 %"#s1.0", 1
%58 = icmp eq i64 %"#s1.0", %8
br i1 %58, label %L23.loopexit, label %L3
L23.loopexit: ; preds = %L21
br label %L23
L23: ; preds = %L23.loopexit, %top
; Return the first argument object unchanged (the Julia source ends with `return F`).
ret %jl_value_t* %3
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment