Created
May 26, 2015 22:13
-
-
Save ArchRobison/654a6573a248fd46b71d to your computer and use it in GitHub Desktop.
Example input and output from Julia using LLVM 3.6.1 and JULIA_LLVM_ARGS=-debug-only=loop-vectorize
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # mysum!(F, A): add every element A[i,j] into F[i,j] in place and return F. | |
| # Both loop ranges are taken from size(A); F is indexed with the same (i, j) under | |
| # @inbounds, so nothing in this function checks that F is at least as large as A. | |
| function mysum!{T}(F, A::AbstractArray{T}) | |
| for j = 1:size(A,2) | |
| # Inner loop over the first dimension; @simd marks it as a vectorization candidate. | |
| @simd for i = 1:size(A,1) | |
| # @inbounds turns off bounds checking for both the F and the A access. | |
| @inbounds F[i,j] += A[i,j] | |
| end | |
| end | |
| return F | |
| end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| LV: Checking a loop in "julia_mysum!_20989" from /localdisk/adrobiso/julia-trunk/b.cpp:75:1 | |
| LV: Loop hints: force=? width=0 unroll=0 | |
| LV: Found a loop: L14 | |
| LV: Found an induction variable. | |
| LV: A loop annotated parallel, ignore memory dependency checks. | |
| LV: We can vectorize this loop! | |
| LV: Found trip count: 0 | |
| LV: The Widest type: 64 bits. | |
| LV: The Widest register is: 256 bits. | |
| LV: Found an estimated cost of 0 for VF 1 For instruction: %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ] | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %37 = add i64 %"##i#1682.0", %35, !dbg !35 | |
| LV: Found an estimated cost of 0 for VF 1 For instruction: %38 = getelementptr float* %11, i64 %37, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 0 for VF 1 For instruction: %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35 | |
| LV: Found an estimated cost of 0 for VF 1 For instruction: %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %46 = add i64 %36, %42, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %47 = mul i64 %46, %45, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %49 = add i64 %48, %41, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %50 = add i64 %49, %47, !dbg !35 | |
| LV: Found an estimated cost of 0 for VF 1 For instruction: %51 = bitcast %jl_value_t* %40 to float**, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 0 for VF 1 For instruction: %53 = getelementptr float* %52, i64 %50, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 2 for VF 1 For instruction: %55 = fadd float %39, %54, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4 | |
| LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond = icmp eq i64 %56, %32, !dbg !49 | |
| LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46 | |
| LV: Scalar loop costs: 18. | |
| LV: Found an estimated cost of 0 for VF 2 For instruction: %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ] | |
| LV: Found an estimated cost of 1 for VF 2 For instruction: %37 = add i64 %"##i#1682.0", %35, !dbg !35 | |
| LV: Found an estimated cost of 0 for VF 2 For instruction: %38 = getelementptr float* %11, i64 %37, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 2 For instruction: %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 26 for VF 2 For instruction: %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 26 for VF 2 For instruction: %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 26 for VF 2 For instruction: %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 0 for VF 2 For instruction: %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35 | |
| LV: Found an estimated cost of 0 for VF 2 For instruction: %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35 | |
| LV: Found an estimated cost of 26 for VF 2 For instruction: %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 1 for VF 2 For instruction: %46 = add i64 %36, %42, !dbg !35 | |
| LV: Found an estimated cost of 9 for VF 2 For instruction: %47 = mul i64 %46, %45, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 2 For instruction: %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 2 For instruction: %49 = add i64 %48, %41, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 2 For instruction: %50 = add i64 %49, %47, !dbg !35 | |
| LV: Found an estimated cost of 0 for VF 2 For instruction: %51 = bitcast %jl_value_t* %40 to float**, !dbg !35 | |
| LV: Found an estimated cost of 26 for VF 2 For instruction: %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 0 for VF 2 For instruction: %53 = getelementptr float* %52, i64 %50, !dbg !35 | |
| LV: Found an estimated cost of 25 for VF 2 For instruction: %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 2 for VF 2 For instruction: %55 = fadd float %39, %54, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 2 For instruction: store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 1 for VF 2 For instruction: %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4 | |
| LV: Found an estimated cost of 1 for VF 2 For instruction: %exitcond = icmp eq i64 %56, %32, !dbg !49 | |
| LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46 | |
| LV: Vector loop of width 2 costs: 87. | |
| LV: Found an estimated cost of 0 for VF 4 For instruction: %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ] | |
| LV: Found an estimated cost of 1 for VF 4 For instruction: %37 = add i64 %"##i#1682.0", %35, !dbg !35 | |
| LV: Found an estimated cost of 0 for VF 4 For instruction: %38 = getelementptr float* %11, i64 %37, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 4 For instruction: %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 52 for VF 4 For instruction: %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 52 for VF 4 For instruction: %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 52 for VF 4 For instruction: %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 0 for VF 4 For instruction: %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35 | |
| LV: Found an estimated cost of 0 for VF 4 For instruction: %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35 | |
| LV: Found an estimated cost of 52 for VF 4 For instruction: %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 1 for VF 4 For instruction: %46 = add i64 %36, %42, !dbg !35 | |
| LV: Found an estimated cost of 9 for VF 4 For instruction: %47 = mul i64 %46, %45, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 4 For instruction: %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 4 For instruction: %49 = add i64 %48, %41, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 4 For instruction: %50 = add i64 %49, %47, !dbg !35 | |
| LV: Found an estimated cost of 0 for VF 4 For instruction: %51 = bitcast %jl_value_t* %40 to float**, !dbg !35 | |
| LV: Found an estimated cost of 52 for VF 4 For instruction: %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 0 for VF 4 For instruction: %53 = getelementptr float* %52, i64 %50, !dbg !35 | |
| LV: Found an estimated cost of 51 for VF 4 For instruction: %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 2 for VF 4 For instruction: %55 = fadd float %39, %54, !dbg !35 | |
| LV: Found an estimated cost of 1 for VF 4 For instruction: store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45 | |
| LV: Found an estimated cost of 1 for VF 4 For instruction: %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4 | |
| LV: Found an estimated cost of 1 for VF 4 For instruction: %exitcond = icmp eq i64 %56, %32, !dbg !49 | |
| LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46 | |
| LV: Vector loop of width 4 costs: 82. | |
| LV: Selecting VF: 1. | |
| LV: The target has 16 registers | |
| LV(REG): Calculating max register usage: | |
| LV(REG): At #0 Interval # 0 | |
| LV(REG): At #1 Interval # 1 | |
| LV(REG): At #2 Interval # 2 | |
| LV(REG): At #3 Interval # 2 | |
| LV(REG): At #4 Interval # 3 | |
| LV(REG): At #5 Interval # 4 | |
| LV(REG): At #6 Interval # 5 | |
| LV(REG): At #7 Interval # 6 | |
| LV(REG): At #8 Interval # 7 | |
| LV(REG): At #9 Interval # 7 | |
| LV(REG): At #10 Interval # 7 | |
| LV(REG): At #11 Interval # 7 | |
| LV(REG): At #12 Interval # 6 | |
| LV(REG): At #13 Interval # 7 | |
| LV(REG): At #14 Interval # 6 | |
| LV(REG): At #15 Interval # 5 | |
| LV(REG): At #16 Interval # 5 | |
| LV(REG): At #17 Interval # 5 | |
| LV(REG): At #18 Interval # 4 | |
| LV(REG): At #19 Interval # 4 | |
| LV(REG): At #21 Interval # 3 | |
| LV(REG): At #23 Interval # 4 | |
| LV(REG): Found max usage: 7 | |
| LV(REG): Found invariant usage: 7 | |
| LV(REG): LoopSize: 25 | |
| LV: Loop cost is 18 | |
| LV: Unrolling to reduce branch cost. | |
| LV: Found a vectorizable loop (1) in /localdisk/adrobiso/julia-trunk/b.cpp:75:1 | |
| LV: Unroll Factor is 1 | |
| LV: Vectorization is possible but not beneficial | |
| ; LLVM IR for the Julia function mysum! whose inner loop the LV log analyzes. | |
| ; Shape: %top -> outer loop %L3 (counter %"#s1.0") -> inner loop %L14 (counter %"##i#1682.0"). | |
| ; Function Attrs: sspreq | |
| define %jl_value_t* @"julia_mysum!_20989"(%jl_value_t*, %jl_value_t**, i32) #-1 { | |
| top: | |
| ; %3 and %5 are the first and second entries of the argument array %1. | |
| %3 = load %jl_value_t** %1, align 8 | |
| %4 = getelementptr %jl_value_t** %1, i64 1 | |
| %5 = load %jl_value_t** %4, align 8 | |
| ; %8 is read from %5 and is the outer-loop bound (presumably size(A,2) -- TODO confirm against the object layout). | |
| %6 = getelementptr inbounds %jl_value_t* %5, i64 6 | |
| %7 = bitcast %jl_value_t* %6 to i64* | |
| %8 = load i64* %7, align 8 | |
| ; Skip both loops entirely when %8 is not positive. | |
| %9 = icmp sgt i64 %8, 0 | |
| br i1 %9, label %L3.preheader, label %L23 | |
| L3.preheader: ; preds = %top | |
| ; Computed once before the loops: %11 and %14 are values loaded from %3; | |
| ; %16, %17, %19 and %21 are only addresses derived from %5 (the loads through them happen later). | |
| %10 = bitcast %jl_value_t* %3 to float** | |
| %11 = load float** %10, align 8 | |
| %12 = getelementptr inbounds %jl_value_t* %3, i64 3, i32 0 | |
| %13 = bitcast %jl_value_t** %12 to i64* | |
| %14 = load i64* %13, align 8 | |
| %15 = getelementptr %jl_value_t* %5, i64 5 | |
| %16 = bitcast %jl_value_t* %15 to i64* | |
| %17 = getelementptr inbounds %jl_value_t* %5, i64 0, i32 0 | |
| %18 = getelementptr %jl_value_t* %5, i64 1 | |
| %19 = bitcast %jl_value_t* %18 to i64* | |
| %20 = getelementptr inbounds %jl_value_t* %5, i64 3 | |
| %21 = bitcast %jl_value_t* %20 to i64* | |
| br label %L3 | |
| L3: ; preds = %L21, %L3.preheader | |
| ; Outer loop header; %"#s1.0" starts at 1. | |
| %"#s1.0" = phi i64 [ %57, %L21 ], [ 1, %L3.preheader ] | |
| ; %22 is reloaded through %16 on every outer iteration, clamped to >= 0, then passed through | |
| ; a checked "- 1" followed by a checked "+ 1"; either overflow branches to a throw block. | |
| %22 = load i64* %16, align 8 | |
| %23 = icmp sgt i64 %22, 0 | |
| %24 = select i1 %23, i64 %22, i64 0 | |
| %25 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %24, i64 1) | |
| %26 = extractvalue { i64, i1 } %25, 1 | |
| br i1 %26, label %fail.split, label %L3.L3.split_crit_edge | |
| L3.L3.split_crit_edge: ; preds = %L3 | |
| %27 = extractvalue { i64, i1 } %25, 0 | |
| %28 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %27, i64 1) | |
| %29 = extractvalue { i64, i1 } %28, 1 | |
| br i1 %29, label %fail10, label %pass11 | |
| fail.split: ; preds = %L3 | |
| ; Overflow in the checked subtraction: throw jl_overflow_exception. | |
| %30 = load %jl_value_t** @jl_overflow_exception, align 8 | |
| call void @jl_throw_with_superfluous_argument(%jl_value_t* %30, i32 67) | |
| unreachable | |
| fail10: ; preds = %L3.L3.split_crit_edge | |
| ; Overflow in the checked addition: throw jl_overflow_exception. | |
| %31 = load %jl_value_t** @jl_overflow_exception, align 8 | |
| call void @jl_throw_with_superfluous_argument(%jl_value_t* %31, i32 67) | |
| unreachable | |
| pass11: ; preds = %L3.L3.split_crit_edge | |
| ; %32 is the inner-loop trip count (compared by %exitcond); skip the inner loop when it is < 1. | |
| %32 = extractvalue { i64, i1 } %28, 0 | |
| %33 = icmp slt i64 %32, 1 | |
| br i1 %33, label %L21, label %if12 | |
| if12: ; preds = %pass11 | |
| ; Values fixed for this outer iteration: %35 = (%"#s1.0" - 1) * %14 and %36 = %"#s1.0" - 2. | |
| %34 = add i64 %"#s1.0", -1 | |
| %35 = mul i64 %34, %14 | |
| %36 = add i64 %"#s1.0", -2 | |
| br label %L14 | |
| L14: ; preds = %L14, %if12 | |
| ; Inner loop body, counter starting at 0. | |
| %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ] | |
| ; %38 addresses element (%"##i#1682.0" + %35) of %11; it is loaded here and stored back below. | |
| %37 = add i64 %"##i#1682.0", %35 | |
| %38 = getelementptr float* %11, i64 %37 | |
| %39 = load float* %38, align 4 | |
| ; The loads %40, %41, %42 go through the preheader addresses %17, %19, %21, and %45, %52 go | |
| ; through %40; all five are issued on every inner iteration and feed the index %50 of the second operand. | |
| ; In the LV log these are the loads costed at 26 for VF 2 and 52 for VF 4. | |
| %40 = load %jl_value_t** %17, align 8 | |
| %41 = load i64* %19, align 8 | |
| %42 = load i64* %21, align 8 | |
| %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0 | |
| %44 = bitcast %jl_value_t** %43 to i64* | |
| %45 = load i64* %44, align 8 | |
| ; %50 = (%"##i#1682.0" - 1) + %41 + (%36 + %42) * %45 | |
| %46 = add i64 %36, %42 | |
| %47 = mul i64 %46, %45 | |
| %48 = add nsw i64 %"##i#1682.0", -1 | |
| %49 = add i64 %48, %41 | |
| %50 = add i64 %49, %47 | |
| %51 = bitcast %jl_value_t* %40 to float** | |
| %52 = load float** %51, align 8 | |
| %53 = getelementptr float* %52, i64 %50 | |
| %54 = load float* %53, align 4 | |
| ; Add the two floats and write the sum back to %38. | |
| %55 = fadd float %39, %54 | |
| store float %55, float* %38, align 4 | |
| ; Advance the inner counter and leave the loop once it reaches %32. | |
| %56 = add nuw nsw i64 %"##i#1682.0", 1 | |
| %exitcond = icmp eq i64 %56, %32 | |
| br i1 %exitcond, label %L21.loopexit, label %L14 | |
| L21.loopexit: ; preds = %L14 | |
| br label %L21 | |
| L21: ; preds = %L21.loopexit, %pass11 | |
| ; Outer-loop latch: exit once %"#s1.0" equals %8, otherwise continue with %57 = %"#s1.0" + 1. | |
| %57 = add i64 %"#s1.0", 1 | |
| %58 = icmp eq i64 %"#s1.0", %8 | |
| br i1 %58, label %L23.loopexit, label %L3 | |
| L23.loopexit: ; preds = %L21 | |
| br label %L23 | |
| L23: ; preds = %L23.loopexit, %top | |
| ; Return %3, the first entry of the argument array. | |
| ret %jl_value_t* %3 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment