ArchRobison · May 26, 2015 22:13
diff --git a/input.jl b/input.jl
 # Accumulate `A` into `F` element-wise, in place: F[i,j] += A[i,j] for every
 # i in 1:size(A,1) and j in 1:size(A,2), then return `F`.
 #
 # `F` is untyped and `A` may be any AbstractArray{T}; `T` is not used in the body.
 # The loop bounds come from `A` only and every access is `@inbounds`, so nothing
 # here checks that `F` is at least as large as `A` -- the caller must ensure it.
 function mysum!{T}(F, A::AbstractArray{T})
    for j = 1:size(A,2)
        # Inner loop runs over the first index and is the one annotated `@simd`.
        @simd for i = 1:size(A,1)
            @inbounds F[i,j] += A[i,j]
        end
    end
    return F
 end
diff --git a/output.txt b/output.txt

 LV: Checking a loop in "julia_mysum!_20989" from /localdisk/adrobiso/julia-trunk/b.cpp:75:1
 LV: Loop hints: force=? width=0 unroll=0
 LV: Found a loop: L14
 LV: Found an induction variable.
 LV: A loop annotated parallel, ignore memory dependency checks.
 LV: We can vectorize this loop!
 LV: Found trip count: 0
 LV: The Widest type: 64 bits.
 LV: The Widest register is: 256 bits.
 LV: Found an estimated cost of 0 for VF 1 For instruction:   %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %37 = add i64 %"##i#1682.0", %35, !dbg !35
 LV: Found an estimated cost of 0 for VF 1 For instruction:   %38 = getelementptr float* %11, i64 %37, !dbg !35
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 0 for VF 1 For instruction:   %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35
 LV: Found an estimated cost of 0 for VF 1 For instruction:   %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %46 = add i64 %36, %42, !dbg !35
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %47 = mul i64 %46, %45, !dbg !35
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %49 = add i64 %48, %41, !dbg !35
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %50 = add i64 %49, %47, !dbg !35
 LV: Found an estimated cost of 0 for VF 1 For instruction:   %51 = bitcast %jl_value_t* %40 to float**, !dbg !35
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 0 for VF 1 For instruction:   %53 = getelementptr float* %52, i64 %50, !dbg !35
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 2 for VF 1 For instruction:   %55 = fadd float %39, %54, !dbg !35
 LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4
 LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond = icmp eq i64 %56, %32, !dbg !49
 LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46
 LV: Scalar loop costs: 18.
 LV: Found an estimated cost of 0 for VF 2 For instruction:   %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
 LV: Found an estimated cost of 1 for VF 2 For instruction:   %37 = add i64 %"##i#1682.0", %35, !dbg !35
 LV: Found an estimated cost of 0 for VF 2 For instruction:   %38 = getelementptr float* %11, i64 %37, !dbg !35
 LV: Found an estimated cost of 1 for VF 2 For instruction:   %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 26 for VF 2 For instruction:   %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 26 for VF 2 For instruction:   %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 26 for VF 2 For instruction:   %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 0 for VF 2 For instruction:   %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35
 LV: Found an estimated cost of 0 for VF 2 For instruction:   %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35
 LV: Found an estimated cost of 26 for VF 2 For instruction:   %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 1 for VF 2 For instruction:   %46 = add i64 %36, %42, !dbg !35
 LV: Found an estimated cost of 9 for VF 2 For instruction:   %47 = mul i64 %46, %45, !dbg !35
 LV: Found an estimated cost of 1 for VF 2 For instruction:   %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35
 LV: Found an estimated cost of 1 for VF 2 For instruction:   %49 = add i64 %48, %41, !dbg !35
 LV: Found an estimated cost of 1 for VF 2 For instruction:   %50 = add i64 %49, %47, !dbg !35
 LV: Found an estimated cost of 0 for VF 2 For instruction:   %51 = bitcast %jl_value_t* %40 to float**, !dbg !35
 LV: Found an estimated cost of 26 for VF 2 For instruction:   %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 0 for VF 2 For instruction:   %53 = getelementptr float* %52, i64 %50, !dbg !35
 LV: Found an estimated cost of 25 for VF 2 For instruction:   %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 2 for VF 2 For instruction:   %55 = fadd float %39, %54, !dbg !35
 LV: Found an estimated cost of 1 for VF 2 For instruction:   store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 1 for VF 2 For instruction:   %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4
 LV: Found an estimated cost of 1 for VF 2 For instruction:   %exitcond = icmp eq i64 %56, %32, !dbg !49
 LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46
 LV: Vector loop of width 2 costs: 87.
 LV: Found an estimated cost of 0 for VF 4 For instruction:   %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
 LV: Found an estimated cost of 1 for VF 4 For instruction:   %37 = add i64 %"##i#1682.0", %35, !dbg !35
 LV: Found an estimated cost of 0 for VF 4 For instruction:   %38 = getelementptr float* %11, i64 %37, !dbg !35
 LV: Found an estimated cost of 1 for VF 4 For instruction:   %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 52 for VF 4 For instruction:   %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 52 for VF 4 For instruction:   %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 52 for VF 4 For instruction:   %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 0 for VF 4 For instruction:   %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35
 LV: Found an estimated cost of 0 for VF 4 For instruction:   %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35
 LV: Found an estimated cost of 52 for VF 4 For instruction:   %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 1 for VF 4 For instruction:   %46 = add i64 %36, %42, !dbg !35
 LV: Found an estimated cost of 9 for VF 4 For instruction:   %47 = mul i64 %46, %45, !dbg !35
 LV: Found an estimated cost of 1 for VF 4 For instruction:   %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35
 LV: Found an estimated cost of 1 for VF 4 For instruction:   %49 = add i64 %48, %41, !dbg !35
 LV: Found an estimated cost of 1 for VF 4 For instruction:   %50 = add i64 %49, %47, !dbg !35
 LV: Found an estimated cost of 0 for VF 4 For instruction:   %51 = bitcast %jl_value_t* %40 to float**, !dbg !35
 LV: Found an estimated cost of 52 for VF 4 For instruction:   %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 0 for VF 4 For instruction:   %53 = getelementptr float* %52, i64 %50, !dbg !35
 LV: Found an estimated cost of 51 for VF 4 For instruction:   %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 2 for VF 4 For instruction:   %55 = fadd float %39, %54, !dbg !35
 LV: Found an estimated cost of 1 for VF 4 For instruction:   store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
 LV: Found an estimated cost of 1 for VF 4 For instruction:   %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4
 LV: Found an estimated cost of 1 for VF 4 For instruction:   %exitcond = icmp eq i64 %56, %32, !dbg !49
 LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46
 LV: Vector loop of width 4 costs: 82.
 LV: Selecting VF: 1.
 LV: The target has 16 registers
 LV(REG): Calculating max register usage:
 LV(REG): At #0 Interval # 0
 LV(REG): At #1 Interval # 1
 LV(REG): At #2 Interval # 2
 LV(REG): At #3 Interval # 2
 LV(REG): At #4 Interval # 3
 LV(REG): At #5 Interval # 4
 LV(REG): At #6 Interval # 5
 LV(REG): At #7 Interval # 6
 LV(REG): At #8 Interval # 7
 LV(REG): At #9 Interval # 7
 LV(REG): At #10 Interval # 7
 LV(REG): At #11 Interval # 7
 LV(REG): At #12 Interval # 6
 LV(REG): At #13 Interval # 7
 LV(REG): At #14 Interval # 6
 LV(REG): At #15 Interval # 5
 LV(REG): At #16 Interval # 5
 LV(REG): At #17 Interval # 5
 LV(REG): At #18 Interval # 4
 LV(REG): At #19 Interval # 4
 LV(REG): At #21 Interval # 3
 LV(REG): At #23 Interval # 4
 LV(REG): Found max usage: 7
 LV(REG): Found invariant usage: 7
 LV(REG): LoopSize: 25
 LV: Loop cost is 18
 LV: Unrolling to reduce branch cost.
 LV: Found a vectorizable loop (1) in /localdisk/adrobiso/julia-trunk/b.cpp:75:1
 LV: Unroll Factor is 1
 LV: Vectorization is possible but not beneficial

 ; Function Attrs: sspreq
 ; Scalar IR for mysum! as dumped after the loop vectorizer chose VF 1 for the
 ; inner loop L14. The third parameter (i32) is never referenced in the body.
 ; %1 is read as an array of %jl_value_t*: element 0 -> %3, element 1 -> %5.
 ; %3 is what the function returns; the Julia source returns F, so %3 is
 ; presumably F and %5 presumably A -- TODO confirm against the calling convention.
 define %jl_value_t* @"julia_mysum!_20989"(%jl_value_t*, %jl_value_t**, i32) #-1 {
 top:
  %3 = load %jl_value_t** %1, align 8
  %4 = getelementptr %jl_value_t** %1, i64 1
  %5 = load %jl_value_t** %4, align 8
  ; %8: i64 loaded from %5 at jl_value_t index 6. It gates entry to the outer
  ; loop here and is the outer-loop bound tested in L21.
  %6 = getelementptr inbounds %jl_value_t* %5, i64 6
  %7 = bitcast %jl_value_t* %6 to i64*
  %8 = load i64* %7, align 8
  %9 = icmp sgt i64 %8, 0
  br i1 %9, label %L3.preheader, label %L23

 L3.preheader:                                     ; preds = %top
  ; Loaded once, before the loops: %11 (float data pointer read from %3) and
  ; %14 (i64 at index 3 of %3, used in if12 as the per-outer-index stride into %11).
  ; For %5 only addresses are formed here (%16, %17, %19, %21); the loads
  ; through them happen inside the loops.
  %10 = bitcast %jl_value_t* %3 to float**
  %11 = load float** %10, align 8
  %12 = getelementptr inbounds %jl_value_t* %3, i64 3, i32 0
  %13 = bitcast %jl_value_t** %12 to i64*
  %14 = load i64* %13, align 8
  %15 = getelementptr %jl_value_t* %5, i64 5
  %16 = bitcast %jl_value_t* %15 to i64*
  %17 = getelementptr inbounds %jl_value_t* %5, i64 0, i32 0
  %18 = getelementptr %jl_value_t* %5, i64 1
  %19 = bitcast %jl_value_t* %18 to i64*
  %20 = getelementptr inbounds %jl_value_t* %5, i64 3
  %21 = bitcast %jl_value_t* %20 to i64*
  br label %L3

 L3:                                               ; preds = %L21, %L3.preheader
  ; Outer loop over %"#s1.0", starting at 1. %22 is reloaded on every outer
  ; iteration and clamped to >= 0; the overflow-checked -1 / +1 pair that
  ; follows yields the inner trip count %32.
  %"#s1.0" = phi i64 [ %57, %L21 ], [ 1, %L3.preheader ]
  %22 = load i64* %16, align 8
  %23 = icmp sgt i64 %22, 0
  %24 = select i1 %23, i64 %22, i64 0
  %25 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %24, i64 1)
  %26 = extractvalue { i64, i1 } %25, 1
  br i1 %26, label %fail.split, label %L3.L3.split_crit_edge

 L3.L3.split_crit_edge:                            ; preds = %L3
  %27 = extractvalue { i64, i1 } %25, 0
  %28 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %27, i64 1)
  %29 = extractvalue { i64, i1 } %28, 1
  br i1 %29, label %fail10, label %pass11

 fail.split:                                       ; preds = %L3
  ; Overflow in the subtraction: throw jl_overflow_exception.
  %30 = load %jl_value_t** @jl_overflow_exception, align 8
  call void @jl_throw_with_superfluous_argument(%jl_value_t* %30, i32 67)
  unreachable

 fail10:                                           ; preds = %L3.L3.split_crit_edge
  ; Overflow in the addition: throw jl_overflow_exception.
  %31 = load %jl_value_t** @jl_overflow_exception, align 8
  call void @jl_throw_with_superfluous_argument(%jl_value_t* %31, i32 67)
  unreachable

 pass11:                                           ; preds = %L3.L3.split_crit_edge
  ; %32 is the inner trip count; skip the inner loop entirely when it is < 1.
  %32 = extractvalue { i64, i1 } %28, 0
  %33 = icmp slt i64 %32, 1
  br i1 %33, label %L21, label %if12

 if12:                                             ; preds = %pass11
  ; %35 = (%"#s1.0" - 1) * %14: offset of the current outer index within %11.
  ; %36 = %"#s1.0" - 2, folded into the %5-side index inside L14.
  %34 = add i64 %"#s1.0", -1
  %35 = mul i64 %34, %14
  %36 = add i64 %"#s1.0", -2
  br label %L14

 L14:                                              ; preds = %L14, %if12
  ; Inner loop; %"##i#1682.0" counts from 0 until it reaches %32.
  %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
  %37 = add i64 %"##i#1682.0", %35
  %38 = getelementptr float* %11, i64 %37
  %39 = load float* %38, align 4
  ; The loads %40, %41, %42, %45 and %52 go through addresses that do not
  ; involve the induction variable, yet they are repeated on every iteration.
  ; In the cost listing above these are the loads priced at 26 (VF 2) and
  ; 52 (VF 4).
  %40 = load %jl_value_t** %17, align 8
  %41 = load i64* %19, align 8
  %42 = load i64* %21, align 8
  %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0
  %44 = bitcast %jl_value_t** %43 to i64*
  %45 = load i64* %44, align 8
  ; %50 = (%"##i#1682.0" - 1) + %41 + (%36 + %42) * %45: element index into %52.
  %46 = add i64 %36, %42
  %47 = mul i64 %46, %45
  %48 = add nsw i64 %"##i#1682.0", -1
  %49 = add i64 %48, %41
  %50 = add i64 %49, %47
  %51 = bitcast %jl_value_t* %40 to float**
  %52 = load float** %51, align 8
  %53 = getelementptr float* %52, i64 %50
  %54 = load float* %53, align 4
  ; Add the %52 element to the %11 element and store the sum back through %38.
  %55 = fadd float %39, %54
  store float %55, float* %38, align 4
  %56 = add nuw nsw i64 %"##i#1682.0", 1
  %exitcond = icmp eq i64 %56, %32
  br i1 %exitcond, label %L21.loopexit, label %L14

 L21.loopexit:                                     ; preds = %L14
  br label %L21

 L21:                                              ; preds = %L21.loopexit, %pass11
  ; Advance the outer index; leave once the index just processed equals %8.
  %57 = add i64 %"#s1.0", 1
  %58 = icmp eq i64 %"#s1.0", %8
  br i1 %58, label %L23.loopexit, label %L3

 L23.loopexit:                                     ; preds = %L21
  br label %L23

 L23:                                              ; preds = %L23.loopexit, %top
  ret %jl_value_t* %3
 }
	# Accumulate `A` into `F` element-wise, in place: F[i,j] += A[i,j] for every
	# i in 1:size(A,1) and j in 1:size(A,2), then return `F`.
	#
	# `F` is untyped and `A` may be any AbstractArray{T}; `T` is not used in the body.
	# The loop bounds come from `A` only and every access is `@inbounds`, so nothing
	# here checks that `F` is at least as large as `A` -- the caller must ensure it.
	function mysum!{T}(F, A::AbstractArray{T})
	for j = 1:size(A,2)
	# Inner loop runs over the first index and is the one annotated `@simd`.
	@simd for i = 1:size(A,1)
	@inbounds F[i,j] += A[i,j]
	end
	end
	return F
	end

	LV: Checking a loop in "julia_mysum!_20989" from /localdisk/adrobiso/julia-trunk/b.cpp:75:1
	LV: Loop hints: force=? width=0 unroll=0
	LV: Found a loop: L14
	LV: Found an induction variable.
	LV: A loop annotated parallel, ignore memory dependency checks.
	LV: We can vectorize this loop!
	LV: Found trip count: 0
	LV: The Widest type: 64 bits.
	LV: The Widest register is: 256 bits.
	LV: Found an estimated cost of 0 for VF 1 For instruction: %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
	LV: Found an estimated cost of 1 for VF 1 For instruction: %37 = add i64 %"##i#1682.0", %35, !dbg !35
	LV: Found an estimated cost of 0 for VF 1 For instruction: %38 = getelementptr float* %11, i64 %37, !dbg !35
	LV: Found an estimated cost of 1 for VF 1 For instruction: %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 1 for VF 1 For instruction: %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 1 for VF 1 For instruction: %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 1 for VF 1 For instruction: %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 0 for VF 1 For instruction: %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35
	LV: Found an estimated cost of 0 for VF 1 For instruction: %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35
	LV: Found an estimated cost of 1 for VF 1 For instruction: %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 1 for VF 1 For instruction: %46 = add i64 %36, %42, !dbg !35
	LV: Found an estimated cost of 1 for VF 1 For instruction: %47 = mul i64 %46, %45, !dbg !35
	LV: Found an estimated cost of 1 for VF 1 For instruction: %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35
	LV: Found an estimated cost of 1 for VF 1 For instruction: %49 = add i64 %48, %41, !dbg !35
	LV: Found an estimated cost of 1 for VF 1 For instruction: %50 = add i64 %49, %47, !dbg !35
	LV: Found an estimated cost of 0 for VF 1 For instruction: %51 = bitcast %jl_value_t* %40 to float**, !dbg !35
	LV: Found an estimated cost of 1 for VF 1 For instruction: %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 0 for VF 1 For instruction: %53 = getelementptr float* %52, i64 %50, !dbg !35
	LV: Found an estimated cost of 1 for VF 1 For instruction: %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 2 for VF 1 For instruction: %55 = fadd float %39, %54, !dbg !35
	LV: Found an estimated cost of 1 for VF 1 For instruction: store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 1 for VF 1 For instruction: %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4
	LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond = icmp eq i64 %56, %32, !dbg !49
	LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46
	LV: Scalar loop costs: 18.
	LV: Found an estimated cost of 0 for VF 2 For instruction: %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
	LV: Found an estimated cost of 1 for VF 2 For instruction: %37 = add i64 %"##i#1682.0", %35, !dbg !35
	LV: Found an estimated cost of 0 for VF 2 For instruction: %38 = getelementptr float* %11, i64 %37, !dbg !35
	LV: Found an estimated cost of 1 for VF 2 For instruction: %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 26 for VF 2 For instruction: %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 26 for VF 2 For instruction: %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 26 for VF 2 For instruction: %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 0 for VF 2 For instruction: %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35
	LV: Found an estimated cost of 0 for VF 2 For instruction: %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35
	LV: Found an estimated cost of 26 for VF 2 For instruction: %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 1 for VF 2 For instruction: %46 = add i64 %36, %42, !dbg !35
	LV: Found an estimated cost of 9 for VF 2 For instruction: %47 = mul i64 %46, %45, !dbg !35
	LV: Found an estimated cost of 1 for VF 2 For instruction: %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35
	LV: Found an estimated cost of 1 for VF 2 For instruction: %49 = add i64 %48, %41, !dbg !35
	LV: Found an estimated cost of 1 for VF 2 For instruction: %50 = add i64 %49, %47, !dbg !35
	LV: Found an estimated cost of 0 for VF 2 For instruction: %51 = bitcast %jl_value_t* %40 to float**, !dbg !35
	LV: Found an estimated cost of 26 for VF 2 For instruction: %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 0 for VF 2 For instruction: %53 = getelementptr float* %52, i64 %50, !dbg !35
	LV: Found an estimated cost of 25 for VF 2 For instruction: %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 2 for VF 2 For instruction: %55 = fadd float %39, %54, !dbg !35
	LV: Found an estimated cost of 1 for VF 2 For instruction: store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 1 for VF 2 For instruction: %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4
	LV: Found an estimated cost of 1 for VF 2 For instruction: %exitcond = icmp eq i64 %56, %32, !dbg !49
	LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46
	LV: Vector loop of width 2 costs: 87.
	LV: Found an estimated cost of 0 for VF 4 For instruction: %"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
	LV: Found an estimated cost of 1 for VF 4 For instruction: %37 = add i64 %"##i#1682.0", %35, !dbg !35
	LV: Found an estimated cost of 0 for VF 4 For instruction: %38 = getelementptr float* %11, i64 %37, !dbg !35
	LV: Found an estimated cost of 1 for VF 4 For instruction: %39 = load float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 52 for VF 4 For instruction: %40 = load %jl_value_t** %17, align 8, !dbg !35, !tbaa !47, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 52 for VF 4 For instruction: %41 = load i64* %19, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 52 for VF 4 For instruction: %42 = load i64* %21, align 8, !dbg !35, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 0 for VF 4 For instruction: %43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0, !dbg !35
	LV: Found an estimated cost of 0 for VF 4 For instruction: %44 = bitcast %jl_value_t** %43 to i64*, !dbg !35
	LV: Found an estimated cost of 52 for VF 4 For instruction: %45 = load i64* %44, align 8, !dbg !35, !tbaa !31, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 1 for VF 4 For instruction: %46 = add i64 %36, %42, !dbg !35
	LV: Found an estimated cost of 9 for VF 4 For instruction: %47 = mul i64 %46, %45, !dbg !35
	LV: Found an estimated cost of 1 for VF 4 For instruction: %48 = add nsw i64 %"##i#1682.0", -1, !dbg !35
	LV: Found an estimated cost of 1 for VF 4 For instruction: %49 = add i64 %48, %41, !dbg !35
	LV: Found an estimated cost of 1 for VF 4 For instruction: %50 = add i64 %49, %47, !dbg !35
	LV: Found an estimated cost of 0 for VF 4 For instruction: %51 = bitcast %jl_value_t* %40 to float**, !dbg !35
	LV: Found an estimated cost of 52 for VF 4 For instruction: %52 = load float** %51, align 8, !dbg !35, !tbaa !27, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 0 for VF 4 For instruction: %53 = getelementptr float* %52, i64 %50, !dbg !35
	LV: Found an estimated cost of 51 for VF 4 For instruction: %54 = load float* %53, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 2 for VF 4 For instruction: %55 = fadd float %39, %54, !dbg !35
	LV: Found an estimated cost of 1 for VF 4 For instruction: store float %55, float* %38, align 4, !dbg !35, !tbaa !44, !llvm.mem.parallel_loop_access !45
	LV: Found an estimated cost of 1 for VF 4 For instruction: %56 = add nuw nsw i64 %"##i#1682.0", 1, !dbg !48, !simd_loop !4
	LV: Found an estimated cost of 1 for VF 4 For instruction: %exitcond = icmp eq i64 %56, %32, !dbg !49
	LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %exitcond, label %L21.loopexit, label %L14, !dbg !49, !llvm.loop !46
	LV: Vector loop of width 4 costs: 82.
	LV: Selecting VF: 1.
	LV: The target has 16 registers
	LV(REG): Calculating max register usage:
	LV(REG): At #0 Interval # 0
	LV(REG): At #1 Interval # 1
	LV(REG): At #2 Interval # 2
	LV(REG): At #3 Interval # 2
	LV(REG): At #4 Interval # 3
	LV(REG): At #5 Interval # 4
	LV(REG): At #6 Interval # 5
	LV(REG): At #7 Interval # 6
	LV(REG): At #8 Interval # 7
	LV(REG): At #9 Interval # 7
	LV(REG): At #10 Interval # 7
	LV(REG): At #11 Interval # 7
	LV(REG): At #12 Interval # 6
	LV(REG): At #13 Interval # 7
	LV(REG): At #14 Interval # 6
	LV(REG): At #15 Interval # 5
	LV(REG): At #16 Interval # 5
	LV(REG): At #17 Interval # 5
	LV(REG): At #18 Interval # 4
	LV(REG): At #19 Interval # 4
	LV(REG): At #21 Interval # 3
	LV(REG): At #23 Interval # 4
	LV(REG): Found max usage: 7
	LV(REG): Found invariant usage: 7
	LV(REG): LoopSize: 25
	LV: Loop cost is 18
	LV: Unrolling to reduce branch cost.
	LV: Found a vectorizable loop (1) in /localdisk/adrobiso/julia-trunk/b.cpp:75:1
	LV: Unroll Factor is 1
	LV: Vectorization is possible but not beneficial

	; Function Attrs: sspreq
	; Scalar IR for mysum! as dumped after the loop vectorizer chose VF 1 for the
	; inner loop L14. The third parameter (i32) is never referenced in the body.
	; %1 is read as an array of %jl_value_t*: element 0 -> %3, element 1 -> %5.
	; %3 is what the function returns; the Julia source returns F, so %3 is
	; presumably F and %5 presumably A -- TODO confirm against the calling convention.
	define %jl_value_t* @"julia_mysum!_20989"(%jl_value_t*, %jl_value_t**, i32) #-1 {
	top:
	%3 = load %jl_value_t** %1, align 8
	%4 = getelementptr %jl_value_t** %1, i64 1
	%5 = load %jl_value_t** %4, align 8
	; %8: i64 loaded from %5 at jl_value_t index 6. It gates entry to the outer
	; loop here and is the outer-loop bound tested in L21.
	%6 = getelementptr inbounds %jl_value_t* %5, i64 6
	%7 = bitcast %jl_value_t* %6 to i64*
	%8 = load i64* %7, align 8
	%9 = icmp sgt i64 %8, 0
	br i1 %9, label %L3.preheader, label %L23

	L3.preheader: ; preds = %top
	; Loaded once, before the loops: %11 (float data pointer read from %3) and
	; %14 (i64 at index 3 of %3, used in if12 as the per-outer-index stride into %11).
	; For %5 only addresses are formed here (%16, %17, %19, %21); the loads
	; through them happen inside the loops.
	%10 = bitcast %jl_value_t* %3 to float**
	%11 = load float** %10, align 8
	%12 = getelementptr inbounds %jl_value_t* %3, i64 3, i32 0
	%13 = bitcast %jl_value_t** %12 to i64*
	%14 = load i64* %13, align 8
	%15 = getelementptr %jl_value_t* %5, i64 5
	%16 = bitcast %jl_value_t* %15 to i64*
	%17 = getelementptr inbounds %jl_value_t* %5, i64 0, i32 0
	%18 = getelementptr %jl_value_t* %5, i64 1
	%19 = bitcast %jl_value_t* %18 to i64*
	%20 = getelementptr inbounds %jl_value_t* %5, i64 3
	%21 = bitcast %jl_value_t* %20 to i64*
	br label %L3

	L3: ; preds = %L21, %L3.preheader
	; Outer loop over %"#s1.0", starting at 1. %22 is reloaded on every outer
	; iteration and clamped to >= 0; the overflow-checked -1 / +1 pair that
	; follows yields the inner trip count %32.
	%"#s1.0" = phi i64 [ %57, %L21 ], [ 1, %L3.preheader ]
	%22 = load i64* %16, align 8
	%23 = icmp sgt i64 %22, 0
	%24 = select i1 %23, i64 %22, i64 0
	%25 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %24, i64 1)
	%26 = extractvalue { i64, i1 } %25, 1
	br i1 %26, label %fail.split, label %L3.L3.split_crit_edge

	L3.L3.split_crit_edge: ; preds = %L3
	%27 = extractvalue { i64, i1 } %25, 0
	%28 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %27, i64 1)
	%29 = extractvalue { i64, i1 } %28, 1
	br i1 %29, label %fail10, label %pass11

	fail.split: ; preds = %L3
	; Overflow in the subtraction: throw jl_overflow_exception.
	%30 = load %jl_value_t** @jl_overflow_exception, align 8
	call void @jl_throw_with_superfluous_argument(%jl_value_t* %30, i32 67)
	unreachable

	fail10: ; preds = %L3.L3.split_crit_edge
	; Overflow in the addition: throw jl_overflow_exception.
	%31 = load %jl_value_t** @jl_overflow_exception, align 8
	call void @jl_throw_with_superfluous_argument(%jl_value_t* %31, i32 67)
	unreachable

	pass11: ; preds = %L3.L3.split_crit_edge
	; %32 is the inner trip count; skip the inner loop entirely when it is < 1.
	%32 = extractvalue { i64, i1 } %28, 0
	%33 = icmp slt i64 %32, 1
	br i1 %33, label %L21, label %if12

	if12: ; preds = %pass11
	; %35 = (%"#s1.0" - 1) * %14: offset of the current outer index within %11.
	; %36 = %"#s1.0" - 2, folded into the %5-side index inside L14.
	%34 = add i64 %"#s1.0", -1
	%35 = mul i64 %34, %14
	%36 = add i64 %"#s1.0", -2
	br label %L14

	L14: ; preds = %L14, %if12
	; Inner loop; %"##i#1682.0" counts from 0 until it reaches %32.
	%"##i#1682.0" = phi i64 [ 0, %if12 ], [ %56, %L14 ]
	%37 = add i64 %"##i#1682.0", %35
	%38 = getelementptr float* %11, i64 %37
	%39 = load float* %38, align 4
	; The loads %40, %41, %42, %45 and %52 go through addresses that do not
	; involve the induction variable, yet they are repeated on every iteration.
	; In the cost listing above these are the loads priced at 26 (VF 2) and
	; 52 (VF 4).
	%40 = load %jl_value_t** %17, align 8
	%41 = load i64* %19, align 8
	%42 = load i64* %21, align 8
	%43 = getelementptr inbounds %jl_value_t* %40, i64 3, i32 0
	%44 = bitcast %jl_value_t** %43 to i64*
	%45 = load i64* %44, align 8
	; %50 = (%"##i#1682.0" - 1) + %41 + (%36 + %42) * %45: element index into %52.
	%46 = add i64 %36, %42
	%47 = mul i64 %46, %45
	%48 = add nsw i64 %"##i#1682.0", -1
	%49 = add i64 %48, %41
	%50 = add i64 %49, %47
	%51 = bitcast %jl_value_t* %40 to float**
	%52 = load float** %51, align 8
	%53 = getelementptr float* %52, i64 %50
	%54 = load float* %53, align 4
	; Add the %52 element to the %11 element and store the sum back through %38.
	%55 = fadd float %39, %54
	store float %55, float* %38, align 4
	%56 = add nuw nsw i64 %"##i#1682.0", 1
	%exitcond = icmp eq i64 %56, %32
	br i1 %exitcond, label %L21.loopexit, label %L14

	L21.loopexit: ; preds = %L14
	br label %L21

	L21: ; preds = %L21.loopexit, %pass11
	; Advance the outer index; leave once the index just processed equals %8.
	%57 = add i64 %"#s1.0", 1
	%58 = icmp eq i64 %"#s1.0", %8
	br i1 %58, label %L23.loopexit, label %L3

	L23.loopexit: ; preds = %L21
	br label %L23

	L23: ; preds = %L23.loopexit, %top
	ret %jl_value_t* %3
	}