November 28, 2016 10:41 · November 29, 2016 01:00 · November 28, 2016 23:03 · November 29, 2016 03:17 · November 29, 2016 05:33 · November 29, 2016 05:36
 $ ./opt -O3 -debug-pass=Structure -o swap.opt.bc swap.bc
 Pass Arguments:  -tti -tbaa -scoped-noalias -assumption-cache-tracker -targetlibinfo -verify -simplifycfg -domtree -sroa -early-cse -basicaa -aa -memdep -memoryssa -gvn-hoist -lower-expect
 Target Transform Information
 Type-Based Alias Analysis
 Scoped NoAlias Alias Analysis
 Assumption Cache Tracker
 Target Library Information
  FunctionPass Manager
    Module Verifier
    Simplify the CFG
 define i32 @_Z4testv() #0 {
 entry:
  %a = alloca i32, align 4
  %b = alloca i32, align 4
  %call = call i32 @_Z3getv()             ; int a = get();
  store i32 %call, i32* %a, align 4
  %call1 = call i32 @_Z3getv()            ; int b = get();
  store i32 %call1, i32* %b, align 4
  %0 = load i32, i32* %a, align 4         ; temp_swap(a, b);
  %1 = load i32, i32* %b, align 4
 ; temp_swap<int>(int&, int&): exchanges the two i32 values behind %a and %b.
 ; Both values are loaded into SSA temporaries first, then stored back crosswise.
 ; NOTE(review): the W / | / R columns in the trailing comments appear to sketch
 ; the lifetimes of %0 and %1 (W = defined, | = live, R = read). In the code, %1 is
 ; read by the first store and %0 by the second, so confirm the column alignment.
 define linkonce_odr void @void temp_swap<int>(int&, int&)(i32* %a, i32* %b) #2 {
 entry:                             ; %0
  %0 = load i32, i32* %a, align 4  ; W  %1
  %1 = load i32, i32* %b, align 4  ; |  W
  store i32 %1, i32* %a, align 4   ; R  |
  store i32 %0, i32* %b, align 4   ;    R
  ret void
 }

 define linkonce_odr void @void xor_swap<int>(int&, int&)(i32* %a, i32* %b) #2 {
 define i32 @_Z4testv() #0 {
 entry:
  %call = call i32 @_Z3getv()        ; a = get();
  %call1 = call i32 @_Z3getv()       ; b = get();
                                     ; temp_swap(a, b)는 사라짐
  %xor.i = xor i32 %call1, %call     ; xor_swap(a, b)가 xor_swap(b, a)로 바뀜
  %xor1.i = xor i32 %call, %xor.i
  %xor2.i = xor i32 %xor.i, %xor1.i
  %call2 = call i32 @_Z7processii(i32 %xor2.i, i32 %xor1.i)
  ret i32 %call2
 ; Step-by-step folding of %xor1 down to %b using xor identities.
 %a = call i32 @get()
 %b = call i32 @get()
 %xor = xor i32 %b, %a

 %xor1 = xor i32 %a, %xor
 ; => %xor1 = %a ^ %xor
 ; => %xor1 = %a ^ (%b ^ %a)       ; a ^ (b ^ a) = b ^ 0
 ; => %xor1 = %b ^ 0               ; b ^ 0 = b
 ; => %xor1 = %b                   ; every later use of %xor1 is then replaced with %b
 ; => %xor1 is deleted
 ; test(): final simplified form. Fetches two values with get(), passes them to
 ; process(), and returns process()'s result.
 define i32 @_Z4testv() #0 {
 entry:
  %a = call i32 @get()
  %b = call i32 @get()
  ; LLVM IR is in SSA form: each %name may be defined only once, so the call
  ; result gets its own name instead of redefining %b.
  %result = call i32 @process(i32 %a, i32 %b)
  ret i32 %result
 }
 /*
  This is a version (aka dlmalloc) of malloc/free/realloc written by
  Doug Lea and released to the public domain, as explained at
  http://creativecommons.org/publicdomain/zero/1.0/ Send questions,
  comments, complaints, performance data, etc to dl@cs.oswego.edu

 * Version 2.8.6 Wed Aug 29 06:57:58 2012  Doug Lea
   Note: There may be an updated version of this malloc obtainable at
           ftp://gee.cs.oswego.edu/pub/misc/malloc.c
         Check before installing!
 diff --git a/include/triton/Conversion/CMakeLists.txt b/include/triton/Conversion/CMakeLists.txt
 index ae31ac93..691104f3 100644
 --- a/include/triton/Conversion/CMakeLists.txt
 +++ b/include/triton/Conversion/CMakeLists.txt
 @@ -1,4 +1,4 @@
 -add_subdirectory(TritonCPUToLLVM)
 +# add_subdirectory(TritonCPUToLLVM)
 add_subdirectory(TritonGPUToLLVM)
 -add_subdirectory(TritonToTritonCPU)
 +# add_subdirectory(TritonToTritonCPU)
 from typing import Optional, Union

 import os
 import numpy as np
 import torch
 import triton
 import triton.language as tl
 import math
 from triton.runtime.jit import TensorWrapper, reinterpret
 from numpy.random import RandomState
 ; ModuleID = 'LLVMDialectModule'
 source_filename = "LLVMDialectModule"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

 ; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
 define void @matmul_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture readonly %1, ptr addrspace(1) nocapture writeonly %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16) local_unnamed_addr #0 !dbg !3 {
  %18 = add i32 %3, 15, !dbg !6
  %19 = sdiv i32 %18, 16, !dbg !10
  %20 = add i32 %4, 15, !dbg !11
	$ ./opt -O3 -debug-pass=Structure -o swap.opt.bc swap.bc
	Pass Arguments: -tti -tbaa -scoped-noalias -assumption-cache-tracker -targetlibinfo -verify -simplifycfg -domtree -sroa -early-cse -basicaa -aa -memdep -memoryssa -gvn-hoist -lower-expect
	Target Transform Information
	Type-Based Alias Analysis
	Scoped NoAlias Alias Analysis
	Assumption Cache Tracker
	Target Library Information
	FunctionPass Manager
	Module Verifier
	Simplify the CFG
	define i32 @_Z4testv() #0 {
	entry:
	%a = alloca i32, align 4
	%b = alloca i32, align 4
	%call = call i32 @_Z3getv() ; int a = get();
	store i32 %call, i32* %a, align 4
	%call1 = call i32 @_Z3getv() ; int b = get();
	store i32 %call1, i32* %b, align 4
	%0 = load i32, i32* %a, align 4 ; temp_swap(a, b);
	%1 = load i32, i32* %b, align 4
	; temp_swap<int>(int&, int&): exchanges the two i32 values behind %a and %b.
	; Both values are loaded into SSA temporaries first, then stored back crosswise.
	; NOTE(review): the W / | / R columns in the trailing comments appear to sketch
	; the lifetimes of %0 and %1 (W = defined, | = live, R = read). In the code, %1 is
	; read by the first store and %0 by the second, so confirm the column alignment.
	define linkonce_odr void @void temp_swap<int>(int&, int&)(i32* %a, i32* %b) #2 {
	entry: ; %0
	%0 = load i32, i32* %a, align 4 ; W  %1
	%1 = load i32, i32* %b, align 4 ; |  W
	store i32 %1, i32* %a, align 4 ; R  |
	store i32 %0, i32* %b, align 4 ;    R
	ret void
	}

	define linkonce_odr void @void xor_swap<int>(int&, int&)(i32* %a, i32* %b) #2 {
	define i32 @_Z4testv() #0 {
	entry:
	%call = call i32 @_Z3getv() ; a = get();
	%call1 = call i32 @_Z3getv() ; b = get();
	; temp_swap(a, b)는 사라짐
	%xor.i = xor i32 %call1, %call ; xor_swap(a, b)가 xor_swap(b, a)로 바뀜
	%xor1.i = xor i32 %call, %xor.i
	%xor2.i = xor i32 %xor.i, %xor1.i
	%call2 = call i32 @_Z7processii(i32 %xor2.i, i32 %xor1.i)
	ret i32 %call2
	; Step-by-step folding of %xor1 down to %b using xor identities.
	%a = call i32 @get()
	%b = call i32 @get()
	%xor = xor i32 %b, %a

	%xor1 = xor i32 %a, %xor
	; => %xor1 = %a ^ %xor
	; => %xor1 = %a ^ (%b ^ %a) ; a ^ (b ^ a) = b ^ 0
	; => %xor1 = %b ^ 0 ; b ^ 0 = b
	; => %xor1 = %b ; every later use of %xor1 is then replaced with %b
	; => %xor1 is deleted
	/*
	This is a version (aka dlmalloc) of malloc/free/realloc written by
	Doug Lea and released to the public domain, as explained at
	http://creativecommons.org/publicdomain/zero/1.0/ Send questions,
	comments, complaints, performance data, etc to dl@cs.oswego.edu

	* Version 2.8.6 Wed Aug 29 06:57:58 2012 Doug Lea
	Note: There may be an updated version of this malloc obtainable at
	ftp://gee.cs.oswego.edu/pub/misc/malloc.c
	Check before installing!
	diff --git a/include/triton/Conversion/CMakeLists.txt b/include/triton/Conversion/CMakeLists.txt
	index ae31ac93..691104f3 100644
	--- a/include/triton/Conversion/CMakeLists.txt
	+++ b/include/triton/Conversion/CMakeLists.txt
	@@ -1,4 +1,4 @@
	-add_subdirectory(TritonCPUToLLVM)
	+# add_subdirectory(TritonCPUToLLVM)
	add_subdirectory(TritonGPUToLLVM)
	-add_subdirectory(TritonToTritonCPU)
	+# add_subdirectory(TritonToTritonCPU)
	from typing import Optional, Union

	import os
	import numpy as np
	import torch
	import triton
	import triton.language as tl
	import math
	from triton.runtime.jit import TensorWrapper, reinterpret
	from numpy.random import RandomState
	; ModuleID = 'LLVMDialectModule'
	source_filename = "LLVMDialectModule"
	target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"

	; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
	define void @matmul_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture readonly %1, ptr addrspace(1) nocapture writeonly %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16) local_unnamed_addr #0 !dbg !3 {
	%18 = add i32 %3, 15, !dbg !6
	%19 = sdiv i32 %18, 16, !dbg !10
	%20 = add i32 %4, 15, !dbg !11