Abhiroop · June 27, 2018 23:19 · cartazio · Jun 27, 2018
diff --git a/sinking assignment b/sinking assignment
 for a simple function like this:

 ```
 main :: IO ()
 main
  = case unpackFloatX4# (packFloatX4# (# 9.2#, 8.15#, 7.0#, 6.4# #)) of
      (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
 ```

 Some background:

 packFloat   corresponds to MO_VF_Insert  constructor in Cmm
 unpackFloat corresponds to MO_VF_Extract constructor in Cmm
 -----------------------------------------------------------------------------------------------

 The following is the Cmm generated (lots of details elided) without -O2:


 c1K6: // global
           _c1K9::Fx4V128 = %MO_VF_Insert_4_W32(9.2 :: W32,
                                                0 :: W32);   // CmmAssign
           _c1K9::Fx4V128 = %MO_VF_Insert_4_W32(8.15 :: W32,
                                                80 :: W32);   // CmmAssign
           _c1K9::Fx4V128 = %MO_VF_Insert_4_W32(7.0 :: W32,
                                                160 :: W32);   // CmmAssign
           _c1K9::Fx4V128 = %MO_VF_Insert_4_W32(6.4 :: W32,
                                                240 :: W32);   // CmmAssign
           _c1K8::Fx4V128 = _c1K9::Fx4V128;   // CmmAssign
           _s1JK::Fx4V128 = _c1K8::Fx4V128;   // CmmAssign
           _s1JM::F32 = %MO_VF_Extract_4_W32(_s1JK::Fx4V128,
                                             0 :: W32);   // CmmAssign
           _s1JN::F32 = %MO_VF_Extract_4_W32(_s1JK::Fx4V128,
                                             1 :: W32);   // CmmAssign
           _s1JO::F32 = %MO_VF_Extract_4_W32(_s1JK::Fx4V128,
                                             2 :: W32);   // CmmAssign
           _s1JP::F32 = %MO_VF_Extract_4_W32(_s1JK::Fx4V128,
                                             3 :: W32);   // CmmAssign


 ----------------------------------------------------------------------------------

 and with -O2:


 c3s6: // global
           I64[Hp - 48] = sat_s3rU_info;   // CmmStore
           _s3rG::Fx4V128 = %MO_VF_Insert_4_W32(6.4 :: W32,
                                                240 :: W32);   // CmmAssign
           F32[Hp - 32] = %MO_VF_Extract_4_W32(_s3rG::Fx4V128,
                                               0 :: W32);   // CmmStore
           F32[Hp - 28] = %MO_VF_Extract_4_W32(_s3rG::Fx4V128,
                                               1 :: W32);   // CmmStore
           F32[Hp - 24] = %MO_VF_Extract_4_W32(_s3rG::Fx4V128,
                                               2 :: W32);   // CmmStore
           F32[Hp - 20] = %MO_VF_Extract_4_W32(_s3rG::Fx4V128,
                                               3 :: W32);   // CmmStore

 --------------------------------------------------------------------------------------

 Lots of details have been elided, but the major cause of this is the "sinking assignment" Cmm pass, which sees the
 following code in the first case:

           _c1K9::Fx4V128 = %MO_VF_Insert_4_W32(9.2 :: W32,
                                                0 :: W32);   // CmmAssign
           _c1K9::Fx4V128 = %MO_VF_Insert_4_W32(8.15 :: W32,
                                                80 :: W32);   // CmmAssign
           _c1K9::Fx4V128 = %MO_VF_Insert_4_W32(7.0 :: W32,
                                                160 :: W32);   // CmmAssign
           _c1K9::Fx4V128 = %MO_VF_Insert_4_W32(6.4 :: W32,
                                                240 :: W32);   // CmmAssign
                                                
  and assumes that the same variable _c1K9 is simply being reassigned four times, so it quietly removes the first three
  assignments and reduces the Cmm to
  
  _c1K9::Fx4V128 = %MO_VF_Insert_4_W32(6.4 :: W32,
                                                240 :: W32);   // CmmAssign
                                                
                                                
 keeping just the last assignment. The problem here is that in each of the assignments I am actually using an offset value
 to pack the Xmm register, not just reassigning it. Because the register being packed does not appear as an operand of
 MO_VF_Insert, the earlier assignments look dead to the pass. So the solution is perhaps to teach the "sinking assignment"
 pass to ignore vector variables. I am currently investigating that.
	for a simple function like this:

	```
	main :: IO ()
	main
	= case unpackFloatX4# (packFloatX4# (# 9.2#, 8.15#, 7.0#, 6.4# #)) of
	(# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
	```

	Some background:

	packFloat corresponds to MO_VF_Insert constructor in Cmm
	unpackFloat corresponds to MO_VF_Extract constructor in Cmm
	-----------------------------------------------------------------------------------------------

	The following is the Cmm generated (lots of details elided) without -O2:


	c1K6: // global
	_c1K9::Fx4V128 = %MO_VF_Insert_4_W32(9.2 :: W32,
	0 :: W32); // CmmAssign
	_c1K9::Fx4V128 = %MO_VF_Insert_4_W32(8.15 :: W32,
	80 :: W32); // CmmAssign
	_c1K9::Fx4V128 = %MO_VF_Insert_4_W32(7.0 :: W32,
	160 :: W32); // CmmAssign
	_c1K9::Fx4V128 = %MO_VF_Insert_4_W32(6.4 :: W32,
	240 :: W32); // CmmAssign
	_c1K8::Fx4V128 = _c1K9::Fx4V128; // CmmAssign
	_s1JK::Fx4V128 = _c1K8::Fx4V128; // CmmAssign
	_s1JM::F32 = %MO_VF_Extract_4_W32(_s1JK::Fx4V128,
	0 :: W32); // CmmAssign
	_s1JN::F32 = %MO_VF_Extract_4_W32(_s1JK::Fx4V128,
	1 :: W32); // CmmAssign
	_s1JO::F32 = %MO_VF_Extract_4_W32(_s1JK::Fx4V128,
	2 :: W32); // CmmAssign
	_s1JP::F32 = %MO_VF_Extract_4_W32(_s1JK::Fx4V128,
	3 :: W32); // CmmAssign


	----------------------------------------------------------------------------------

	and with -O2:


	c3s6: // global
	I64[Hp - 48] = sat_s3rU_info; // CmmStore
	_s3rG::Fx4V128 = %MO_VF_Insert_4_W32(6.4 :: W32,
	240 :: W32); // CmmAssign
	F32[Hp - 32] = %MO_VF_Extract_4_W32(_s3rG::Fx4V128,
	0 :: W32); // CmmStore
	F32[Hp - 28] = %MO_VF_Extract_4_W32(_s3rG::Fx4V128,
	1 :: W32); // CmmStore
	F32[Hp - 24] = %MO_VF_Extract_4_W32(_s3rG::Fx4V128,
	2 :: W32); // CmmStore
	F32[Hp - 20] = %MO_VF_Extract_4_W32(_s3rG::Fx4V128,
	3 :: W32); // CmmStore

	--------------------------------------------------------------------------------------

	Lots of details have been elided, but the major cause of this is the "sinking assignment" Cmm pass, which sees the
	following code in the first case:

	_c1K9::Fx4V128 = %MO_VF_Insert_4_W32(9.2 :: W32,
	0 :: W32); // CmmAssign
	_c1K9::Fx4V128 = %MO_VF_Insert_4_W32(8.15 :: W32,
	80 :: W32); // CmmAssign
	_c1K9::Fx4V128 = %MO_VF_Insert_4_W32(7.0 :: W32,
	160 :: W32); // CmmAssign
	_c1K9::Fx4V128 = %MO_VF_Insert_4_W32(6.4 :: W32,
	240 :: W32); // CmmAssign

	and assumes that the same variable _c1K9 is simply being reassigned four times, so it quietly removes the first three
	assignments and reduces the Cmm to

	_c1K9::Fx4V128 = %MO_VF_Insert_4_W32(6.4 :: W32,
	240 :: W32); // CmmAssign


	keeping just the last assignment. The problem here is that in each of the assignments I am actually using an offset value
	to pack the Xmm register, not just reassigning it. Because the register being packed does not appear as an operand of
	MO_VF_Insert, the earlier assignments look dead to the pass. So the solution is perhaps to teach the "sinking assignment"
	pass to ignore vector variables. I am currently investigating that.