-
-
Save navyxliu/ee4465e2146ef99c5ae1fa1ba6b70e25 to your computer and use it in GitHub Desktop.
// -Xcomp -Xms16M -Xmx16M -XX:+AlwaysPreTouch -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:-UseOnStackReplacement -XX:CompileOnly='Example2.foo' -XX:CompileCommand=dontinline,Example2.blackhole
/**
 * Minimal partial-escape-analysis (PEA) test case: the object allocated in
 * {@link #foo(boolean)} is stored into a field on only one branch.
 */
class Example2 {
    /** Field the freshly allocated object is stored into when {@code cond} is true. */
    private Object _cache;

    /**
     * Allocates a new object, calls {@link #blackhole()}, and stores the object
     * into {@code _cache} only when {@code cond} is true.
     *
     * @param cond whether the new object is also stored into {@code _cache}
     * @return the newly allocated object
     */
    public Object foo(boolean cond) {
        Object x = new Object();
        blackhole();
        if (cond) {
            _cache = x;
        }
        return x;
    }

    /** Intentionally empty method called between the allocation and the branch in {@code foo}. */
    public static void blackhole() {}

    /**
     * Calls {@code foo} in an endless loop, passing {@code true} whenever the low
     * four bits of the iteration counter are zero (every 16th call), and prints
     * the iteration count from a {@code finally} block.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Example2 kase = new Example2();
        // Epsilon Test:
        // By setting the maximal heap and use EpsilonGC, let's see how long and how many iterations the program can sustain.
        // if PEA manages to reduce allocation rate, we expect the program to stay longer.
        // Roman commented it with a reasonable doubt: "or your code slow down the program..."
        // That's why I suggest to observe iterations. It turns out not trivial because inner OOME will implode hotspot. We don't have a chance to execute the final statement...
        long iterations = 0;
        try {
            // The loop has no exit condition; it only ends when an exception or error propagates.
            while (true) {
                kase.foo(0 == (iterations & 0xf));
                iterations++;
            }
        } finally {
            System.err.println("Epsilon Test: " + iterations);
        }
    }
}
We can see that C2 EA/SR gets rid of the original object (node 27 Allocate), as expected.
======== Connection graph for Example2::foo
invocation #0: 2 iterations and 0.000019 sec to build connection graph with 164 nodes and worklist size 19
JavaObject(3) NoEscape(NoEscape) [ [ 39 44 ]] 27 Allocate === 5 6 7 8 1 (25 23 24 1 1 10 11 1 ) [[ 28 29 30 37 38 39 ]] rawptr:NotNull ( int:>=0, java/lang/Object:NotNull *, bool, top, bool ) Example2::foo @ bci:0 (line 5) !jvms: Example2::foo @ bci:0 (line 5)
LocalVar(9) [ 27P [ 44 ]] 39 Proj === 27 [[ 40 44 ]] #5 !jvms: Example2::foo @ bci:0 (line 5)
LocalVar(12) [ 39 27P [ ]] 44 CheckCastPP === 41 39 [[ 106 129 76 ]] #java/lang/Object:NotNull:exact * Oop:java/lang/Object:NotNull:exact * !jvms: Example2::foo @ bci:0 (line 5)
Scalar 44 CheckCastPP === 41 39 [[ 106 129 76 ]] #java/lang/Object:NotNull:exact *,iid=27 Oop:java/lang/Object:NotNull:exact *,iid=27 !jvms: Example2::foo @ bci:0 (line 5)
++++ Eliminated: 27 Allocate
Here is the generated code. C2 with PEA generates 2 instances, one in B3 and one in B6: one is for IfTrue and the other is for IfFalse.
The original allocation has been scalar replaced, as we expected. It's worth noting that the callsite of Example2::blackhole
generates debuginfo in the L[2] entry for the original object. If deoptimization does take place, the runtime will rematerialize the original object using that debuginfo.
043 call,static Example2::blackhole
# Example2::foo @ bci:8 (line 7) L[0]=rsp + #0 L[1]=RBP L[2]=#ScObj0
============================= C2-compiled nmethod ==============================
#r018 rsi:rsi : parm 0: Example2:NotNull *
#r016 rdx : parm 1: int
# -- Old rsp -- Framesize: 48 --
#r591 rsp+44: in_preserve
#r590 rsp+40: return address
#r589 rsp+36: in_preserve
#r588 rsp+32: saved fp register
#r587 rsp+28: pad2, stack alignment
#r586 rsp+24: pad2, stack alignment
#r585 rsp+20: Fixed slot 1
#r584 rsp+16: Fixed slot 0
#r595 rsp+12: spill
#r594 rsp+ 8: spill
#r593 rsp+ 4: spill
#r592 rsp+ 0: spill
#
----------------------- MetaData before Compile_id = 27 ------------------------
{method}
- this oop: 0x00007f7a55700438
- method holder: 'Example2'
- constants: 0x00007f7a55700048 constant pool [65]/operands[5] {0x00007f7a55700048} for 'Example2' cache=0x00007f7a557006b8
- access: 0x81000001 public
- name: 'foo'
- signature: '(Z)Ljava/lang/Object;'
- max stack: 3
- max locals: 3
- size of params: 2
- method size: 14
- highest level: 3
- vtable index: 5
- i2i entry: 0x00007f7a60435ec0
- adapters: AHE@0x00007f7a680556d0: 0xba i2c: 0x00007f7a60545be0 c2i: 0x00007f7a60545c91 c2iUV: 0x00007f7a60545c68 c2iNCI: 0x00007f7a60545ccb
- compiled entry 0x00007f7a59000200
- code size: 22
- code start: 0x00007f7a55700418
- code end (excl): 0x00007f7a5570042e
- method data: 0x00007f7a557008c0
- checked ex length: 0
- linenumber start: 0x00007f7a5570042e
- localvar length: 0
- compiled code: nmethod 1743 26 3 Example2::foo (22 bytes)
------------------------ OptoAssembly for Compile_id = 27 -----------------------
#
# java/lang/Object * ( Example2:NotNull *, int )
#
000 N108: # out( B1 ) <- BLOCK HEAD IS JUNK Freq: 1
000 movl rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()] # compressed klass
decode_klass_not_null rscratch1, rscratch1
cmpq rax, rscratch1 # Inline cache check
jne SharedRuntime::_ic_miss_stub
nop # nops to align entry point
nop # 12 bytes pad for loops and calls
020 B1: # out( B9 B2 ) <- BLOCK HEAD IS JUNK Freq: 1
020 # stack bang (136 bytes)
pushq rbp # Save rbp
subq rsp, #32 # Create frame
03a movl RBP, RDX # spill
03c movq [rsp + #0], RSI # spill
nop # 3 bytes pad for loops and calls
043 call,static Example2::blackhole
# Example2::foo @ bci:8 (line 7) L[0]=rsp + #0 L[1]=RBP L[2]=#ScObj0
# ScObj0 java/lang/Object={ }
# OopMap {[0]=Oop off=72/0x48}
048 B2: # out( B6 B3 ) <- in( B1 ) Freq: 0.99998
# Block is sole successor of call
048 testl RBP, RBP
04a je,s B6 P=0.100000 C=-1.000000
04c B3: # out( B10 B4 ) <- in( B2 ) Freq: 0.899982
04c movq RSI, precise java/lang/Object: 0x00007f7a2c007d80:Constant:exact * # ptr
nop # 1 bytes pad for loops and calls
057 call,static wrapper for: _new_instance_Java
# Example2::foo @ bci:17 (line 10) L[0]=_ L[1]=_ L[2]=#ScObj0 STK[0]=rsp + #0
# ScObj0 java/lang/Object={ }
# OopMap {[0]=Oop off=92/0x5c}
05c B4: # out( B5 ) <- in( B3 ) Freq: 0.899964
# Block is sole successor of call
05c
05c # checkcastPP of RAX
05c encode_heap_oop_not_null R11,RAX
09d movq R10, [rsp + #0] # spill
0a1 movl [R10 + #12 (8-bit)], R11 # compressed ptr ! Field: Example2._cache
0a5 B5: # out( N108 ) <- in( B4 B7 ) Freq: 0.99996
0a5 addq rsp, 32 # Destroy frame
popq rbp
cmpq rsp, poll_offset[r15_thread]
ja #safepoint_stub # Safepoint: poll for GC
0b7 ret
0b8 B6: # out( B8 B7 ) <- in( B2 ) Freq: 0.099998
0b8 movq RSI, precise java/lang/Object: 0x00007f7a2c007d80:Constant:exact * # ptr
nop # 1 bytes pad for loops and calls
0c3 call,static wrapper for: _new_instance_Java
# Example2::foo @ bci:4 (line 5) L[0]=rsp + #0 L[1]=RBP L[2]=_ STK[0]=#ScObj0
# ScObj0 java/lang/Object={ }
# OopMap {[0]=Oop off=200/0xc8}
0c8 B7: # out( B5 ) <- in( B6 ) Freq: 0.099996
# Block is sole successor of call
0c8
0c8 # checkcastPP of RAX
0c8 jmp,s B5
0ca B8: # out( B11 ) <- in( B6 ) Freq: 9.9998e-07
0ca # exception oop is in rax; no code emitted
0ca movq RSI, RAX # spill
0cd jmp,s B11
0cf B9: # out( B11 ) <- in( B1 ) Freq: 1e-05
0cf # exception oop is in rax; no code emitted
0cf movq RSI, RAX # spill
0d2 jmp,s B11
0d4 B10: # out( B11 ) <- in( B3 ) Freq: 8.99982e-06
0d4 # exception oop is in rax; no code emitted
0d4 movq RSI, RAX # spill
0d7 B11: # out( N108 ) <- in( B8 B9 B10 ) Freq: 1.99998e-05
0d7 addq rsp, 32 # Destroy frame
popq rbp
0dc jmp rethrow_stub
We expect the code to be transformed into this after PEA. The PHI node merges the 2 predecessor basic blocks.
x2 is created because x1 has been materialized on the other path. x0 becomes an obsolete object after that; C2 EA/SR will take care of it.
// Pseudo-code sketch of foo() after PEA; phi(...) is compiler notation, not valid Java.
public Object foo(boolean cond) {
// x0: the original allocation; it becomes obsolete once x1/x2 take its place.
Object x0 = new Object();
blackhole();
if (cond) {
// x1: the object materialized for the store into _cache.
x1 = new Object();
_cache = x1;
}
// x2 is allocated for the path on which x1 was not materialized; the phi merges both predecessors.
x3 = phi(x2 = new Object(), x1);
return x3;
}
I wonder how you can deal with the identity issue mentioned by Vladimir, i.e. this code:
// Counter-example: the same object x can be stored into two different fields on two separate branches.
public void foo(boolean cond1, boolean cond2) {
Object x = new Object();
blackhole();
if (cond1) {
_cache1 = x;
}
blackhole();
if (cond2) {
// When cond1 was also true, this must store the very same object as _cache1.
_cache2 = x;
}
}
Here, if cond1 && cond2, we must have _cache1 == _cache2, so you can't perform the transformation as you do above. In other words, the transformation can only be done if the object is sure not to have escaped at that point.
C2 generates code like this after the parser with -XX:+DoPartialEscapeAnalysis:
// Pseudo-code sketch of the two-condition foo() after parsing with PEA; phi(...) is compiler notation, not valid Java.
public void foo(boolean cond1, boolean cond2) {
Object x0 = new Object();
blackhole();
if (cond1) {
// x1: the object materialized for the store into _cache1.
x1 = new Object();
_cache1 = x1;
}
// Merge point: a fresh object is created only for the path that skipped the cond1 branch.
x2 = phi(x2=new Object(), x1);
blackhole();
if (cond2) {
// The object is already materialized here, so nothing new is allocated; if cond1 was true, x2 is x1.
_cache2 = x2;
}
}
If cond1 == true && cond2 == true, then we have _cache1 == _cache2 == x1.
Thank you for vetting this. Here is a modified version of the program, with assertions enabled.
We can verify that cache1 == cache2.
The reason is explained above. One thing is worth noting: when we parse "if(cond2) ...", object x has already been materialized, so PEA won't materialize it again at "_cache2 = x".
// -ea -Xcomp -Xms16M -Xmx16M -XX:+AlwaysPreTouch -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:-UseOnStackReplacement -XX:CompileOnly='Example2_merykitty.foo' -XX:CompileCommand=dontinline,Example2_merykitty.blackhole -XX:+DoPartialEscapeAnalysis
/**
 * Test case for the object-identity question: one object may be stored into
 * two static fields on two separate branches, and main() asserts that both
 * fields refer to the same object.
 */
class Example2_merykitty {
// Static fields the allocated object is stored into.
private static Object _cache1;
private static Object _cache2;
/**
 * Allocates one object and conditionally stores it into _cache1 and then
 * _cache2, calling blackhole() before each condition is tested.
 *
 * @param cond1 whether the object is stored into _cache1
 * @param cond2 whether the object is stored into _cache2
 */
public void foo(boolean cond1, boolean cond2) {
Object x = new Object();
blackhole();
if (cond1) {
_cache1 = x;
}
blackhole();
if (cond2) {
_cache2 = x;
}
}
/** Intentionally empty method called before each branch in foo(). */
public static void blackhole() {}
/**
 * Calls foo() in an endless loop with both conditions equal, checks after each
 * call that the two static fields hold the same reference, and prints the
 * iteration count from a finally block.
 *
 * @param args unused
 */
public static void main(String[] args) {
Example2_merykitty kase = new Example2_merykitty();
long iterations = 0;
try {
// The loop has no exit condition; it only ends when an exception or error propagates.
while (true) {
// True whenever the low four bits of the counter are zero, i.e. every 16th iteration.
boolean cond = 0 == (iterations & 0xf);
kase.foo(cond, cond);
// Reference comparison; only evaluated when assertions are enabled (-ea).
assert Example2_merykitty._cache1 == Example2_merykitty._cache2 :"check";
iterations++;
}
} finally {
System.err.println("Epsilon Test: " + iterations);
}
}
}
Link to Example3_1. It features non-trivial object with stateful fields and inlined methods.
https://gist.github.com/navyxliu/74d0546004a773cb5219754f6ed63d43
This is very similar to Example-1. There are 2 differences:
Here is the ideal graph after parse by default.

Here is the ideal graph after parse with PEA.
Node 106 is the materialized object created because of the assignment '_cache = x'. This is very similar to Example1.
Node 129 is yet another materialized object. It's also a clone of the original object. We generate it because PEA attempts to merge predecessors: one predecessor has already materialized the object, so the PEA merge processor has to materialize it in the others.