-
-
Save synopse/726a4706d95905101b4de87c4719329f to your computer and use it in GitHub Desktop.
| _ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy PROC | |
| ; WEAK _ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy | |
| push rbp ; 0000 _ 55 | |
| mov rbp, rsp ; 0001 _ 48: 89. E5 | |
| mov qword ptr [rbp-20H], rcx ; 0004 _ 48: 89. 4D, E0 | |
| mov qword ptr [rbp-18H], rdx ; 0008 _ 48: 89. 55, E8 | |
| mov dword ptr [rbp-0CH], esi ; 000C _ 89. 75, F4 | |
| mov qword ptr [rbp-8H], rdi ; 000F _ 48: 89. 7D, F8 | |
| ; Note: Immediate operand could be made smaller by sign extension | |
| cmp dword ptr [rbp-0CH], 0 ; 0013 _ 81. 7D, F4, 00000000 | |
| jle ?_1806 ; 001A _ 7E, 5F | |
| jmp ?_1804 ; 001C _ EB, 00 | |
| ?_1804: mov rax, qword ptr [rbp-8H] ; 001E _ 48: 8B. 45, F8 | |
| mov rax, qword ptr [rax] ; 0022 _ 48: 8B. 00 | |
| mov qword ptr [rbp-28H], rax ; 0025 _ 48: 89. 45, D8 | |
| mov rax, qword ptr [rbp-28H] ; 0029 _ 48: 8B. 45, D8 | |
| mov rax, qword ptr [rax+8H] ; 002D _ 48: 8B. 40, 08 | |
| mov rax, qword ptr [rax+28H] ; 0031 _ 48: 8B. 40, 28 | |
| mov qword ptr [rbp-30H], rax ; 0035 _ 48: 89. 45, D0 | |
| mov rax, qword ptr [rbp-30H] ; 0039 _ 48: 8B. 45, D0 | |
| mov rax, qword ptr [rax] ; 003D _ 48: 8B. 00 | |
| cmp rax, qword ptr [rbp-18H] ; 0040 _ 48: 3B. 45, E8 | |
| jnz ?_1805 ; 0044 _ 75, 10 | |
| mov rax, qword ptr [rbp-30H] ; 0046 _ 48: 8B. 45, D0 | |
| mov rax, qword ptr [rax+8H] ; 004A _ 48: 8B. 40, 08 | |
| cmp rax, qword ptr [rbp-20H] ; 004E _ 48: 3B. 45, E0 | |
| jnz ?_1805 ; 0052 _ 75, 02 | |
| jmp ?_1807 ; 0054 _ EB, 2D | |
| ?_1805: mov rax, qword ptr [rbp-8H] ; 0056 _ 48: 8B. 45, F8 | |
| ; Note: Immediate operand could be made smaller by sign extension | |
| add rax, 8 ; 005A _ 48: 05, 00000008 | |
| mov qword ptr [rbp-8H], rax ; 0060 _ 48: 89. 45, F8 | |
| mov ecx, dword ptr [rbp-0CH] ; 0064 _ 8B. 4D, F4 | |
| ; Note: Immediate operand could be made smaller by sign extension | |
| sub ecx, 1 ; 0067 _ 81. E9, 00000001 | |
| mov dword ptr [rbp-0CH], ecx ; 006D _ 89. 4D, F4 | |
| ; Note: Immediate operand could be made smaller by sign extension | |
| cmp dword ptr [rbp-0CH], 0 ; 0070 _ 81. 7D, F4, 00000000 | |
| jnz ?_1804 ; 0077 _ 75, A5 | |
| jmp ?_1806 ; 0079 _ EB, 00 | |
| ?_1806: mov qword ptr [rbp-28H], 0 ; 007B _ 48: C7. 45, D8, 00000000 | |
| ?_1807: mov rax, qword ptr [rbp-28H] ; 0083 _ 48: 8B. 45, D8 | |
| pop rbp ; 0087 _ 5D | |
| ret ; 0088 _ C3 |
| .text.n_mormot.core.interfaces_$$_findguid$pinterfacefactory$longint$qword$qword$$tinterfacefactory SEGMENT PARA 'CODE' ; section number 49 | |
| MORMOT.CORE.INTERFACES_$$_FINDGUID$PINTERFACEFACTORY$LONGINT$QWORD$QWORD$$TINTERFACEFACTORY LABEL NEAR | |
| test esi, esi ; 0000 _ 85. F6 | |
| jle ?_0283 ; 0002 _ 7E, 23 | |
| ; Filling space: 4H | |
| ; Filler type: Multi-byte NOP | |
| ; db 0FH, 1FH, 40H, 00H | |
| ALIGN 8 | |
| ?_0281: mov rax, qword ptr [rdi] ; 0008 _ 48: 8B. 07 | |
| mov r8, qword ptr [rax+8H] ; 000B _ 4C: 8B. 40, 08 | |
| mov r8, qword ptr [r8+28H] ; 000F _ 4D: 8B. 40, 28 | |
| cmp rdx, qword ptr [r8] ; 0013 _ 49: 3B. 10 | |
| jnz ?_0282 ; 0016 _ 75, 06 | |
| cmp rcx, qword ptr [r8+8H] ; 0018 _ 49: 3B. 48, 08 | |
| jz ?_0284 ; 001C _ 74, 0B | |
| ?_0282: add rdi, 8 ; 001E _ 48: 83. C7, 08 | |
| sub esi, 1 ; 0022 _ 83. EE, 01 | |
| jnz ?_0281 ; 0025 _ 75, E1 | |
| ?_0283: xor eax, eax ; 0027 _ 31. C0 | |
| ?_0284: ret ; 0029 _ C3 | |
On Godbolt with GCC 14.2 / Clang 19 (current as of 2026), the loop is at the same level as FPC:
FindGuid:
test esi, esi
jle .no
mov r10, rdi ; f in register, updated in place
.loop:
mov rax, [r10] ; result = *f
mov rdx, [rax+8] ; rtti
mov rdx, [rdx+0x28] ; guid (hoisted offset)
mov rax, [rdx] ; L
cmp rax, r8 ; gL (in reg)
jne .next
mov rax, [rdx+8] ; H
cmp rax, r9 ; gH (in reg)
je .found
.next:
add r10, 8 ; ++f
dec esi ; --n
jnz .loop
.no:
xor eax, eax
ret
.found:
mov rax, [r10] ; return the pointer
ret
It is even worse because of the .found branch, which is clearly not needed. 👍
@synopse I assume the ASM above was obtained from compiling the following code: https://github.com/synopse/mORMot2/blob/365c45c11b29ce7eefab2611c1db54e0faa25113/src/core/mormot.core.interfaces.pas#L3881-L3902
Yes, from this function.
This is an optimized brute-force O(n) search function of a TGuid field within an array of class instances.
On Delphi Win32/Win64 the generated asm is very efficient too.
@synopse I reduced the original code to the following 2 units (to facilitate some tests):
unit mormot.core.interfaces;
interface
uses
mormot.core.utils;
type
/// reduced class holding only the field that FindGuid dereferences
// - fInterfaceRtti is the single declared field of this class
TInterfaceFactory = class
protected
// RTTI wrapper whose Cache.InterfaceGuid is compared by FindGuid
fInterfaceRtti: TRttiJson;
end;
PInterfaceFactory = ^TInterfaceFactory;
function FindGuid(f: PInterfaceFactory; n: integer;
{$ifdef CPU64BITS} gL, gH : QWord {$else} g: PHash128Rec {$endif}): TInterfaceFactory;
implementation
/// linear search of a 128-bit GUID within an array of TInterfaceFactory
// - f points to the first of n consecutive TInterfaceFactory references
// - with CPU64BITS defined, the GUID is supplied as two QWord halves (gL, gH);
// otherwise it is supplied as a PHash128Rec and compared as four cardinals
// - returns the first item whose fInterfaceRtti.Cache.InterfaceGuid matches,
// or nil when n <= 0 or when no item matches
function FindGuid(f: PInterfaceFactory; n: integer;
{$ifdef CPU64BITS} gL, gH : QWord {$else} g: PHash128Rec {$endif}): TInterfaceFactory;
begin
if n > 0 then
repeat
// result is assigned before the comparison, so exit returns the match directly
result := f^;
// reinterpret the PGuid as a THash128Rec to compare it as integer fields
with PHash128Rec(result.fInterfaceRtti.Cache.InterfaceGuid)^ do
{$ifdef CPU64BITS}
if (L = gL) and
(H = gH) then
{$else}
if (c0 = g^.c0) and
(c1 = g^.c1) and
(c2 = g^.c2) and
(c3 = g^.c3) then
{$endif CPU64BITS}
exit;
// no match: advance to the next array item and decrease the remaining count
inc(f);
dec(n);
until n = 0;
// reached when n <= 0 or when the loop ended without a match
result := nil;
end;
end.

unit mormot.core.utils;
interface
type
{$ifdef UNICODE}
QWord = UInt64;
{$else}
QWord = type Int64;
{$endif UNICODE}
/// points to an unsigned Int64
PQWord = ^QWord;
TBlock128 = array[0..3] of cardinal;
THash128 = array[0..15] of byte;
/// packed variant record exposing the same storage through several layouts
// - every variant overlaps the others, since there is no stored tag field
// - FindGuid reads the L/H pair on CPU64BITS, and c0..c3 otherwise
THash128Rec = packed record
case integer of
// two signed 64-bit halves
0: (Lo, Hi: Int64);
// two QWord halves
1: (L, H: QWord);
// four signed 32-bit values
2: (i0, i1, i2, i3: integer);
// four cardinal values
3: (c0, c1, c2 ,c3: cardinal);
// the same four cardinals as an indexable array
4: (c: TBlock128);
// sixteen raw bytes
5: (b: THash128);
// eight 16-bit words
6: (w: array[0..7] of word);
// the storage viewed as a TGuid
7: (guid: TGuid);
end;
PHash128Rec = ^THash128Rec;
/// enumeration of the RTTI kinds used as labels by the TRttiCache variants
// - the members from rkUString to rkMRecord are only declared when UNICODE
// is defined, so the number of items depends on the compiler
TRttiKind = (
rkUnknown,
rkInteger,
rkChar,
rkEnumeration,
rkFloat,
rkSString,
rkSet,
rkClass,
rkMethod,
rkWChar,
rkLString,
rkWString,
rkVariant,
rkArray,
rkRecord,
rkInterface,
rkInt64,
rkDynArray
{$ifdef UNICODE},
rkUString,
rkClassRef,
rkPointer,
rkProcedure,
rkMRecord
{$endif UNICODE});
/// record made of a fixed header followed by overlapping variant parts
// - the variant part has no stored tag field: "case TRttiKind of" only
// labels the alternatives, which all share the same storage
// - NOTE(review): presumably the Kind field tells which variant is valid -
// confirm against the original mORMot unit
TRttiCache = record
// fixed header, common to all variants
Info: Pointer;
Size: integer;
Kind: TRttiKind;
Flags: Integer;
RttiOrd: Integer;
BinarySize: byte;
VarDataVType: word;
RttiVarDataVType: word;
// variant part: the alternatives below overlap in memory
case TRttiKind of
rkFloat: (
RttiFloat: Pointer;
IsDateTime, IsPureDate: boolean);
rkLString,
rkEnumeration,
rkSet,
rkDynArray,
rkArray: (
ItemInfoManaged: Pointer;
ItemInfoRaw: Pointer;
ItemSize: integer;
ItemCount: integer;
ObjArrayClass: TClass;
);
rkClass: (
NewInstance: pointer;
ValueClass: TClass;
SerializableInterface: pointer;
);
rkInterface: (
NewInterface: pointer;
// pointer read by FindGuid and compared against the searched GUID
InterfaceGuid: PGuid;
InterfaceFactory: pointer;
SerializableClass: TClass;
SerializableInterfaceEntryOffset: integer;
);
end;
/// reduced class storing a TRttiCache record and exposing it read-only
TRttiCustom = class
protected
// backing storage of the Cache property
fCache: TRttiCache;
public
/// read-only access to the fCache field
property Cache: TRttiCache read fCache;
end;
/// descendant adding no member: it is the declared type of
// TInterfaceFactory.fInterfaceRtti
TRttiJson = class(TRttiCustom)
end;
implementation
end.

And what I got from dcc64 was:
mormot.core.interfaces.o: file format coff-x86-64
Disassembly of section .text:
0000000000000000 <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy>:
0: 55 push rbp
1: 48 8b ec mov rbp, rsp
4: 85 d2 test edx, edx
6: 7e 21 jle 0x29 <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy+0x29>
8: 48 8b 01 mov rax, qword ptr [rcx]
b: 4c 8b 50 08 mov r10, qword ptr [rax + 0x8]
f: 4d 8b 52 30 mov r10, qword ptr [r10 + 0x30]
13: 4d 39 02 cmp qword ptr [r10], r8
16: 75 06 jne 0x1e <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy+0x1e>
18: 4d 39 4a 08 cmp qword ptr [r10 + 0x8], r9
1c: 74 0d je 0x2b <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy+0x2b>
1e: 48 83 c1 08 add rcx, 0x8
22: 83 ea 01 sub edx, 0x1
25: 85 d2 test edx, edx
27: 75 df jne 0x8 <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy+0x8>
29: 33 c0 xor eax, eax
2b: 48 8b e5 mov rsp, rbp
2e: 5d pop rbp
2f: c3 ret

It seems to me a little better than Clang 19, but it's still worse than FPC.
To be fair, dcc64 is not really worse than FPC. It is almost the same. FPC just gets rid of an unneeded "test edx, edx" instruction, but in practice the CPU would just ignore this and optimize it away at the micro-op level.
Both Delphi dcc32 and dcc64 generate FPC-like code for this function.
The issue is the LLVM backend usage in Delphi.
FPC is much better for sure here.