Skip to content

Instantly share code, notes, and snippets.

@synopse
Created March 24, 2026 21:08
Show Gist options
  • Select an option

  • Save synopse/726a4706d95905101b4de87c4719329f to your computer and use it in GitHub Desktop.

Select an option

Save synopse/726a4706d95905101b4de87c4719329f to your computer and use it in GitHub Desktop.
FPC and Delphi for Linux asm Comparison
; FindGuid, unoptimized-looking stack-based code generation.
; NOTE(review): presumably the Delphi-for-Linux output named in the title - confirm.
; Incoming registers rdi, esi, rdx, rcx are all spilled to the stack on entry and
; every later use goes through a stack slot:
;   [rbp-8H]  = cursor into the array (advanced by 8 per iteration)
;   [rbp-0CH] = remaining count (32-bit)
;   [rbp-18H] = value compared against qword [guid]     (copied from rdx)
;   [rbp-20H] = value compared against qword [guid+8H]  (copied from rcx)
;   [rbp-28H] = return value (current entry, or 0 when nothing matched)
;   [rbp-30H] = pointer loaded from [[entry+8H]+28H], i.e. the compared 16 bytes
_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy PROC
; WEAK _ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy
push rbp ; 0000 _ 55
mov rbp, rsp ; 0001 _ 48: 89. E5
; spill the four arguments
mov qword ptr [rbp-20H], rcx ; 0004 _ 48: 89. 4D, E0
mov qword ptr [rbp-18H], rdx ; 0008 _ 48: 89. 55, E8
mov dword ptr [rbp-0CH], esi ; 000C _ 89. 75, F4
mov qword ptr [rbp-8H], rdi ; 000F _ 48: 89. 7D, F8
; count <= 0: skip the loop and return 0
; Note: Immediate operand could be made smaller by sign extension
cmp dword ptr [rbp-0CH], 0 ; 0013 _ 81. 7D, F4, 00000000
jle ?_1806 ; 001A _ 7E, 5F
; jump with displacement 0: lands on the very next instruction
jmp ?_1804 ; 001C _ EB, 00
; loop head: load current entry from the cursor and store it as the return value
?_1804: mov rax, qword ptr [rbp-8H] ; 001E _ 48: 8B. 45, F8
mov rax, qword ptr [rax] ; 0022 _ 48: 8B. 00
mov qword ptr [rbp-28H], rax ; 0025 _ 48: 89. 45, D8
; reload the value just stored, then follow entry+8H and +28H to the compared data
mov rax, qword ptr [rbp-28H] ; 0029 _ 48: 8B. 45, D8
mov rax, qword ptr [rax+8H] ; 002D _ 48: 8B. 40, 08
mov rax, qword ptr [rax+28H] ; 0031 _ 48: 8B. 40, 28
mov qword ptr [rbp-30H], rax ; 0035 _ 48: 89. 45, D0
; compare the low qword
mov rax, qword ptr [rbp-30H] ; 0039 _ 48: 8B. 45, D0
mov rax, qword ptr [rax] ; 003D _ 48: 8B. 00
cmp rax, qword ptr [rbp-18H] ; 0040 _ 48: 3B. 45, E8
jnz ?_1805 ; 0044 _ 75, 10
; compare the high qword
mov rax, qword ptr [rbp-30H] ; 0046 _ 48: 8B. 45, D0
mov rax, qword ptr [rax+8H] ; 004A _ 48: 8B. 40, 08
cmp rax, qword ptr [rbp-20H] ; 004E _ 48: 3B. 45, E0
jnz ?_1805 ; 0052 _ 75, 02
; both halves equal: return the entry already stored in [rbp-28H]
jmp ?_1807 ; 0054 _ EB, 2D
; no match: advance the cursor by one pointer (8 bytes) through memory
?_1805: mov rax, qword ptr [rbp-8H] ; 0056 _ 48: 8B. 45, F8
; Note: Immediate operand could be made smaller by sign extension
add rax, 8 ; 005A _ 48: 05, 00000008
mov qword ptr [rbp-8H], rax ; 0060 _ 48: 89. 45, F8
; decrement the count through memory, then re-read it for the loop test
mov ecx, dword ptr [rbp-0CH] ; 0064 _ 8B. 4D, F4
; Note: Immediate operand could be made smaller by sign extension
sub ecx, 1 ; 0067 _ 81. E9, 00000001
mov dword ptr [rbp-0CH], ecx ; 006D _ 89. 4D, F4
; Note: Immediate operand could be made smaller by sign extension
cmp dword ptr [rbp-0CH], 0 ; 0070 _ 81. 7D, F4, 00000000
jnz ?_1804 ; 0077 _ 75, A5
; another displacement-0 jump to the next instruction
jmp ?_1806 ; 0079 _ EB, 00
; not found: return value = 0
?_1806: mov qword ptr [rbp-28H], 0 ; 007B _ 48: C7. 45, D8, 00000000
; common exit: load the return value into rax
?_1807: mov rax, qword ptr [rbp-28H] ; 0083 _ 48: 8B. 45, D8
pop rbp ; 0087 _ 5D
ret ; 0088 _ C3
; FindGuid, register-only code generation (no stack frame, no spills).
; NOTE(review): presumably the FPC output named in the title - confirm.
; Register roles as used below:
;   rdi = cursor into the array (advanced by 8 per iteration)
;   esi = remaining count
;   rdx = value compared against qword [guid]
;   rcx = value compared against qword [guid+8H]
;   rax = current entry, and the return value
;   r8  = pointer loaded from [[entry+8H]+28H], i.e. the compared 16 bytes
.text.n_mormot.core.interfaces_$$_findguid$pinterfacefactory$longint$qword$qword$$tinterfacefactory SEGMENT PARA 'CODE' ; section number 49
MORMOT.CORE.INTERFACES_$$_FINDGUID$PINTERFACEFACTORY$LONGINT$QWORD$QWORD$$TINTERFACEFACTORY LABEL NEAR
; count <= 0: skip the loop and return 0
test esi, esi ; 0000 _ 85. F6
jle ?_0283 ; 0002 _ 7E, 23
; padding so that the loop head starts on an 8-byte boundary
; Filling space: 4H
; Filler type: Multi-byte NOP
; db 0FH, 1FH, 40H, 00H
ALIGN 8
; loop head: rax = current entry, r8 = pointer to the compared data
?_0281: mov rax, qword ptr [rdi] ; 0008 _ 48: 8B. 07
mov r8, qword ptr [rax+8H] ; 000B _ 4C: 8B. 40, 08
mov r8, qword ptr [r8+28H] ; 000F _ 4D: 8B. 40, 28
; compare low qword, then high qword, directly against memory
cmp rdx, qword ptr [r8] ; 0013 _ 49: 3B. 10
jnz ?_0282 ; 0016 _ 75, 06
cmp rcx, qword ptr [r8+8H] ; 0018 _ 49: 3B. 48, 08
; match: rax still holds the current entry, so jump straight to ret
jz ?_0284 ; 001C _ 74, 0B
; no match: next entry; the flags set by sub feed jnz, no separate test needed
?_0282: add rdi, 8 ; 001E _ 48: 83. C7, 08
sub esi, 1 ; 0022 _ 83. EE, 01
jnz ?_0281 ; 0025 _ 75, E1
; not found: return 0
?_0283: xor eax, eax ; 0027 _ 31. C0
?_0284: ret ; 0029 _ C3
@synopse
Copy link
Copy Markdown
Author

synopse commented Mar 24, 2026

FPC is much better for sure here.

@synopse
Copy link
Copy Markdown
Author

synopse commented Mar 25, 2026

On Godbolt with GCC 14.2 / Clang 19 (current as of 2026), the loop is at the same level as FPC:

; NOTE(review): the register assignment in this listing does not match a single
; calling convention. f and n are read from rdi/esi (System V AMD64 order for the
; 1st/2nd integer arguments), but gL/gH are read from r8/r9, which are the 3rd/4th
; integer argument registers of the Microsoft x64 convention; under System V they
; would arrive in rdx/rcx, and rdx is used as a scratch register below.
; Verify against actual compiler output before drawing conclusions from it.
FindGuid:
        test    esi, esi
        jle     .no
        mov     r10, rdi                ; f in register, updated in place
.loop:
        mov     rax, [r10]              ; result = *f
        mov     rdx, [rax+8]            ; rtti
        mov     rdx, [rdx+0x28]         ; guid (hoisted offset)
        mov     rax, [rdx]              ; L
        cmp     rax, r8                 ; gL (in reg)
        jne     .next
        mov     rax, [rdx+8]            ; H
        cmp     rax, r9                 ; gH (in reg)
        je      .found
.next:
        add     r10, 8                  ; ++f
        dec     esi                     ; --n
        jnz     .loop
.no:
        xor     eax, eax
        ret
; rax was overwritten by the L/H loads above, so this listing has to reload *f here
.found:
        mov     rax, [r10]              ; return the pointer
        ret

It is even worse, because of the .found branch, which is clearly not needed. 👍

@viniciusfbb
Copy link
Copy Markdown

@synopse
Copy link
Copy Markdown
Author

synopse commented May 8, 2026

Yes, from this function.
This is an optimized brute-force O(n) search function over the TGuid field of an array of class instances.

On Delphi Win32/Win64 the generated asm is very efficient too.

@viniciusfbb
Copy link
Copy Markdown

viniciusfbb commented May 11, 2026

@synopse I reduced the original code to the following 2 units (to facilitate some tests):

/// reduced test unit: only declares what FindGuid() needs to compile
unit mormot.core.interfaces;

interface

uses
  mormot.core.utils;

type
  /// minimal factory class: only carries the RTTI reference read by FindGuid()
  TInterfaceFactory = class
  protected
    // class instance whose Cache.InterfaceGuid is compared by FindGuid()
    fInterfaceRtti: TRttiJson;
  end;
  /// points to one TInterfaceFactory reference, i.e. to an item of an array of them
  PInterfaceFactory = ^TInterfaceFactory;

/// linear search of a 128-bit GUID value within n consecutive factories
// - f points to the first TInterfaceFactory reference, n is the number of items
// - when CPU64BITS is defined, the GUID is supplied as two QWord (gL, gH),
// otherwise as a g pointer to the 16 bytes to compare
// - returns the first matching instance, or nil if none matches or n <= 0
function FindGuid(f: PInterfaceFactory; n: integer;
  {$ifdef CPU64BITS} gL, gH : QWord {$else} g: PHash128Rec {$endif}): TInterfaceFactory;

implementation

function FindGuid(f: PInterfaceFactory; n: integer;
  {$ifdef CPU64BITS} gL, gH : QWord {$else} g: PHash128Rec {$endif}): TInterfaceFactory;
begin
  // n <= 0 skips the loop entirely and falls through to result := nil
  if n > 0 then
    repeat
      // store the candidate in result first, so that a plain exit returns it
      result := f^;
      // reinterpret the PGuid as PHash128Rec to compare it as integers
      with PHash128Rec(result.fInterfaceRtti.Cache.InterfaceGuid)^ do
        {$ifdef CPU64BITS}
        // two 64-bit comparisons against the by-value arguments
        if (L = gL) and
           (H = gH) then
        {$else}
        // four 32-bit comparisons against the pointed value
        if (c0 = g^.c0) and
           (c1 = g^.c1) and
           (c2 = g^.c2) and
           (c3 = g^.c3) then
        {$endif CPU64BITS}
          exit;
      // no match: move to the next array item
      inc(f);
      dec(n);
    until n = 0;
  // reached when n <= 0 or when no item matched
  result := nil;
end;

end.
unit mormot.core.utils;

interface

type
  {$ifdef UNICODE}
  QWord = UInt64;
  {$else}
  QWord = type Int64;
  {$endif UNICODE}
  /// points to an unsigned Int64
  PQWord = ^QWord;

  TBlock128 = array[0..3] of cardinal;
  THash128 = array[0..15] of byte;
  /// 16 bytes of storage, accessible through several overlapping layouts
  // - every variant below maps the same memory: as 2 x 64-bit, 4 x 32-bit,
  // 8 x 16-bit, 16 x 8-bit items, or as a TGuid
  // - declared as packed with an untagged "case integer of" variant part
  THash128Rec = packed record
    case integer of
      // two signed 64-bit halves
      0: (Lo, Hi: Int64);
      // two QWord halves - the L and H fields compared by FindGuid on CPU64BITS
      1: (L, H: QWord);
      // four signed 32-bit items
      2: (i0, i1, i2, i3: integer);
      // four unsigned 32-bit items - compared by FindGuid when CPU64BITS is not defined
      3: (c0, c1, c2 ,c3: cardinal);
      // the same four cardinals, as an array
      4: (c: TBlock128);
      // raw bytes
      5: (b: THash128);
      // eight 16-bit words
      6: (w: array[0..7] of word);
      // GUID view of the same bytes
      7: (guid: TGuid);
  end;
  /// points to a THash128Rec
  PHash128Rec = ^THash128Rec;

  /// enumeration of the kinds of types, as stored in TRttiCache.Kind
  // - also used as the selector type of the TRttiCache variant part
  // - the items from rkUString to rkMRecord only exist when UNICODE is defined,
  // so the number of items depends on the compiler
  TRttiKind = (
    rkUnknown,
    rkInteger,
    rkChar,
    rkEnumeration,
    rkFloat,
    rkSString,
    rkSet,
    rkClass,
    rkMethod,
    rkWChar,
    rkLString,
    rkWString,
    rkVariant,
    rkArray,
    rkRecord,
    rkInterface,
    rkInt64,
    rkDynArray
    {$ifdef UNICODE},
    rkUString,
    rkClassRef,
    rkPointer,
    rkProcedure,
    rkMRecord
    {$endif UNICODE});

  /// per-type information record, with a fixed header and kind-specific fields
  // - the trailing "case TRttiKind of" declares no tag field: the groups of
  // fields below overlay each other in memory
  // - NOTE(review): presumably the Kind field tells which group is meaningful -
  // confirm against the full mORMot source
  // - field order and sizes define the offset of InterfaceGuid, hence the
  // displacement seen in the generated asm of FindGuid
  TRttiCache = record
    // fields common to all kinds
    Info: Pointer;
    Size: integer;
    Kind: TRttiKind;
    Flags: Integer;
    RttiOrd: Integer;
    BinarySize: byte;
    VarDataVType: word;
    RttiVarDataVType: word;
    // kind-specific fields, sharing the same memory
    case TRttiKind of
      rkFloat: (
        RttiFloat: Pointer;
        IsDateTime, IsPureDate: boolean);
      rkLString,
      rkEnumeration,
      rkSet,
      rkDynArray,
      rkArray: (
        ItemInfoManaged: Pointer;
        ItemInfoRaw: Pointer;
        ItemSize: integer;
        ItemCount: integer;
        ObjArrayClass: TClass;
      );
      rkClass: (
        NewInstance: pointer;
        ValueClass: TClass;
        SerializableInterface: pointer;
      );
      rkInterface: (
        NewInterface: pointer;
        // the pointer dereferenced and compared by FindGuid
        InterfaceGuid: PGuid;
        InterfaceFactory: pointer;
        SerializableClass: TClass;
        SerializableInterfaceEntryOffset: integer;
      );
  end;

  /// minimal class holding one TRttiCache record
  TRttiCustom = class
  protected
    // storage behind the Cache property
    fCache: TRttiCache;
  public
    /// read-only access to the fCache record, mapped directly to the field
    property Cache: TRttiCache read fCache;
  end;

  /// descendant of TRttiCustom adding no member in this reduced unit
  // - it is the declared type of TInterfaceFactory.fInterfaceRtti
  TRttiJson = class(TRttiCustom)
  end;

implementation

end.

And what I got from dcc64 was:

mormot.core.interfaces.o:       file format coff-x86-64

Disassembly of section .text:

0000000000000000 <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy>:
       0: 55                            push    rbp
       1: 48 8b ec                      mov     rbp, rsp
       4: 85 d2                         test    edx, edx
       6: 7e 21                         jle     0x29 <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy+0x29>
       8: 48 8b 01                      mov     rax, qword ptr [rcx]
       b: 4c 8b 50 08                   mov     r10, qword ptr [rax + 0x8]
       f: 4d 8b 52 30                   mov     r10, qword ptr [r10 + 0x30]
      13: 4d 39 02                      cmp     qword ptr [r10], r8
      16: 75 06                         jne     0x1e <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy+0x1e>
      18: 4d 39 4a 08                   cmp     qword ptr [r10 + 0x8], r9
      1c: 74 0d                         je      0x2b <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy+0x2b>
      1e: 48 83 c1 08                   add     rcx, 0x8
      22: 83 ea 01                      sub     edx, 0x1
      25: 85 d2                         test    edx, edx
      27: 75 df                         jne     0x8 <_ZN6Mormot4Core10Interfaces8FindGuidEPPNS1_17TInterfaceFactoryEiyy+0x8>
      29: 33 c0                         xor     eax, eax
      2b: 48 8b e5                      mov     rsp, rbp
      2e: 5d                            pop     rbp
      2f: c3                            ret

It seems to me a little better than Clang 19, but it's still worse than FPC.

@synopse
Copy link
Copy Markdown
Author

synopse commented May 12, 2026

To be fair, dcc64 is not really worse than FPC. It is almost the same. FPC just gets rid of an unneeded "test edx, edx" instruction, but in practice the CPU would just ignore it and optimize it away at the micro-op level.
Both Delphi dcc32 and dcc64 generate FPC-like code for this function.
The issue is the LLVM backend usage in Delphi.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment