Skip to content

Instantly share code, notes, and snippets.

@jeremyd2019
Last active April 9, 2025 09:34
Show Gist options
  • Save jeremyd2019/aa167df0a0ae422fa6ebaea5b60c80c9 to your computer and use it in GitHub Desktop.
Save jeremyd2019/aa167df0a0ae422fa6ebaea5b60c80c9 to your computer and use it in GitHub Desktop.
prototype code to find fast cwd pointer on arm64
#include <windows.h>
/* hacks on top of hacks ... */
#define _PEB _NOT__PEB
#define PEB _NOT_PEB
#define PPEB _NOT_PPEB
#define _TEB _NOT__TEB
#define TEB _NOT_TEB
#define PTEB _NOT_PTEB
#include <winternl.h>
#undef _PEB
#undef PEB
#undef PPEB
#undef _TEB
#undef TEB
#undef PTEB
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#if defined (__x86_64__) || defined (__i386__)
#include <udis86.h>
#endif
typedef struct fcwd_access_t fcwd_access_t;
/* hacks from cygwin for testing */
/* Partial declaration of the Process Environment Block.  It stands in for
the winternl.h declaration, which is renamed out of the way by the
#define/#undef pairs around that include.  Only the leading members are
declared; the finder functions in this file read FastPebLock to validate
the lock address they extract from the machine code. */
typedef struct _PEB
{
BYTE Reserved1[2];
BYTE BeingDebugged;
BYTE Reserved2[1];
PVOID Reserved3[2];
PVOID Ldr;
PVOID ProcessParameters;
PVOID Reserved4;
PVOID ProcessHeap;
/* Compared against the critical section address found in ntdll's code. */
PRTL_CRITICAL_SECTION FastPebLock;
/* A lot more follows... */
} PEB, *PPEB;
/* Partial declaration of the Thread Environment Block, replacing the
winternl.h declaration in the same way as PEB above it.  Only the members
up to and including Peb are declared; this file uses it solely to reach
the PEB through NtCurrentTeb ()->Peb. */
typedef struct _TEB
{
NT_TIB Tib;
PVOID EnvironmentPointer;
CLIENT_ID ClientId;
PVOID ActiveRpcHandle;
PVOID ThreadLocalStoragePointer;
/* Pointer to the owning process's PEB. */
PPEB Peb;
/* A lot more follows... */
} TEB, *PTEB;
/* This is the layout used in Windows 8 and later.
main () casts the pointer returned by the finder functions to
struct FAST_CWD_8 ** and prints Path from the structure it points at. */
struct FAST_CWD_8 {
LONG ReferenceCount; /* Only release when this is 0. */
HANDLE DirectoryHandle;
ULONG OldDismountCount; /* Reflects the system DismountCount
at the time the CWD has been set. */
UNICODE_STRING Path; /* Path's Buffer member always refers
to the following Buffer array. */
LONG FSCharacteristics; /* Taken from FileFsDeviceInformation */
/* Inline storage for the path; explicitly aligned to 8 bytes. */
WCHAR Buffer[MAX_PATH] __attribute ((aligned (8)));
};
/* end hacks from cygwin for testing */
#if _WIN32_WINNT < 0x0a00
/* Local stand-in for IsWow64Process2, compiled only when _WIN32_WINNT is
   below 0x0a00.  The real entry point is looked up in KERNEL32 on the first
   call and the result (found or not) is cached in function-local statics.

   Returns whatever the real function returns; if KERNEL32 does not export
   it, sets the last error to ERROR_PROC_NOT_FOUND and returns FALSE.  */
static BOOL
IsWow64Process2 (HANDLE hProcess, USHORT *pProcessMachine, USHORT *pNativeMachine)
{
  typedef BOOL (WINAPI * wow64_query_fn) (HANDLE, USHORT *, USHORT *);
  static wow64_query_fn resolved = NULL;
  static bool lookup_done = false;

  /* Perform the export lookup only once, even when it fails. */
  if (!lookup_done)
    {
      HMODULE kernel32 = GetModuleHandle("KERNEL32");
      resolved = (wow64_query_fn) GetProcAddress (kernel32, "IsWow64Process2");
      lookup_done = true;
    }

  if (!resolved)
    {
      SetLastError (ERROR_PROC_NOT_FOUND);
      return FALSE;
    }

  return resolved (hProcess, pProcessMachine, pNativeMachine);
}
#endif
/* Look up PROCNAME in HMODULE and, on x86/x86_64 builds, see through the
   short x86 entry stub ("fast-forward sequence") that ends in a `jmp rel32'
   (opcode 0xe9), returning the jump target instead of the stub.
   Presumably the target is the native ARM64 implementation when running
   under emulation on ARM64 -- the callers decode it as A64 code.

   On aarch64 builds the GetProcAddress result is returned unchanged.
   If the export does not start with one of the known stubs, the plain
   GetProcAddress result is returned.  Returns NULL if the export does not
   exist.  */
static LPCVOID
GetArm64ProcAddress (HMODULE hModule, LPCSTR procname)
{
  const BYTE *proc = (const BYTE *) GetProcAddress (hModule, procname);
#if defined (__aarch64__)
  return proc;
#else
#if defined (__i386__)
  static const BYTE thunk[] = "\x8b\xff\x55\x8b\xec\x5d\x90\xe9";
  /* no alternate stub known for i386; zero size disables the check */
  static const BYTE thunk2[0] = {};
#elif defined (__x86_64__)
  /* see
     https://learn.microsoft.com/en-us/windows/arm/arm64ec-abi#fast-forward-sequences */
  static const BYTE thunk[] = "\x48\x8b\xc4\x48\x89\x58\x20\x55\x5d\xe9";
  /* on windows 11 22000 the thunk is different than documented on that page */
  static const BYTE thunk2[] = "\x48\x8b\xff\x55\x48\x8b\xec\x5d\x90\xe9";
#else
#error "Unhandled architecture for thunk detection"
#endif
  if (!proc)
    return NULL;

  /* Length of whichever stub matched (without the literal's trailing NUL),
     so the skip below is right even if the two patterns differ in size. */
  size_t matched = 0;
  if (memcmp (proc, thunk, sizeof (thunk) - 1) == 0)
    matched = sizeof (thunk) - 1;
  else if (sizeof (thunk2) && memcmp (proc, thunk2, sizeof (thunk2) - 1) == 0)
    matched = sizeof (thunk2) - 1;

  if (matched)
    {
      /* The stub's last byte is the 0xe9 opcode; the rel32 displacement
         follows at an arbitrary alignment, so copy it out rather than
         dereferencing a casted pointer. */
      int32_t rel32;
      proc += matched;
      memcpy (&rel32, proc, sizeof (rel32));
      /* jmp rel32 is relative to the end of the 4-byte displacement */
      proc += (ptrdiff_t) sizeof (rel32) + rel32;
    }
  return proc;
#endif
}
/* these ids and masks, as well as the names of the various other parts of
instructions used in this file, came from
https://developer.arm.com/documentation/ddi0602/2024-09/Index-by-Encoding
(Arm A-profile A64 Instruction Set Architecture)
*/
/* True if the 32-bit instruction word at PC matches instruction NAME, i.e.
the bits selected by NAME_mask equal NAME_id.  PC is dereferenced, so it
must point at readable code. */
#define IS_INSN(pc, name) ((*(pc) & name##_mask) == name##_id)
/* add (immediate): the mask covers bits 30:22 and leaves bit 31 free, so
both register widths match; the masked bit 22 (shift) must be clear */
static const uint32_t add_id = 0x11000000;
static const uint32_t add_mask = 0x7fc00000;
/* adrp: bit 31 set and bits 28:24 == 10000; the immediate (bits 30:29 and
23:5) and the destination register are not part of the mask */
static const uint32_t adrp_id = 0x90000000;
static const uint32_t adrp_mask = 0x9f000000;
/* b and bl differ only in bit 31; the low 26 bits are the branch offset */
static const uint32_t b_id = 0x14000000;
static const uint32_t b_mask = 0xfc000000;
static const uint32_t bl_id = 0x94000000;
static const uint32_t bl_mask = 0xfc000000;
/* matches both cbz and cbnz */
/* (bit 24 is left out of the mask; so is bit 31, the register width) */
static const uint32_t cbz_id = 0x34000000;
static const uint32_t cbz_mask = 0x7e000000;
/* ldr (immediate, unsigned offset); bit 30 is left out of the mask and is
read separately by the caller to tell the 4-byte from the 8-byte form */
static const uint32_t ldr_id = 0xb9400000;
static const uint32_t ldr_mask = 0xbfc00000;
/* matches both ret and br (which are the same except ret is a 'hint' that
it's a subroutine return) */
/* (bit 22 and the register field in bits 9:5 are left out of the mask) */
static const uint32_t ret_id = 0xd61f0000;
static const uint32_t ret_mask = 0xffbffc1f;
/* Compute the destination address of the branch instruction at PC.
   Works for either bl or b, although this file only calls it for bl.
   PC must point at a b/bl instruction (checked with assert).  */
static inline LPCVOID
extract_bl_target (const uint32_t *pc)
{
  assert (IS_INSN (pc, bl) || IS_INSN (pc, b));

  /* Everything outside the opcode mask is the 26-bit word offset. */
  const uint32_t imm26 = *pc & ~bl_mask;
  /* Bit 25 is the sign bit; when set, fill the opcode bits with ones to
     sign extend the field to 32 bits. */
  const bool negative = (imm26 & (1u << 25)) != 0;
  const int32_t words = (int32_t) (negative ? (imm26 | bl_mask) : imm26);

  /* Pointer arithmetic on uint32_t * scales the word offset by 4. */
  return pc + words;
}
/* Compute the page address produced by the adrp instruction at PC:
   the 4 KiB page containing PC plus the signed 21-bit page delta encoded
   in the instruction, scaled by 4096.
   PC must point at an adrp instruction (checked with assert).  */
static inline uint64_t
extract_adrp_address (const uint32_t *pc)
{
  assert (IS_INSN (pc, adrp));

  const uint32_t insn = *pc;
  const uint64_t page_base = (uint64_t) pc & ~(uint64_t) 0xFFF;

  /* The delta is split in two fields: the low 2 bits live in bits 30:29
     of the instruction, the high 19 bits in bits 23:5. */
  int64_t page_delta = (int64_t) ((((insn >> 5) & 0x7FFFF) << 2)
                                  | ((insn >> (5+19+5)) & 0x3));
  /* sign extend from 21 bits */
  if (page_delta & (1 << 20))
    page_delta -= (int64_t) 1 << 21;

  /* Scale in unsigned arithmetic: left-shifting a negative signed value is
     undefined, while the modular result here is the intended address. */
  return page_base + (uint64_t) page_delta * 4096;
}
/* Locate ntdll's private fast-cwd pointer by pattern-matching ARM64 machine
   code, printing intermediate addresses to stdout as it goes.

   Steps:
     1. In the first 20 instructions of RtlGetCurrentDirectory_U, take the
        target of the first bl (the code calls it
        RtlpReferenceCurrentDirectory).
     2. In the first 20 instructions of that function, find the first
        ldr (immediate, unsigned offset) and require the following
        instruction to be a cbz/cbnz on the loaded register with matching
        register width.
     3. Walking backwards from the ldr, find a bl to RtlEnterCriticalSection
        preceded by an `add x0/w0, xN, #imm' and an `adrp xN', and require
        the resulting address to equal the PEB's FastPebLock.
     4. Walking backwards from the ldr again, find the adrp that sets the
        ldr's base register; page address + ldr offset is the result.

   Returns the address of the pointer variable, or NULL if ntdll or either
   export cannot be found or any step fails to match.  */
fcwd_access_t **
find_fast_cwd_pointer_aarch64 ()
{
  /* Fetch entry points of relevant functions in ntdll.dll. */
  HMODULE ntdll = GetModuleHandle ("ntdll.dll");
  if (!ntdll)
    return NULL;
  LPCVOID get_dir = GetArm64ProcAddress (ntdll, "RtlGetCurrentDirectory_U");
  LPCVOID ent_crit = GetArm64ProcAddress (ntdll, "RtlEnterCriticalSection");
  if (!get_dir || !ent_crit)
    return NULL;
  printf("%p\n", get_dir);

  LPCVOID use_cwd = NULL;
  const uint32_t *start = (const uint32_t *) get_dir;
  const uint32_t *pc = start;
  /* find the call to RtlpReferenceCurrentDirectory, and get its address;
     give up at the first ret/br/b or after 20 instructions */
  for (; pc < start + 20 && !IS_INSN (pc, ret) && !IS_INSN (pc, b); pc++)
    {
      if (IS_INSN (pc, bl))
        {
          use_cwd = extract_bl_target (pc);
          break;
        }
    }
  printf("%p\n", use_cwd);
  if (!use_cwd)
    return NULL;

  start = pc = (const uint32_t *) use_cwd;
  const uint32_t *ldrpc = NULL;
  /* Initialised so the diagnostic printf below never reads an
     indeterminate value when no ldr is found. */
  uint32_t ldroffset = 0, ldrsz = 0;
  uint32_t ldrrn = 0, ldrrd = 0;
  /* find the ldr (immediate unsigned offset) for RtlpCurDirRef */
  for (; pc < start + 20 && !IS_INSN (pc, ret) && !IS_INSN (pc, b); pc++)
    {
      if (IS_INSN (pc, ldr))
        {
          ldrpc = pc;
          /* bit 30 selects the 8-byte form; the 12-bit immediate is scaled
             by the access size */
          ldrsz = (*pc & 0x40000000);
          ldroffset = (*pc >> (5+5)) & 0xFFF;
          ldroffset <<= ldrsz ? 3 : 2;
          ldrrn = (*pc >> 5) & 0x1F;
          ldrrd = *pc & 0x1F;
          break;
        }
    }
  printf("%p -> %x\n", (const void *) pc, ldroffset);
  if (ldrpc == NULL)
    return NULL;

  /* the next instruction after the ldr should be checking if it was NULL:
     either a compare and branch if zero or not zero (hence why cbz_mask is 7e
     instead of 7f).  It must test the register the ldr loaded, and its width
     bit (bit 31) must agree with the ldr's size bit (bit 30). */
  if (!IS_INSN (pc + 1, cbz) || (*(pc + 1) & 0x1F) != ldrrd
      || (*(pc + 1) & 0x80000000) != (ldrsz << 1))
    return NULL;

  /* work backwards, find a bl to RtlEnterCriticalSection whose argument
     is the fast peb lock */
  for (pc = ldrpc; pc >= start; pc--)
    {
      if (IS_INSN (pc, bl) && extract_bl_target (pc) == ent_crit)
        break;
    }
  /* continue backwards to the add that forms the first argument
     (destination register 0) */
  uint32_t addoffset = 0;
  uint32_t addrn = 0;
  for (; pc >= start; pc--)
    {
      if (IS_INSN (pc, add) && (*pc & 0x1F) == 0)
        {
          addoffset = (*pc >> (5+5)) & 0xFFF;
          addrn = (*pc >> 5) & 0x1F;
          break;
        }
    }
  /* ... and further back to the adrp that set the add's source register.
     If any of the previous searches ran off the start of the function,
     this loop does not execute and lockaddr stays NULL. */
  PRTL_CRITICAL_SECTION lockaddr = NULL;
  for (; pc >= start; pc--)
    {
      if (IS_INSN (pc, adrp) && (*pc & 0x1F) == addrn)
        {
          lockaddr = (PRTL_CRITICAL_SECTION) (extract_adrp_address (pc) +
                                              addoffset);
          break;
        }
    }
  if (lockaddr != NtCurrentTeb ()->Peb->FastPebLock)
    return NULL;

  /* work backwards from the ldr to find the corresponding adrp */
  fcwd_access_t **RtlpCurDirRef = NULL;
  for (pc = ldrpc; pc >= start; pc--)
    {
      if (IS_INSN (pc, adrp) && (*pc & 0x1F) == ldrrn)
        {
          RtlpCurDirRef = (fcwd_access_t **) (extract_adrp_address (pc) +
                                              ldroffset);
          break;
        }
    }
  printf("%p -> %p\n", (const void *) pc, (void *) RtlpCurDirRef);
  return RtlpCurDirRef;
}
#if defined (__x86_64__) || defined (__i386__)
/* Pointer width in bits; used both as the udis86 disassembly mode and to
select between the 32-bit and 64-bit instruction patterns below. */
#define PTR_BITS (sizeof (void *) * 8)
/* Locate ntdll's private fast-cwd pointer by disassembling x86/x86_64 code
with udis86, printing intermediate addresses to stdout as it goes.

Steps, each of which stops scanning at the first ret or jmp:
1. In the first 80 bytes of RtlGetCurrentDirectory_U, take the target of
the first relative call.
2. In the first 120 bytes of that function, find the instruction that
materializes the lock address (64-bit: rip-relative lea; 32-bit:
mov reg, imm32 or push imm32) and require it to equal the PEB's
FastPebLock.
3. Unless the lock is already in argument position, find where that
register is passed as the first argument (64-bit: mov rcx, reg;
32-bit: push reg).
4. Require the next relative call to target RtlEnterCriticalSection.
5. Take the next pointer-sized load from a rip-relative or absolute
address into a register as the candidate pointer.
6. Require the following instruction(s) to check that register for zero.

Returns the address of the pointer variable, or NULL if any step fails.

NOTE: the opr0/opr1/insn aliases below are C++ references, so this
function needs a C++ compiler. */
fcwd_access_t **
find_fast_cwd_pointer_x86 ()
{
/* Fetch entry points of relevant functions in ntdll.dll. */
HMODULE ntdll = GetModuleHandle ("ntdll.dll");
if (!ntdll)
return NULL;
const uint8_t *get_dir = (const uint8_t *)
GetProcAddress (ntdll, "RtlGetCurrentDirectory_U");
const uint8_t *ent_crit = (const uint8_t *)
GetProcAddress (ntdll, "RtlEnterCriticalSection");
printf("%p\n", get_dir);
if (!get_dir || !ent_crit)
return NULL;
/* Initialize udis86 */
ud_t ud_obj;
ud_init (&ud_obj);
/* Set mode to current bitness */
ud_set_mode (&ud_obj, PTR_BITS);
/* Only the first 80 bytes of the function are examined. */
ud_set_input_buffer (&ud_obj, get_dir, 80);
/* Set pc (rip) so that subsequent calls to ud_insn_off will return the pc of
the instruction, saving us the hassle of tracking it ourselves */
ud_set_pc (&ud_obj, (uint64_t) get_dir);
/* some short names for more readable code */
const ud_operand_t &opr0 = ud_obj.operand[0],
&opr1 = ud_obj.operand[1];
const ud_mnemonic_code_t &insn = ud_obj.mnemonic;
/* Register currently holding the value being tracked: first the lock
address, later the loaded fast-cwd pointer. */
ud_type_t reg = UD_NONE;
/* Search first relative call instruction in RtlGetCurrentDirectory_U. */
const uint8_t *use_cwd = NULL;
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp)
{
if (insn == UD_Icall)
{
if (opr0.type == UD_OP_JIMM && opr0.size == 32)
{
/* Fetch offset from instruction and compute address of called
function. This function actually fetches the current FAST_CWD
instance and performs some other actions, not important to us.
*/
use_cwd = (const uint8_t *) (ud_obj.pc + opr0.lval.sdword);
break;
}
}
}
printf("%p\n", use_cwd);
if (!use_cwd)
return NULL;
/* Restart disassembly at the called function; only its first 120 bytes
are examined. */
ud_set_input_buffer (&ud_obj, use_cwd, 120);
ud_set_pc (&ud_obj, (uint64_t) use_cwd);
/* Next we search for the locking mechanism and perform a sanity check.
we basically look for the RtlEnterCriticalSection call and test if the
code uses the FastPebLock. */
PRTL_CRITICAL_SECTION lockaddr = NULL;
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp)
{
if (PTR_BITS == 64 && insn == UD_Ilea)
{
/* udis86 seems to follow intel syntax, in that operand 0 is the
dest and 1 is the src */
if (opr1.type == UD_OP_MEM && opr1.base == UD_R_RIP &&
opr1.index == UD_NONE && opr1.scale == 0 && opr1.offset == 32 &&
opr0.type == UD_OP_REG && opr0.size == PTR_BITS)
{
lockaddr = (PRTL_CRITICAL_SECTION) (ud_obj.pc + opr1.lval.sdword);
reg = opr0.base;
break;
}
}
else if (PTR_BITS == 32 && insn == UD_Imov)
{
/* 32-bit: the lock address is an absolute immediate moved into a
register */
if (opr1.type == UD_OP_IMM && opr1.size == 32 &&
opr0.type == UD_OP_REG && opr0.size == PTR_BITS)
{
lockaddr = (PRTL_CRITICAL_SECTION) (intptr_t) opr1.lval.sdword;
reg = opr0.base;
break;
}
}
else if (PTR_BITS == 32 && insn == UD_Ipush)
{
/* 32-bit: the lock address is pushed directly as an immediate */
if (opr0.type == UD_OP_IMM && opr0.size == 32)
{
lockaddr = (PRTL_CRITICAL_SECTION) (intptr_t) opr0.lval.sdword;
/* cheat because the lock is already on the stack, we don't need
to find where it's pushed (just like the case where it's
already in rcx on 64-bit) */
reg = UD_R_RCX;
break;
}
}
}
/* Test if lock address is FastPebLock. */
if (lockaddr != NtCurrentTeb ()->Peb->FastPebLock)
return NULL;
/* Find where the lock address is loaded into rcx as the first parameter of
a function call */
bool found = false;
if (reg != UD_R_RCX)
{
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp)
{
if (PTR_BITS == 64 && insn == UD_Imov)
{
if (opr1.type == UD_OP_REG && opr1.size == PTR_BITS &&
opr1.base == reg && opr0.type == UD_OP_REG &&
opr0.size == PTR_BITS && opr0.base == UD_R_RCX)
{
found = true;
break;
}
}
else if (PTR_BITS == 32 && insn == UD_Ipush)
{
if (opr0.type == UD_OP_REG && opr0.size == PTR_BITS &&
opr0.base == reg)
{
found = true;
break;
}
}
}
if (!found)
return NULL;
}
/* Next is the `callq RtlEnterCriticalSection' */
/* On 32-bit the scan also stops at any push.  The first relative call
encountered must be the one to RtlEnterCriticalSection; any other
target fails the whole search. */
found = false;
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp &&
(PTR_BITS != 32 || insn != UD_Ipush))
{
if (insn == UD_Icall)
{
if (opr0.type == UD_OP_JIMM && opr0.size == 32)
{
if (ent_crit != (const void *) (ud_obj.pc + opr0.lval.sdword))
return NULL;
found = true;
break;
}
}
}
if (!found)
return NULL;
fcwd_access_t **f_cwd_ptr = NULL;
/* now we're looking for a mov rel(%rip), %<reg> */
/* An absolute address with no base register is accepted as well; loads
through any other base register are skipped. */
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp)
{
if (insn == UD_Imov)
{
if (opr1.type == UD_OP_MEM && opr1.size == PTR_BITS &&
opr1.index == UD_NONE && opr1.scale == 0 && opr1.offset == 32 &&
opr0.type == UD_OP_REG && opr0.size == PTR_BITS)
{
if (opr1.base == UD_R_RIP)
f_cwd_ptr = (fcwd_access_t **) (ud_obj.pc + opr1.lval.sdword);
else if (opr1.base == UD_NONE)
f_cwd_ptr = (fcwd_access_t **) (intptr_t) opr1.lval.sdword;
else
continue;
reg = opr0.base;
break;
}
}
}
printf("%llx -> %p\n", ud_obj.pc, f_cwd_ptr);
/* Decode the instruction following the load; it is validated below. */
if (!f_cwd_ptr || !ud_disassemble (&ud_obj))
return NULL;
/* On 32-bit, a register may first be zeroed with xor rY, rY and then
compared against the loaded pointer; remember which register that is. */
ud_type_t zeroreg = UD_NONE;
if (PTR_BITS == 32 && insn == UD_Ixor)
{
if (opr0.type != UD_OP_REG || opr0.size != PTR_BITS ||
opr1.type != UD_OP_REG || opr1.size != PTR_BITS ||
opr0.base != opr1.base)
return NULL;
zeroreg = opr0.base;
if (!ud_disassemble (&ud_obj))
return NULL;
}
/* Check that the next instruction is a test. */
if (insn == UD_Itest)
{
/* ... and that it's testing the same register that the mov above loaded
* the f_cwd_ptr into against itself */
if (opr0.type != UD_OP_REG || opr0.size != PTR_BITS || opr0.base != reg ||
opr1.type != UD_OP_REG || opr1.size != PTR_BITS || opr1.base != reg)
return NULL;
}
else if (PTR_BITS == 32 && insn == UD_Icmp)
{
/* could be xor rY, rY/cmp reg, rY */
if (opr0.type != UD_OP_REG || opr0.size != PTR_BITS ||
opr1.type != UD_OP_REG || opr1.size != PTR_BITS)
return NULL;
/* accept the operands in either order */
if ((opr0.base != reg || opr1.base != zeroreg) &&
(opr0.base != zeroreg || opr1.base != reg))
return NULL;
}
else
return NULL;
return f_cwd_ptr;
}
#endif /* __x86_64__ || __i386__ */
/* Test driver: locate the fast cwd pointer with the finder matching the
   native machine, print the path it refers to, then print the result of
   RtlGetCurrentDirectory_U so the two can be compared by eye.

   Returns 0 if both values could be printed, 1 otherwise.  */
int main (void)
{
  typedef ULONG (WINAPI * RtlGetCurrentDirectory_U_t) (ULONG, LPWSTR);
  int rc = 0;
  RtlGetCurrentDirectory_U_t pRtlGetCurrentDirectory_U = (RtlGetCurrentDirectory_U_t) GetProcAddress(GetModuleHandle("ntdll"), "RtlGetCurrentDirectory_U");
  printf("%p\n", pRtlGetCurrentDirectory_U);

  /* If the native machine cannot be determined, fall back to the x86
     code path (where compiled in). */
  USHORT WowMachine, NativeMachine;
  if (!IsWow64Process2 (GetCurrentProcess (), &WowMachine, &NativeMachine))
    NativeMachine = IMAGE_FILE_MACHINE_AMD64;

  struct FAST_CWD_8 **RtlpCurDirRef = NULL;
  if (NativeMachine == IMAGE_FILE_MACHINE_ARM64)
    RtlpCurDirRef = (struct FAST_CWD_8 **) find_fast_cwd_pointer_aarch64 ();
#if defined (__x86_64__) || defined (__i386__)
  else
    RtlpCurDirRef = (struct FAST_CWD_8 **) find_fast_cwd_pointer_x86 ();
#endif
  printf("%p\n", RtlpCurDirRef);

  /* wait for a keypress before touching the pointer */
  getchar();

  /* Both finders return NULL on any mismatch, and the located variable may
     itself hold NULL (ntdll's own code checks for that), so guard both
     levels before dereferencing. */
  if (RtlpCurDirRef && *RtlpCurDirRef)
    printf("%.*S\n", (*RtlpCurDirRef)->Path.Length/2, (*RtlpCurDirRef)->Path.Buffer);
  else
    {
      fprintf (stderr, "fast cwd pointer not found or not set\n");
      rc = 1;
    }

  if (pRtlGetCurrentDirectory_U)
    {
      /* zero-filled so a failed call still prints an empty string */
      WCHAR buf[MAX_PATH] = {0};
      pRtlGetCurrentDirectory_U (sizeof(buf), buf);
      printf("%S\n", buf);
    }
  else
    {
      fprintf (stderr, "RtlGetCurrentDirectory_U not found in ntdll\n");
      rc = 1;
    }
  return rc;
}
@jeremyd2019
Copy link
Author

jeremyd2019 commented Nov 28, 2024

References:

@dscho
Copy link

dscho commented Feb 7, 2025

Wooow. This looks intricate, and at the same time enticing. Have you verified that it works? If so, how about getting it into Cygwin?

@jeremyd2019
Copy link
Author

https://inbox.sourceware.org/cygwin-patches/[email protected]/t/#u

That's low on my priority list because,

  1. I've never actually seen an issue from ARM64 not having this, and
  2. This whole idea of digging through the machine code to extract a private variable is nasty, and I'm not horribly excited to be perpetuating that

@dscho
Copy link

dscho commented Feb 7, 2025

Makes sense. IIRC the only reason why this is needed is so that Cygwin can delete the current working directory, which is something you can do on Linux, but not on Windows.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment