Last active
April 9, 2025 09:34
-
-
Save jeremyd2019/aa167df0a0ae422fa6ebaea5b60c80c9 to your computer and use it in GitHub Desktop.
prototype code to find fast cwd pointer on arm64
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <windows.h> | |
/* hacks on top of hacks ... */ | |
#define _PEB _NOT__PEB | |
#define PEB _NOT_PEB | |
#define PPEB _NOT_PPEB | |
#define _TEB _NOT__TEB | |
#define TEB _NOT_TEB | |
#define PTEB _NOT_PTEB | |
#include <winternl.h> | |
#undef _PEB | |
#undef PEB | |
#undef PPEB | |
#undef _TEB | |
#undef TEB | |
#undef PTEB | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <assert.h> | |
#include <stdbool.h> | |
#include <stddef.h> | |
#if defined (__x86_64__) || defined (__i386__) | |
#include <udis86.h> | |
#endif | |
typedef struct fcwd_access_t fcwd_access_t; | |
/* hacks from cygwin for testing */ | |
typedef struct _PEB | |
{ | |
BYTE Reserved1[2]; | |
BYTE BeingDebugged; | |
BYTE Reserved2[1]; | |
PVOID Reserved3[2]; | |
PVOID Ldr; | |
PVOID ProcessParameters; | |
PVOID Reserved4; | |
PVOID ProcessHeap; | |
PRTL_CRITICAL_SECTION FastPebLock; | |
/* A lot more follows... */ | |
} PEB, *PPEB; | |
typedef struct _TEB | |
{ | |
NT_TIB Tib; | |
PVOID EnvironmentPointer; | |
CLIENT_ID ClientId; | |
PVOID ActiveRpcHandle; | |
PVOID ThreadLocalStoragePointer; | |
PPEB Peb; | |
/* A lot more follows... */ | |
} TEB, *PTEB; | |
/* This is the layout used in Windows 8 and later. */ | |
struct FAST_CWD_8 { | |
LONG ReferenceCount; /* Only release when this is 0. */ | |
HANDLE DirectoryHandle; | |
ULONG OldDismountCount; /* Reflects the system DismountCount | |
at the time the CWD has been set. */ | |
UNICODE_STRING Path; /* Path's Buffer member always refers | |
to the following Buffer array. */ | |
LONG FSCharacteristics; /* Taken from FileFsDeviceInformation */ | |
WCHAR Buffer[MAX_PATH] __attribute ((aligned (8))); | |
}; | |
/* end hacks from cygwin for testing */ | |
#if _WIN32_WINNT < 0x0a00 | |
static BOOL | |
IsWow64Process2 (HANDLE hProcess, USHORT *pProcessMachine, USHORT *pNativeMachine) | |
{ | |
typedef BOOL (WINAPI * IsWow64Process2_t) (HANDLE, USHORT *, USHORT *); | |
static IsWow64Process2_t pIsWow64Process2 = NULL; | |
static bool bIsWow64Process2Inited = false; | |
if (!bIsWow64Process2Inited) | |
{ | |
pIsWow64Process2 = (IsWow64Process2_t) GetProcAddress ( | |
GetModuleHandle("KERNEL32"), | |
"IsWow64Process2"); | |
bIsWow64Process2Inited = true; | |
} | |
if (pIsWow64Process2) | |
return pIsWow64Process2 (hProcess, pProcessMachine, pNativeMachine); | |
SetLastError (ERROR_PROC_NOT_FOUND); | |
return FALSE; | |
} | |
#endif | |
/* Resolve PROCNAME in HMODULE and return the address of the code that
   actually implements it.

   On an aarch64 build the export address is returned unchanged.  On x86
   and x86_64 builds the start of the export is compared against the known
   thunk byte sequences, each of which ends with the 0xe9 opcode of a
   jmp rel32.  On a match the jump is followed and its target is returned;
   otherwise the export address itself is returned.

   Returns NULL if GetProcAddress fails.  */
static LPCVOID
GetArm64ProcAddress (HMODULE hModule, LPCSTR procname)
{
  const BYTE *proc = (const BYTE *) GetProcAddress (hModule, procname);
#if defined (__aarch64__)
  return proc;
#else
  /* Each entry is a thunk prologue up to and including the 0xe9 opcode.
     The length is taken from the literal and excludes its trailing NUL,
     so entries of different lengths are handled correctly.  */
#define THUNK(literal) { (literal), sizeof (literal) - 1 }
  static const struct
  {
    const char *bytes;
    size_t len;
  } thunks[] = {
#if defined (__i386__)
    THUNK ("\x8b\xff\x55\x8b\xec\x5d\x90\xe9"),
#elif defined (__x86_64__)
    /* see
       https://learn.microsoft.com/en-us/windows/arm/arm64ec-abi#fast-forward-sequences */
    THUNK ("\x48\x8b\xc4\x48\x89\x58\x20\x55\x5d\xe9"),
    /* on windows 11 22000 the thunk is different than documented on that
       page */
    THUNK ("\x48\x8b\xff\x55\x48\x8b\xec\x5d\x90\xe9"),
#else
#error "Unhandled architecture for thunk detection"
#endif
  };
#undef THUNK

  if (!proc)
    return NULL;

  for (size_t i = 0; i < sizeof (thunks) / sizeof (thunks[0]); i++)
    {
      if (memcmp (proc, thunks[i].bytes, thunks[i].len) != 0)
	continue;

      /* The 4-byte little-endian displacement follows the opcode.  It is
	 assembled byte by byte because it is generally not 4-byte aligned,
	 so reading it through an int32_t pointer would be a misaligned
	 access.  */
      const BYTE *disp = proc + thunks[i].len;
      uint32_t rel = (uint32_t) disp[0]
		     | ((uint32_t) disp[1] << 8)
		     | ((uint32_t) disp[2] << 16)
		     | ((uint32_t) disp[3] << 24);

      /* The displacement is relative to the end of the jmp instruction.  */
      return disp + 4 + (int32_t) rel;
    }
  return proc;
#endif
}
/* these ids and masks, as well as the names of the various other parts of
   instructions used in this file, came from
   https://developer.arm.com/documentation/ddi0602/2024-09/Index-by-Encoding
   (Arm A-profile A64 Instruction Set Architecture)
*/

/* True if the 32-bit instruction word at PC belongs to instruction class
   NAME, i.e. the bits selected by NAME_mask equal NAME_id.  PC is
   evaluated once; NAME must be one of the prefixes defined below.  */
#define IS_INSN(pc, name) ((*(pc) & name##_mask) == name##_id)

static const uint32_t add_id = 0x11000000;
static const uint32_t add_mask = 0x7fc00000;

static const uint32_t adrp_id = 0x90000000;
static const uint32_t adrp_mask = 0x9f000000;

static const uint32_t b_id = 0x14000000;
static const uint32_t b_mask = 0xfc000000;

static const uint32_t bl_id = 0x94000000;
static const uint32_t bl_mask = 0xfc000000;

/* matches both cbz and cbnz */
static const uint32_t cbz_id = 0x34000000;
static const uint32_t cbz_mask = 0x7e000000;

static const uint32_t ldr_id = 0xb9400000;
static const uint32_t ldr_mask = 0xbfc00000;

/* matches both ret and br (which are the same except ret is a 'hint' that
   it's a subroutine return).  The mask leaves out bit 22 and the register
   field in bits 5-9.  */
static const uint32_t ret_id = 0xd61f0000;
static const uint32_t ret_mask = 0xffbffc1f;
/* this would work for either bl or b, but we only use it for bl */ | |
static inline LPCVOID | |
extract_bl_target (const uint32_t *pc) | |
{ | |
assert (IS_INSN (pc, bl) || IS_INSN (pc, b)); | |
int32_t offset = *pc & ~bl_mask; | |
/* sign extend */ | |
if (offset & (1 << 25)) | |
offset |= bl_mask; | |
/* Note uint32_t * artithmatic will implicitly multiply the offset by 4 */ | |
return pc + offset; | |
} | |
static inline uint64_t | |
extract_adrp_address (const uint32_t *pc) | |
{ | |
assert (IS_INSN (pc, adrp)); | |
uint64_t adrp_base = (uint64_t) pc & ~0xFFF; | |
int64_t adrp_imm = (*pc >> (5+19+5)) & 0x3; | |
adrp_imm |= ((*pc >> 5) & 0x7FFFF) << 2; | |
/* sign extend */ | |
if (adrp_imm & (1 << 20)) | |
adrp_imm |= ~((1 << 21) - 1); | |
adrp_imm <<= 12; | |
return adrp_base + adrp_imm; | |
} | |
/* Find the address of ntdll's private fast-cwd pointer (called
   RtlpCurDirRef in the comments below) by pattern-matching ARM64 code.

   1. The first bl within the first 20 instructions of
      RtlGetCurrentDirectory_U is taken to be the call to
      RtlpReferenceCurrentDirectory.
   2. Within the first 20 instructions of that callee, find an ldr
      (immediate, unsigned offset) that is immediately followed by a
      cbz/cbnz on the loaded register with the same operand width.
   3. Sanity check: scanning backwards from the ldr, find the bl to
      RtlEnterCriticalSection, then the add that writes register 0, then
      the adrp that writes the add's source register.  The address they
      form must equal the PEB's FastPebLock.
   4. Scanning backwards from the ldr again, find the adrp that writes the
      ldr's base register; its page address plus the ldr offset is the
      result.

   Returns NULL as soon as any step fails.  Intermediate addresses are
   printed to stdout as a debugging aid.  */
fcwd_access_t **
find_fast_cwd_pointer_aarch64 ()
{
  /* Fetch entry points of relevant functions in ntdll.dll. */
  HMODULE ntdll = GetModuleHandle ("ntdll.dll");
  if (!ntdll)
    return NULL;
  LPCVOID get_dir = GetArm64ProcAddress (ntdll, "RtlGetCurrentDirectory_U");
  LPCVOID ent_crit = GetArm64ProcAddress (ntdll, "RtlEnterCriticalSection");
  if (!get_dir || !ent_crit)
    return NULL;
  printf("%p\n", get_dir);

  LPCVOID use_cwd = NULL;
  const uint32_t *start = (const uint32_t *) get_dir;
  const uint32_t *pc = start;
  /* find the call to RtlpReferenceCurrentDirectory, and get its address */
  for (; pc < start + 20 && !IS_INSN (pc, ret) && !IS_INSN (pc, b); pc++)
    {
      if (IS_INSN (pc, bl))
	{
	  use_cwd = extract_bl_target (pc);
	  break;
	}
    }
  printf("%p\n", use_cwd);
  if (!use_cwd)
    return NULL;

  start = pc = (const uint32_t *) use_cwd;
  const uint32_t *ldrpc = NULL;
  /* Initialised so that the diagnostic printf below never reads an
     indeterminate value when no ldr is found.  */
  uint32_t ldroffset = 0, ldrsz = 0;
  uint32_t ldrrn = 0, ldrrd = 0;
  /* find the ldr (immediate unsigned offset) for RtlpCurDirRef */
  for (; pc < start + 20 && !IS_INSN (pc, ret) && !IS_INSN (pc, b); pc++)
    {
      if (IS_INSN (pc, ldr))
	{
	  ldrpc = pc;
	  /* bit 30 distinguishes the 64-bit from the 32-bit form; the
	     12-bit immediate is scaled by the access size */
	  ldrsz = (*pc & 0x40000000);
	  ldroffset = (*pc >> (5+5)) & 0xFFF;
	  ldroffset <<= ldrsz ? 3 : 2;
	  ldrrn = (*pc >> 5) & 0x1F;
	  ldrrd = *pc & 0x1F;
	  break;
	}
    }
  printf("%p -> %x\n", (const void *) pc, ldroffset);
  if (ldrpc == NULL)
    return NULL;

  /* the next instruction after the ldr should be checking if it was NULL:
     either a compare and branch if zero or not zero (hence why cbz_mask is 7e
     instead of 7f).  It must test the register the ldr wrote, and its
     width bit (bit 31) must agree with the ldr's size bit (bit 30).  */
  if (!IS_INSN (ldrpc + 1, cbz) || (*(ldrpc + 1) & 0x1F) != ldrrd
      || (*(ldrpc + 1) & 0x80000000) != (ldrsz << 1))
    return NULL;

  /* work backwards, find a bl to RtlEnterCriticalSection whose argument
     is the fast peb lock */
  for (pc = ldrpc; pc >= start; pc--)
    {
      if (IS_INSN (pc, bl) && extract_bl_target (pc) == ent_crit)
	break;
    }

  /* continue backwards to the add that writes register 0 ...  */
  uint32_t addoffset = 0;
  uint32_t addrn = 0;
  for (; pc >= start; pc--)
    {
      if (IS_INSN (pc, add) && (*pc & 0x1F) == 0)
	{
	  addoffset = (*pc >> (5+5)) & 0xFFF;
	  addrn = (*pc >> 5) & 0x1F;
	  break;
	}
    }

  /* ... and on to the adrp that writes the add's source register.  If an
     earlier search ran off the start, pc is already below start, this loop
     does not run and lockaddr stays NULL.  */
  PRTL_CRITICAL_SECTION lockaddr = NULL;
  for (; pc >= start; pc--)
    {
      if (IS_INSN (pc, adrp) && (*pc & 0x1F) == addrn)
	{
	  lockaddr = (PRTL_CRITICAL_SECTION) (extract_adrp_address (pc) +
					      addoffset);
	  break;
	}
    }
  if (lockaddr != NtCurrentTeb ()->Peb->FastPebLock)
    return NULL;

  /* work backwards from the ldr to find the corresponding adrp */
  fcwd_access_t **RtlpCurDirRef = NULL;
  for (pc = ldrpc; pc >= start; pc--)
    {
      if (IS_INSN (pc, adrp) && (*pc & 0x1F) == ldrrn)
	{
	  RtlpCurDirRef = (fcwd_access_t **) (extract_adrp_address (pc) +
					      ldroffset);
	  break;
	}
    }
  printf("%p -> %p\n", (const void *) pc, (void *) RtlpCurDirRef);
  return RtlpCurDirRef;
}
#if defined (__x86_64__) || defined (__i386__) | |
#define PTR_BITS (sizeof (void *) * 8) | |
fcwd_access_t ** | |
find_fast_cwd_pointer_x86 () | |
{ | |
/* Fetch entry points of relevant functions in ntdll.dll. */ | |
HMODULE ntdll = GetModuleHandle ("ntdll.dll"); | |
if (!ntdll) | |
return NULL; | |
const uint8_t *get_dir = (const uint8_t *) | |
GetProcAddress (ntdll, "RtlGetCurrentDirectory_U"); | |
const uint8_t *ent_crit = (const uint8_t *) | |
GetProcAddress (ntdll, "RtlEnterCriticalSection"); | |
printf("%p\n", get_dir); | |
if (!get_dir || !ent_crit) | |
return NULL; | |
/* Initialize udis86 */ | |
ud_t ud_obj; | |
ud_init (&ud_obj); | |
/* Set mode to current bitness */ | |
ud_set_mode (&ud_obj, PTR_BITS); | |
ud_set_input_buffer (&ud_obj, get_dir, 80); | |
/* Set pc (rip) so that subsequent calls to ud_insn_off will return the pc of | |
the instruction, saving us the hassle of tracking it ourselves */ | |
ud_set_pc (&ud_obj, (uint64_t) get_dir); | |
/* some short names for more readable code */ | |
const ud_operand_t &opr0 = ud_obj.operand[0], | |
&opr1 = ud_obj.operand[1]; | |
const ud_mnemonic_code_t &insn = ud_obj.mnemonic; | |
ud_type_t reg = UD_NONE; | |
/* Search first relative call instruction in RtlGetCurrentDirectory_U. */ | |
const uint8_t *use_cwd = NULL; | |
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp) | |
{ | |
if (insn == UD_Icall) | |
{ | |
if (opr0.type == UD_OP_JIMM && opr0.size == 32) | |
{ | |
/* Fetch offset from instruction and compute address of called | |
function. This function actually fetches the current FAST_CWD | |
instance and performs some other actions, not important to us. | |
*/ | |
use_cwd = (const uint8_t *) (ud_obj.pc + opr0.lval.sdword); | |
break; | |
} | |
} | |
} | |
printf("%p\n", use_cwd); | |
if (!use_cwd) | |
return NULL; | |
ud_set_input_buffer (&ud_obj, use_cwd, 120); | |
ud_set_pc (&ud_obj, (uint64_t) use_cwd); | |
/* Next we search for the locking mechanism and perform a sanity check. | |
we basically look for the RtlEnterCriticalSection call and test if the | |
code uses the FastPebLock. */ | |
PRTL_CRITICAL_SECTION lockaddr = NULL; | |
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp) | |
{ | |
if (PTR_BITS == 64 && insn == UD_Ilea) | |
{ | |
/* udis86 seems to follow intel syntax, in that operand 0 is the | |
dest and 1 is the src */ | |
if (opr1.type == UD_OP_MEM && opr1.base == UD_R_RIP && | |
opr1.index == UD_NONE && opr1.scale == 0 && opr1.offset == 32 && | |
opr0.type == UD_OP_REG && opr0.size == PTR_BITS) | |
{ | |
lockaddr = (PRTL_CRITICAL_SECTION) (ud_obj.pc + opr1.lval.sdword); | |
reg = opr0.base; | |
break; | |
} | |
} | |
else if (PTR_BITS == 32 && insn == UD_Imov) | |
{ | |
if (opr1.type == UD_OP_IMM && opr1.size == 32 && | |
opr0.type == UD_OP_REG && opr0.size == PTR_BITS) | |
{ | |
lockaddr = (PRTL_CRITICAL_SECTION) (intptr_t) opr1.lval.sdword; | |
reg = opr0.base; | |
break; | |
} | |
} | |
else if (PTR_BITS == 32 && insn == UD_Ipush) | |
{ | |
if (opr0.type == UD_OP_IMM && opr0.size == 32) | |
{ | |
lockaddr = (PRTL_CRITICAL_SECTION) (intptr_t) opr0.lval.sdword; | |
/* cheat because the lock is already on the stack, we don't need | |
to find where it's pushed (just like the case where it's | |
already in rcx on 64-bit) */ | |
reg = UD_R_RCX; | |
break; | |
} | |
} | |
} | |
/* Test if lock address is FastPebLock. */ | |
if (lockaddr != NtCurrentTeb ()->Peb->FastPebLock) | |
return NULL; | |
/* Find where the lock address is loaded into rcx as the first parameter of | |
a function call */ | |
bool found = false; | |
if (reg != UD_R_RCX) | |
{ | |
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp) | |
{ | |
if (PTR_BITS == 64 && insn == UD_Imov) | |
{ | |
if (opr1.type == UD_OP_REG && opr1.size == PTR_BITS && | |
opr1.base == reg && opr0.type == UD_OP_REG && | |
opr0.size == PTR_BITS && opr0.base == UD_R_RCX) | |
{ | |
found = true; | |
break; | |
} | |
} | |
else if (PTR_BITS == 32 && insn == UD_Ipush) | |
{ | |
if (opr0.type == UD_OP_REG && opr0.size == PTR_BITS && | |
opr0.base == reg) | |
{ | |
found = true; | |
break; | |
} | |
} | |
} | |
if (!found) | |
return NULL; | |
} | |
/* Next is the `callq RtlEnterCriticalSection' */ | |
found = false; | |
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp && | |
(PTR_BITS != 32 || insn != UD_Ipush)) | |
{ | |
if (insn == UD_Icall) | |
{ | |
if (opr0.type == UD_OP_JIMM && opr0.size == 32) | |
{ | |
if (ent_crit != (const void *) (ud_obj.pc + opr0.lval.sdword)) | |
return NULL; | |
found = true; | |
break; | |
} | |
} | |
} | |
if (!found) | |
return NULL; | |
fcwd_access_t **f_cwd_ptr = NULL; | |
/* now we're looking for a mov rel(%rip), %<reg> */ | |
while (ud_disassemble (&ud_obj) && insn != UD_Iret && insn != UD_Ijmp) | |
{ | |
if (insn == UD_Imov) | |
{ | |
if (opr1.type == UD_OP_MEM && opr1.size == PTR_BITS && | |
opr1.index == UD_NONE && opr1.scale == 0 && opr1.offset == 32 && | |
opr0.type == UD_OP_REG && opr0.size == PTR_BITS) | |
{ | |
if (opr1.base == UD_R_RIP) | |
f_cwd_ptr = (fcwd_access_t **) (ud_obj.pc + opr1.lval.sdword); | |
else if (opr1.base == UD_NONE) | |
f_cwd_ptr = (fcwd_access_t **) (intptr_t) opr1.lval.sdword; | |
else | |
continue; | |
reg = opr0.base; | |
break; | |
} | |
} | |
} | |
printf("%llx -> %p\n", ud_obj.pc, f_cwd_ptr); | |
if (!f_cwd_ptr || !ud_disassemble (&ud_obj)) | |
return NULL; | |
ud_type_t zeroreg = UD_NONE; | |
if (PTR_BITS == 32 && insn == UD_Ixor) | |
{ | |
if (opr0.type != UD_OP_REG || opr0.size != PTR_BITS || | |
opr1.type != UD_OP_REG || opr1.size != PTR_BITS || | |
opr0.base != opr1.base) | |
return NULL; | |
zeroreg = opr0.base; | |
if (!ud_disassemble (&ud_obj)) | |
return NULL; | |
} | |
/* Check that the next instruction is a test. */ | |
if (insn == UD_Itest) | |
{ | |
/* ... and that it's testing the same register that the mov above loaded | |
* the f_cwd_ptr into against itself */ | |
if (opr0.type != UD_OP_REG || opr0.size != PTR_BITS || opr0.base != reg || | |
opr1.type != UD_OP_REG || opr1.size != PTR_BITS || opr1.base != reg) | |
return NULL; | |
} | |
else if (PTR_BITS == 32 && insn == UD_Icmp) | |
{ | |
/* could be xor rY, rY/cmp reg, rY */ | |
if (opr0.type != UD_OP_REG || opr0.size != PTR_BITS || | |
opr1.type != UD_OP_REG || opr1.size != PTR_BITS) | |
return NULL; | |
if ((opr0.base != reg || opr1.base != zeroreg) && | |
(opr0.base != zeroreg || opr1.base != reg)) | |
return NULL; | |
} | |
else | |
return NULL; | |
return f_cwd_ptr; | |
} | |
#endif /* __x86_64__ || __i386__ */ | |
int main (void) | |
{ | |
typedef ULONG (WINAPI * RtlGetCurrentDirectory_U_t) (ULONG, LPWSTR); | |
RtlGetCurrentDirectory_U_t pRtlGetCurrentDirectory_U = (RtlGetCurrentDirectory_U_t) GetProcAddress(GetModuleHandle("ntdll"), "RtlGetCurrentDirectory_U"); | |
printf("%p\n", pRtlGetCurrentDirectory_U); | |
USHORT WowMachine, NativeMachine; | |
if (!IsWow64Process2 (GetCurrentProcess (), &WowMachine, &NativeMachine)) | |
NativeMachine = IMAGE_FILE_MACHINE_AMD64; | |
struct FAST_CWD_8 **RtlpCurDirRef = NULL; | |
if (NativeMachine == IMAGE_FILE_MACHINE_ARM64) | |
RtlpCurDirRef = (struct FAST_CWD_8 **) find_fast_cwd_pointer_aarch64 (); | |
#if defined (__x86_64__) || defined (__i386__) | |
else | |
RtlpCurDirRef = (struct FAST_CWD_8 **) find_fast_cwd_pointer_x86 (); | |
#endif | |
printf("%p\n", RtlpCurDirRef); | |
getchar(); | |
printf("%.*S\n", (*RtlpCurDirRef)->Path.Length/2, (*RtlpCurDirRef)->Path.Buffer); | |
WCHAR buf[MAX_PATH]; | |
pRtlGetCurrentDirectory_U (sizeof(buf), buf); | |
printf("%S\n", buf); | |
return 0; | |
} |
Wooow. This looks intricate, and at the same time enticing. Have you verified that it works? If so, how about getting it into Cygwin?
https://inbox.sourceware.org/cygwin-patches/[email protected]/t/#u
That's low on my priority list because,
- I've never actually seen an issue from ARM64 not having this, and
- This whole idea of digging through the machine code to extract a private variable is nasty, and I'm not horribly excited to be perpetuating that
Makes sense. IIRC the only reason why this is needed is so that Cygwin can delete the current working directory, which is something you can do on Linux, but not on Windows.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
References: the IS_INSN macro and the adrp mask and handling (though I may have found a bug in that: it isn't sign-extending; see ruby/ruby#12222).