Group2
head_32.S and head_64.S
head_64.S is relevant for the x86_64 architecture.
vmlinux-objs-y selects the appropriate file (head_32.o or head_64.o) based on $(BITS).
$(BITS) is determined in arch/x86/Makefile based on the kernel configuration (CONFIG_X86_32).
vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
$(obj)/string.o $(obj)/cmdline.o \
$(obj)/piggy.o $(obj)/cpuflags.o
.head.text and .code32
The KEEP_SEGMENTS flag in the kernel setup header determines whether the segment registers need to be reloaded.
arch/x86/boot/compressed/head_64.S
movl $boot_stack_end, %eax
addl %ebp, %eax
movl %eax, %esp
ebp: real address of the startup_32 label
eax: address of boot_stack_end as it was linked at 0x0
sum of ebp and eax → real address of boot_stack_end
note: We need the real address of the startup_32 label and the relative (link-time) address of boot_stack_end to get the address the stack pointer should point to.
arch/x86/boot/compressed/head_64.S
.bss
.balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
note: boot_stack_end is in the .bss section
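A minimal sketch of that address arithmetic in C, with made-up example values:
/* Sketch of the stack-pointer calculation above; the two inputs are assumptions. */
#include <stdio.h>

int main(void)
{
    unsigned long ebp = 0x100000;             /* real load address of startup_32 (assumed) */
    unsigned long boot_stack_end = 0x5000;    /* link-time address of boot_stack_end, image linked at 0x0 (assumed) */

    unsigned long esp = ebp + boot_stack_end; /* %esp now points at the real boot_stack_end */
    printf("esp = %#lx\n", esp);
    return 0;
}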
call verify_cpu
testl %eax, %eax
jnz no_longmode
verify_cpu: returns 0 if the CPU supports long mode
no_longmode: halt
note: After we have set up the stack pointer, the verify_cpu function checks whether the CPU supports long mode and SSE (Streaming SIMD Extensions).
movl $0x1,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK0,%edx
xorl $REQUIRED_MASK0,%edx
jnz .Lverify_cpu_no_longmode
movl $0x80000000,%eax # See if extended cpuid is implemented
cpuid
cmpl $0x80000001,%eax
jb .Lverify_cpu_no_longmode # no extended cpuid
movl $0x80000001,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK1,%edx
xorl $REQUIRED_MASK1,%edx
jnz .Lverify_cpu_no_longmode
note: cpu_check
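For reference, a user-space sketch of the same long-mode check using the compiler's <cpuid.h> helpers (an assumption for illustration; the kernel code above issues cpuid directly):
/* Check the Long Mode (LM) flag: leaf 0x80000001, EDX bit 29. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* Leaf 0x80000000 reports the highest extended leaf available. */
    if (!__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) || eax < 0x80000001) {
        puts("no extended cpuid -> no long mode");
        return 1;
    }

    __get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
    printf("long mode %s\n", (edx & (1u << 29)) ? "supported" : "not supported");
    return 0;
}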
movl $1,%eax
cpuid
andl $SSE_MASK,%edx
cmpl $SSE_MASK,%edx
je .Lverify_cpu_sse_ok
test %di,%di
jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax # enable SSE
wrmsr
xor %di,%di # don't loop
jmp .Lverify_cpu_sse_test # try again
note: sse_test
note: If everything is OK, verify_cpu returns 0. We now have to calculate the relocation address.
CONFIG_PHYSICAL_START: default base address of the Linux kernel
The value of CONFIG_PHYSICAL_START is 0x1000000 (16 MB).
A rescue kernel for kdump is configured to load from a different address → CONFIG_RELOCATABLE=y
This builds a kernel image that retains relocation information so it can be loaded someplace besides the default 1MB.
Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address it has been loaded at, and the compile-time physical address (CONFIG_PHYSICAL_START) is used as the minimum location.
A special section attribute is set in arch/x86/boot/compressed/head_64.S before startup_32
__HEAD
.code32
ENTRY(startup_32)
note: We now dive into the assembly code. A special section attribute is set in this file before the startup_32 entry point.
#define __HEAD .section ".head.text","ax"
__HEAD is a macro defined in include/linux/init.h.
.head.text: indicates that the following code contains executable instructions
a: this section is allocatable
x: this section can be executed by the CPU
note: From the flags and the macro seen in the assembly code, this kernel section can be booted from a different address.
→ Compile the decompressor as position independent code (PIC)
note: Position-independent code is not tied to a specific address; it usually uses offsets instead of absolute addresses.
arch/x86/boot/compressed/Makefile
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
The target address is obtained by adding the address field of the instruction to the value of the program counter.
→ depending on CONFIG_RELOCATABLE
#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
notl %eax
andl %eax, %ebx
cmpl $LOAD_PHYSICAL_ADDR, %ebx
jge 1f
#endif
movl $LOAD_PHYSICAL_ADDR, %ebx
note: To find the load address, the value has to be aligned to the kernel alignment (BP_kernel_alignment).
LOAD_PHYSICAL_ADDR
macro
#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ (CONFIG_PHYSICAL_ALIGN - 1)) \
& ~(CONFIG_PHYSICAL_ALIGN - 1))
note: LOAD_PHYSICAL_ADDR just expands to CONFIG_PHYSICAL_START aligned up to CONFIG_PHYSICAL_ALIGN; it represents the physical address where the kernel will be loaded.
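A worked example of the macro with assumed config values (CONFIG_PHYSICAL_START = 0x1000000, CONFIG_PHYSICAL_ALIGN = 0x200000):
/* Evaluate LOAD_PHYSICAL_ADDR with assumed config values. */
#include <stdio.h>

#define CONFIG_PHYSICAL_START 0x1000000UL   /* assumed */
#define CONFIG_PHYSICAL_ALIGN 0x200000UL    /* assumed (2 MB) */

#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
                + (CONFIG_PHYSICAL_ALIGN - 1)) \
                & ~(CONFIG_PHYSICAL_ALIGN - 1))

int main(void)
{
    /* 0x1000000 is already 2 MB aligned, so the result stays 0x1000000. */
    printf("LOAD_PHYSICAL_ADDR = %#lx\n", LOAD_PHYSICAL_ADDR);
    return 0;
}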
1:
movl BP_init_size(%esi), %eax
subl $_end, %eax
addl %eax, %ebx
BP_init_size: the larger of the compressed and uncompressed vmlinux sizes
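A worked example of this relocation-target arithmetic, with made-up sizes: ebx ends up as the aligned load address plus (BP_init_size - _end), so the compressed image is moved toward the end of the decompression buffer.
/* Illustrative numbers only, not real boot values. */
#include <stdio.h>

int main(void)
{
    unsigned long load_addr  = 0x1000000;  /* aligned load address (assumed) */
    unsigned long init_size  = 0x1000000;  /* BP_init_size (assumed) */
    unsigned long end_offset = 0x300000;   /* link-time _end of the compressed image (assumed) */

    unsigned long relocate_to = load_addr + (init_size - end_offset);
    printf("relocate the compressed image to %#lx\n", relocate_to);
    return 0;
}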
addl %ebp, gdt+2(%ebp)
lgdt gdt(%ebp)
Adjust the base address of the Global Descriptor Table to the address where we actually loaded the kernel
Load the Global Descriptor Table with the lgdt instruction
.data
...
gdt:
.word gdt_end - gdt
.long gdt
.word 0
.quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
.quad 0x0000000000000000 /* TS continued */
gdt_end:
movl %cr4, %eax
orl $X86_CR4_PAE, %eax
movl %eax, %cr4
→ Put the value of the cr4 register into eax, set the PAE bit (bit 5), and load it back into cr4.
8 new general purpose registers from r8 to r15
All general purpose registers are 64-bit now
A 64-bit instruction pointer - RIP
64-Bit Addresses and Operands
RIP Relative Addressing
Long mode consists of two sub-modes: 64-bit mode and compatibility mode.
Enable PAE
Build page tables and load the address of the top level page table into the cr3 register
Enable EFER.LME
Enable paging
4-level paging, and we generally build 6 page tables:
One PML4 (Page Map Level 4) table with one entry
One PDP (Page Directory Pointer) table with four entries
Four Page Directory tables with a total of 2048 entries
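A quick sanity check of those numbers (512 entries per 4 KB table, 2 MB pages because the PS bit is set in the page directory entries):
/* 4 page directories x 512 entries x 2 MB pages = 4 GB mapped. */
#include <stdio.h>

int main(void)
{
    unsigned long long pd_tables      = 4;
    unsigned long long entries_per_pd = 512;          /* 4096 bytes / 8-byte entries */
    unsigned long long page_size      = 2ULL << 20;   /* 2 MB pages (PS bit set) */

    unsigned long long entries = pd_tables * entries_per_pd;  /* 2048 */
    unsigned long long mapped  = entries * page_size;         /* 4 GB  */

    printf("%llu entries mapping %llu GB\n", entries, mapped >> 30);
    return 0;
}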
leal pgtable(%ebx), %edi
xorl %eax, %eax
movl $(BOOT_INIT_PGT_SIZE/4), %ecx
rep stosl
pgtable is defined here:
.section ".pgtable","a",@nobits
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
BOOT_PGT_SIZE depends on the CONFIG_X86_VERBOSE_BOOTUP kernel configuration option:
#ifdef CONFIG_RANDOMIZE_BASE
# ifdef CONFIG_X86_VERBOSE_BOOTUP
#  define BOOT_PGT_SIZE (19*4096)
# else /* !CONFIG_X86_VERBOSE_BOOTUP */
#  define BOOT_PGT_SIZE (17*4096)
# endif
#else /* !CONFIG_RANDOMIZE_BASE */
# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE
#endif
Build the PML4 entry:
leal pgtable + 0(%ebx), %edi
leal 0x1007 (%edi), %eax
movl %eax, 0(%edi)
Build the Page Directory entries in the Page Directory Pointer table:
leal pgtable + 0x1000(%ebx), %edi
leal 0x1007(%edi), %eax
movl $4, %ecx
1: movl %eax, 0x00(%edi)
addl $0x00001000, %eax
addl $8, %edi
decl %ecx
jnz 1b
Build 2048 page table entries with 2-MByte pages:
leal pgtable + 0x2000(%ebx), %edi
movl $0x00000183, %eax
movl $2048, %ecx
1: movl %eax, 0(%edi)
addl $0x00200000, %eax
addl $8, %edi
decl %ecx
jnz 1b
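Putting the three loops together, here is a C sketch of the page-table layout they build; the function and buffer names are illustrative, and the flag values (0x7, 0x183) are taken from the assembly above.
/* Sketch of the early identity-mapped page tables: PML4 at offset 0,
 * PDPT at 0x1000, four page directories starting at 0x2000.
 * 0x7 = present + writable + user; 0x183 = present + writable + 2 MB page + global. */
#include <stdint.h>
#include <string.h>

#define PTRS_PER_TABLE 512

static uint64_t pgtable[6 * PTRS_PER_TABLE] __attribute__((aligned(4096)));

static void build_identity_map(uint64_t load_addr /* runtime address of pgtable */)
{
    uint64_t *pml4 = pgtable;                      /* pgtable + 0x0000 */
    uint64_t *pdpt = pgtable + PTRS_PER_TABLE;     /* pgtable + 0x1000 */
    uint64_t *pd   = pgtable + 2 * PTRS_PER_TABLE; /* pgtable + 0x2000 */

    memset(pgtable, 0, sizeof(pgtable));

    /* One PML4 entry pointing at the PDPT. */
    pml4[0] = (load_addr + 0x1000) | 0x7;

    /* Four PDPT entries, each pointing at one page directory. */
    for (int i = 0; i < 4; i++)
        pdpt[i] = (load_addr + 0x2000 + (uint64_t)i * 0x1000) | 0x7;

    /* 2048 PD entries mapping 0..4 GB with 2 MB pages. */
    for (int i = 0; i < 2048; i++)
        pd[i] = ((uint64_t)i * 0x200000) | 0x183;
}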
Put the address of the PML4 into the cr3 control register:
leal pgtable(%ebx), %eax
movl %eax, %cr3
Set the EFER.LME flag in the MSR at 0xC0000080:
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
wrmsr
pushl $__KERNEL_CS
Put the address of the startup_64 routine in eax:
leal startup_64(%ebp), %eax
Push eax to the stack and enable paging:
pushl %eax
movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0
lret
.code64
.org 0x200
ENTRY(startup_64)
....
....
....
What is the role of the KEEP_SEGMENTS flag in the Linux boot protocol and its impact on segment register initialization during the boot process?
(a) It determines the boot protocol version
(b) It controls the loading of segment registers during boot
(c) It sets up heap memory allocation
startup_32 → startup_64
arch/x86/boot/compressed/head_64.S
pushl $__KERNEL_CS
leal startup_64(%ebp), %eax
...
...
...
pushl %eax
...
...
...
lret
ebp: the physical address of startup_32
note: In the previous part, we already covered the jump from startup_32 to startup_64. The main purpose of the startup_64 function is to decompress the compressed kernel image and jump right to it.
Why again? (The GDT already has segments for 64-bit mode.)
note: Remember, we updated the Global Descriptor Table with 64-bit segments in the previous part.
arch/x86/boot/compressed/head_64.S
.code64
.org 0x200
ENTRY(startup_64)
xorl %eax, %eax
movl %eax, %ds
movl %eax, %es
movl %eax, %ss
movl %eax, %fs
movl %eax, %gs
All segment registers besides the cs register are now reset in long mode.
Why again?
note: We've done this before in the startup_32 function, but we need to do this calculation again because the bootloader can use the 64-bit boot protocol, in which case startup_32 is never executed.
#ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip), %rbp
movl BP_kernel_alignment(%rsi), %eax
decl %eax
addq %rax, %rbp
notq %rax
andq %rax, %rbp
cmpq $LOAD_PHYSICAL_ADDR, %rbp
jge 1f
#endif
movq $LOAD_PHYSICAL_ADDR, %rbp
1:
movl BP_init_size(%rsi), %ebx
subl $_end, %ebx
addq %rbp, %rbx
rsi: pointer to the boot_params table
rbp: the decompressed kernel's start address
rbx: the address where the kernel code will be relocated to for decompression
note: Because we're now in long mode, the first line can simply use RIP-relative addressing to get the physical address of startup_32.
sp, flags register and GDT
Set up stack pointer
leaq boot_stack_end(%rbx), %rsp
...
.bss
.balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
Set up the Global Descriptor Table
leaq gdt(%rip), %rax
movq %rax, gdt64+2(%rip)
lgdt gdt64(%rip)
...
.data
gdt64:
.word gdt_end - gdt
.long 0
.word 0
.quad 0
gdt:
.word gdt_end - gdt
.long gdt
.word 0
.quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
.quad 0x0000000000000000 /* TS continued */
gdt_end:
Zero EFLAGS
pushq $0
popfq
note: Zeroing the EFLAGS register is done to ensure a clean state of the processor's flags during the kernel boot process. This helps prevent any unintended side effects or undefined behavior by resetting the flags to a known state before executing the decompression process.
note: Since the stack is now correct, we can copy the compressed kernel to the address that we got above, when we calculated the relocation address of the decompressed kernel
Copy the compressed kernel to the end of our buffer, where decompression in place becomes safe
pushq %rsi
leaq (_bss-8)(%rip), %rsi
leaq (_bss-8)(%rbx), %rdi
movq $_bss, %rcx
shrq $3, %rcx
std
rep movsq
cld
popq %rsi
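In C terms, the std/rep movsq/cld sequence is a backwards quadword copy, which is what makes the overlapping move safe when the destination lies higher in memory; a sketch (the function name is ours). In head_64.S the count is $_bss shifted right by 3, i.e. the image size up to _bss in 8-byte units.
/* Equivalent of: std; rep movsq; cld, with rsi/rdi starting at the last quadword. */
#include <stdint.h>
#include <stddef.h>

static void copy_backwards(uint64_t *dst, const uint64_t *src, size_t nquads)
{
    /* Copy from the end toward the start so an overlapping, higher
     * destination never overwrites source data it still needs. */
    for (size_t i = nquads; i-- > 0; )
        dst[i] = src[i];
}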
The relocated label is in the .text section (defined in arch/x86/boot/compressed/vmlinux.lds.S).
Jump to the relocated address
leaq relocated(%rbx), %rax
jmp *%rax
...
.text
relocated:
/* decompress the kernel */
Clear the .bss section (stack is currently empty)
arch/x86/boot/compressed/head_64.S
.text
relocated:
xorl %eax, %eax
leaq _bss(%rip), %rdi
leaq _ebss(%rip), %rcx
subq %rdi, %rcx
shrq $3, %rcx
rep stosq
Call the extract_kernel function
pushq %rsi
movq %rsi, %rdi
leaq boot_heap(%rip), %rsi
leaq input_data(%rip), %rdx
movl $z_input_len, %ecx
movq %rbp, %r8
movq $z_output_len, %r9
call extract_kernel
popq %rsi
arch/x86/boot/compressed/misc.c
asmlinkage __visible void *extract_kernel(
void *rmode, memptr heap,
unsigned char *input_data,
unsigned long input_len,
unsigned char *output,
unsigned long output_len)
Arguments of extract_kernel:
rmode: a pointer to the boot_params structure
heap: a pointer to boot_heap
input_data*: a pointer to the start of the compressed kernel
input_len*: the size of the compressed kernel
output: the start address of the decompressed kernel
output_len*: the size of the decompressed kernel
* generated by arch/x86/boot/compressed/mkpiggy.c
note: The extract_kernel function starts with the video/console initialization that we already saw in the previous parts. We need to do this again because we don't know if we started in real mode or if a bootloader was used, or whether the bootloader used the 32-bit or 64-bit boot protocol.
Initialize heap pointers arch/x86/boot/compressed/misc.c
free_mem_ptr = heap;
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
heap is the second parameter of the extract_kernel function.
arch/x86/boot/compressed/head_64.S
leaq boot_heap(%rip), %rsi
...
.bss
.balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
Call the choose_random_location function
choose_random_location(
(unsigned long)input_data, input_len,
(unsigned long *)&output,
max(output_len, kernel_total_size),
&virt_addr);
defined in arch/x86/boot/compressed/kaslr.c
only randomizes if kASLR is enabled
note: The choose_random_location function chooses a memory location to write the decompressed kernel to. The Linux kernel supports kASLR, which allows decompressing the kernel to a random address for security reasons. If kASLR isn't enabled, we just use the output parameter that we passed into extract_kernel as the start address of the decompressed kernel.
note: After getting the address for the kernel image, we need to check that the random address we got is correctly aligned, and in general, not wrong.
The __decompress function
note: Now, we call the __decompress function to decompress the kernel.
arch/x86/boot/compressed/misc.c
debug_putstr("\nDecompressing Linux... ");
__decompress(input_data, input_len, NULL,
NULL, output, output_len, NULL, error);
The __decompress function depends on what decompression algorithm was chosen during kernel compilation.
The parse_elf function
note: Now that the kernel has been decompressed, two more functions are called: parse_elf and handle_relocations. The main point of these two functions is to move the decompressed kernel image to its correct place in memory, because decompression is done in place and we still need to move the kernel to the correct address.
Call the parse_elf function
note: As we already know, the kernel image is an ELF executable. The main goal of the parse_elf function is to move the loadable segments to the correct address, which is the output address we got from the choose_random_location function.
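To make the segment-moving step concrete, here is a simplified, hypothetical sketch of what parse_elf does, assuming an x86_64 image and a fixed LOAD_PHYSICAL_ADDR; the real function lives in arch/x86/boot/compressed/misc.c and also handles the 32-bit case and error paths.
/* Simplified sketch: walk the ELF program headers of the decompressed image
 * and memmove each PT_LOAD segment into place. */
#include <elf.h>
#include <string.h>

#define LOAD_PHYSICAL_ADDR 0x1000000UL   /* assumption for this sketch */

static void parse_elf_sketch(void *output)
{
    Elf64_Ehdr ehdr;
    Elf64_Phdr *phdrs;

    memcpy(&ehdr, output, sizeof(ehdr));
    if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0)
        return;                                   /* not an ELF image */

    phdrs = (Elf64_Phdr *)((char *)output + ehdr.e_phoff);

    for (int i = 0; i < ehdr.e_phnum; i++) {
        Elf64_Phdr *phdr = &phdrs[i];

        if (phdr->p_type != PT_LOAD)
            continue;

        /* With CONFIG_RELOCATABLE the destination is rebased onto output. */
        char *dest = (char *)output + (phdr->p_paddr - LOAD_PHYSICAL_ADDR);
        memmove(dest, (char *)output + phdr->p_offset, phdr->p_filesz);
    }
}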
The handle_relocations function
note: The next step after the parse_elf function is to call the handle_relocations function. The implementation of this function depends on the CONFIG_X86_NEED_RELOCS kernel configuration option: if it is enabled, the function adjusts addresses in the kernel image. It is also only called if the CONFIG_RANDOMIZE_BASE configuration option was enabled during kernel configuration.
Call the handle_relocations function
only if CONFIG_X86_NEED_RELOCS is enabled
static void handle_relocations(void *output,
unsigned long output_len,
unsigned long virt_addr)
{
...
delta = min_addr - LOAD_PHYSICAL_ADDR
...
for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
...
*(uint32_t *) ptr += delta;
}
...
}
note: This function subtracts the value of LOAD_PHYSICAL_ADDR from the base load address of the kernel, which gives us the difference between where the kernel was linked to load and where it was actually loaded. After this we can relocate the kernel, since we know the actual address where the kernel was loaded, the address it was linked to run at, and the relocation table, which is at the end of the kernel image.
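As a quick illustration of the delta arithmetic (all numbers below are made up for the example, not real boot values):
/* The kernel was linked against LOAD_PHYSICAL_ADDR but KASLR loaded it
 * elsewhere, so every recorded 32-bit relocation gets the difference added. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t load_physical_addr = 0x1000000;  /* link-time load address */
    uint64_t min_addr           = 0x3600000;  /* where KASLR actually put it (assumed) */
    int64_t  delta = (int64_t)(min_addr - load_physical_addr);

    uint32_t reloc_site = 0x1234560;          /* value stored at one reloc site (assumed) */
    reloc_site += (uint32_t)delta;            /* what *(uint32_t *)ptr += delta does */

    printf("delta = %#llx, patched value = %#x\n",
           (unsigned long long)delta, (unsigned)reloc_site);
    return 0;
}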
Return from extract_kernel and jump to the kernel
arch/x86/boot/compressed/misc.c
... void *extract_kernel(...)
{
...
return output;
}
arch/x86/boot/compressed/head_64.S
relocated:
call extract_kernel
...
jmp *%rax
note: The address of the kernel will be in the rax register and we jump to it. That's all. Now we are in the kernel!
What is the purpose of the parse_elf function called during the Linux kernel decompression process?
(A) To parse the compressed kernel image.
(B) To move loadable segments to the correct address.
(C) To handle kernel relocations.
Why randomization?
security reasons: makes exploitation of memory corruption vulnerabilities harder
Enabled by: CONFIG_RANDOMIZE_BASE
note: Although there is a predefined load address determined by a kernel configuration for the entry point of the Linux kernel, this load address can also be configured to be a random value. The reason why we randomize the load address of the kernel is for security purposes. The randomization can help to prevent exploitations of memory corruption vulnerabilities. By doing so, it becomes more difficult for an attacker to predict the memory addresses of specific functions or data. So, if we want to randomize the load address of the kernel, a special kernel configuration option should be enabled, which is shown here.
note: So now we know why the kernel decompressor looks for a random memory range to decompress and load the kernel into. However, before the kernel decompression, the page tables should be initialized.
If the bootloader boots with the 16/32-bit boot protocol
→ Already have page tables
If the kernel decompressor selects a memory range which is valid only in a 64-bit context
→ Build new identity mapped page tables
note: In fact, the page tables were already initialized before the transition to 64-bit mode. So, if the bootloader uses the 16-bit or 32-bit boot protocol, we already have page tables. The reason we have to initialize page tables again here is that the kernel decompressor may select a memory range which is only valid in 64-bit mode. Therefore, we need to build new identity mapped page tables.
Called by extract_kernel
function from arch/x86/boot/compressed/misc.c
void choose_random_location(unsigned long input,
unsigned long input_size,
unsigned long *output,
unsigned long output_size,
unsigned long *virt_addr)
note: The randomization begins with the call to this choose_random_location function. The first parameter, input, is a pointer to the compressed kernel image, and the second parameter is the size of the compressed kernel. The third and fourth parameters are the address of the decompressed kernel image and its length. The last parameter is the virtual address of the kernel load address. In general, this function handles everything related to randomizing the load address: page table initialization, avoiding reserved memory ranges, physical address randomization, and virtual address randomization.
if (cmdline_find_option_bool("nokaslr")) {
warn("KASLR disabled: 'nokaslr' on cmdline.");
return;
}
note: choose_random_location starts by checking the kernel command line option shown here. If nokaslr is set, the function returns and the kernel load address remains unrandomized.
initialize_identity_maps() // Called by choose_random_location function
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *);
void *context;
unsigned long page_flag;
unsigned long offset;
bool direct_gbpages;
unsigned long kernpg_flag;
};
note: After checking the previous option, the first job of the choose_random_location function is to initialize the page tables, and it first calls this initialize_identity_maps function. This function initializes the structure shown here, which provides information about the memory mapping. The first field of this structure, a callback function, checks whether there is enough memory space for a new page and allocates it, while the second field, context, is used to track the allocated page tables. Of the remaining fields, two are page flags, the boolean value indicates whether huge pages are supported, and the offset field is the offset between the kernel's virtual addresses and its physical addresses.
note: After the page tables are initialized, the next step is to choose a random memory location to extract the kernel image to. However, certain memory regions are already in use by other things, such as the kernel command line, and these regions should not be chosen. Therefore, there are mechanisms to avoid them.
mem_avoid_init(input, input_size, *output);
// Unsafe memory regions will be collected in an array
struct mem_vector {
unsigned long long start;
unsigned long long size;
};
static struct mem_vector mem_avoid[MEM_AVOID_MAX];
note: To address this issue, there is a mem_avoid_init function. It is called after the page table initialization, and its main goal is to store information about reserved memory regions, identified by descriptors from an enumeration shown on the next slide. All information about each memory region, such as its start address and size, is stored in an array.
enum mem_avoid_index {
MEM_AVOID_ZO_RANGE = 0,
MEM_AVOID_INITRD,
MEM_AVOID_CMDLINE,
MEM_AVOID_BOOTPARAMS,
MEM_AVOID_MEMMAP_BEGIN,
MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
MEM_AVOID_MAX,
};
void add_identity_map(unsigned long start, unsigned long size)
note: There are many different types of reserved memory regions, but the mem_avoid_init function does the same thing for every element of this enumeration. After storing the information, it also calls the add_identity_map function to build identity mapped pages for each reserved memory region. The parameters of this function are the start address and the size of the memory region, both of which were stored in the array previously.
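For illustration, a small sketch of the kind of overlap test used when a candidate region is checked against mem_avoid[]; the helper names here (mem_overlaps_sketch, candidate_is_safe) are ours, not the kernel's.
/* A candidate region is rejected if it intersects any reserved region. */
#include <stdbool.h>

struct mem_vector {
    unsigned long long start;
    unsigned long long size;
};

#define MEM_AVOID_MAX 4   /* assumption for this sketch */
static struct mem_vector mem_avoid[MEM_AVOID_MAX];

static bool mem_overlaps_sketch(const struct mem_vector *a, const struct mem_vector *b)
{
    /* Two half-open ranges [start, start+size) intersect iff each starts
     * before the other one ends. */
    return a->start < b->start + b->size && b->start < a->start + a->size;
}

static bool candidate_is_safe(const struct mem_vector *candidate)
{
    for (int i = 0; i < MEM_AVOID_MAX; i++)
        if (mem_avoid[i].size && mem_overlaps_sketch(candidate, &mem_avoid[i]))
            return false;
    return true;
}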
min_addr = min(*output, 512UL << 20);
random_addr = find_random_phys_addr(min_addr, output_size);
static unsigned long find_random_phys_addr(unsigned long minimum,
unsigned long image_size)
{
minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
if (process_efi_entries(minimum, image_size))
return slots_fetch_random();
process_e820_entries(minimum, image_size);
return slots_fetch_random();
}
note: find_random_phys_addr aligns the minimum address to CONFIG_PHYSICAL_ALIGN, then walks the EFI memory map (or, if that fails, the e820 entries) to collect suitable slots and finally picks one at random via slots_fetch_random.
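A rough sketch of the slot idea behind slots_fetch_random: every CONFIG_PHYSICAL_ALIGN-aligned position inside a usable region that still leaves room for the image counts as one slot, and one slot is drawn at random. The region, image size, and the use of rand() below are assumptions for the example; the kernel draws randomness via kaslr_get_random_long.
/* Count candidate slots in one usable region and pick one at random. */
#include <stdio.h>
#include <stdlib.h>

#define CONFIG_PHYSICAL_ALIGN 0x200000UL   /* 2 MB, assumed */

static unsigned long slots_in_region(unsigned long region_size,
                                     unsigned long image_size)
{
    if (region_size < image_size)
        return 0;
    return (region_size - image_size) / CONFIG_PHYSICAL_ALIGN + 1;
}

int main(void)
{
    unsigned long start = 0x10000000, size = 0x10000000; /* 256 MB region, assumed */
    unsigned long image = 0x2000000;                     /* 32 MB image, assumed  */

    unsigned long slots = slots_in_region(size, image);
    unsigned long pick  = (unsigned long)rand() % slots;

    printf("%lu slots, picked address %#lx\n", slots,
           start + pick * CONFIG_PHYSICAL_ALIGN);
    return 0;
}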
random_addr = find_random_phys_addr(min_addr, output_size);
if (*output != random_addr) {
add_identity_map(random_addr, output_size);
*output = random_addr;
}
note: After selecting the random physical address, the decompressor first generates identity mapped pages for that region and then updates *output.
if (IS_ENABLED(CONFIG_X86_64))
random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
*virt_addr = random_addr;
note: After randomizing the physical address, we can also randomize the virtual address on the x86_64 architecture; on other configurations, the randomized virtual address simply coincides with the randomized physical address. The find_random_virt_addr function calculates the number of virtual memory ranges that can hold the kernel image and picks one at random. Now, both the base physical address (*output) and the virtual address (*virt_addr) for the decompressed kernel have been randomized.