Group2
head_32.S and head_64.S
head_64.S is relevant for the x86_64 architecture.
vmlinux-objs-y selects the appropriate file (head_32.o or head_64.o) based on $(BITS).
$(BITS) is determined in arch/x86/Makefile based on the kernel configuration (CONFIG_X86_32).

vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
	$(obj)/string.o $(obj)/cmdline.o \
	$(obj)/piggy.o $(obj)/cpuflags.o
.head.text and .code32
The KEEP_SEGMENTS flag in the kernel setup header determines whether the segment registers need to be reloaded.
arch/x86/boot/compressed/head_64.S
movl $boot_stack_end, %eax
addl %ebp, %eax
movl %eax, %esp
ebp: the real address of the startup_32 label
eax: the address of boot_stack_end as it was linked at 0x0
sum of ebp and eax → the real address of boot_stack_end
arch/x86/boot/compressed/head_64.S
.bss
.balign 4
boot_heap:
	.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
	.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
call verify_cpu
testl %eax, %eax
jnz no_longmode

verify_cpu: returns 0 if the CPU supports long mode
no_longmode: halt

movl $0x1,%eax			# Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK0,%edx
xorl $REQUIRED_MASK0,%edx
jnz .Lverify_cpu_no_longmode

movl $0x80000000,%eax		# See if extended cpuid is implemented
cpuid
cmpl $0x80000001,%eax
jb .Lverify_cpu_no_longmode	# no extended cpuid

movl $0x80000001,%eax		# Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK1,%edx
xorl $REQUIRED_MASK1,%edx
jnz .Lverify_cpu_no_longmode

movl $1,%eax
cpuid
andl $SSE_MASK,%edx
cmpl $SSE_MASK,%edx
je .Lverify_cpu_sse_ok
test %di,%di
jz .Lverify_cpu_no_longmode	# only try to force SSE on AMD
movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax			# enable SSE
wrmsr
xor %di,%di			# don't loop
jmp .Lverify_cpu_sse_test	# try again
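The core of the long-mode check can be reproduced from user space with the same CPUID leaves. This is only an illustrative sketch using GCC's <cpuid.h> helper, not the kernel's verify_cpu code, which additionally checks PAE, SSE and the other required feature masks:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Is the extended CPUID leaf 0x80000001 implemented at all? */
	if (!__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) || eax < 0x80000001) {
		puts("no extended cpuid -> no long mode");
		return 1;
	}

	/* CPUID.80000001h:EDX bit 29 is the Long Mode (LM) flag. */
	__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
	printf("long mode %ssupported\n", (edx & (1u << 29)) ? "" : "not ");
	return 0;
}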
CONFIG_PHYSICAL_START: the default base address of the Linux kernel
The value of CONFIG_PHYSICAL_START is 0x1000000 (16 MB)
A rescue kernel for kdump, which is configured to load from a different address → CONFIG_RELOCATABLE=y
This builds a kernel image that retains relocation
information so it can be loaded someplace besides the
default 1MB.
Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the
address it has been loaded at and the compile time physical
address (CONFIG_PHYSICAL_START) is used as the minimum location.
A special section attribute is set in arch/x86/boot/compressed/head_64.S before startup_32

	__HEAD
	.code32
ENTRY(startup_32)

#define __HEAD .section ".head.text","ax"

__HEAD is a macro defined in include/linux/init.h
.head.text is the name of the section; the "ax" flags indicate that the following code contains executable instructions
a: this section is allocatable
x: this section can be executed by the CPU
→ Compile the decompressor as position independent code (PIC)
arch/x86/boot/compressed/Makefile
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
An address is obtained by adding the address field of the instruction to the value of the program counter
→ The relocation address depends on CONFIG_RELOCATABLE
#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
notl %eax
andl %eax, %ebx
cmpl $LOAD_PHYSICAL_ADDR, %ebx
jge 1f
#endif
movl $LOAD_PHYSICAL_ADDR, %ebx
LOAD_PHYSICAL_ADDR macro
#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
			     + (CONFIG_PHYSICAL_ALIGN - 1)) \
			    & ~(CONFIG_PHYSICAL_ALIGN - 1))
1:
	movl BP_init_size(%esi), %eax
	subl $_end, %eax
	addl %eax, %ebx
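In C terms, the computation above looks roughly like the sketch below. kernel_alignment and init_size come from the boot_params setup header, and end_of_decompressor stands for the linker symbol _end; the helper names and the example LOAD_PHYSICAL_ADDR value are illustrative, not kernel code:

#include <stdint.h>

#define LOAD_PHYSICAL_ADDR 0x1000000u   /* example only: 16 MB, a common default */

/* Round addr up to the next multiple of align (align is a power of two). */
static uint32_t align_up(uint32_t addr, uint32_t align)
{
	return (addr + align - 1) & ~(align - 1);
}

static uint32_t relocation_target(uint32_t ebp, uint32_t kernel_alignment,
				  uint32_t init_size, uint32_t end_of_decompressor)
{
	uint32_t ebx = align_up(ebp, kernel_alignment);  /* CONFIG_RELOCATABLE path */

	if (ebx < LOAD_PHYSICAL_ADDR)                    /* never below the minimum */
		ebx = LOAD_PHYSICAL_ADDR;

	return ebx + (init_size - end_of_decompressor);  /* room for in-place decompression */
}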
BP_init_size: the larger of the compressed and uncompressed vmlinux sizes

addl %ebp, gdt+2(%ebp)
lgdt gdt(%ebp)
adjust the base address of the Global Descriptor Table to the address where we actually loaded the kernel
load the Global Descriptor Table with the lgdt instruction
.data
...
gdt:
	.word gdt_end - gdt
	.long gdt
	.word 0
	.quad 0x00cf9a000000ffff	/* __KERNEL32_CS */
	.quad 0x00af9a000000ffff	/* __KERNEL_CS */
	.quad 0x00cf92000000ffff	/* __KERNEL_DS */
	.quad 0x0080890000000000	/* TS descriptor */
	.quad 0x0000000000000000	/* TS continued */
gdt_end:
movl %cr4, %eax
orl $X86_CR4_PAE, %eax
movl %eax, %cr4
→ Put the value of the cr4 register into eax, set the 5th bit (PAE) and load it back into cr4
8 new general purpose registers from r8 to r15
All general purpose registers are 64-bit now
A 64-bit instruction pointer - RIP
64-Bit Addresses and Operands
RIP Relative Addressing
Long mode consists of two sub-modes: 64-bit mode and compatibility mode.
Enable PAE
Build page tables and load the address of the top level page table into the cr3 register
Enable EFER.LME
Enable paging
4-level paging, and we generally build 6 page tables:
One PML4 (Page Map Level 4) table with one entry
One PDP (Page Directory Pointer) table with four entries
Four Page Directory tables with a total of 2048 entries (2048 × 2 MB = 4 GB of identity-mapped memory)
leal pgtable(%ebx), %edi
xorl %eax, %eax
movl $(BOOT_INIT_PGT_SIZE/4), %ecx
rep stosl
pgtable is defined here:
.section ".pgtable","a",@nobits
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
BOOT_PGT_SIZE depends on the CONFIG_X86_VERBOSE_BOOTUP kernel configuration option:
# ifdef CONFIG_RANDOMIZE_BASE
#  ifdef CONFIG_X86_VERBOSE_BOOTUP
#   define BOOT_PGT_SIZE (19*4096)
#  else /* !CONFIG_X86_VERBOSE_BOOTUP */
#   define BOOT_PGT_SIZE (17*4096)
#  endif
# else /* !CONFIG_RANDOMIZE_BASE */
#  define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE
# endif
Build the top level page table - PML4 -
leal pgtable + 0(%ebx), %edi
leal 0x1007 (%edi), %eax
movl %eax, 0(%edi)
Build 4 Page Directory entries in the Page Directory Pointer table:
leal pgtable + 0x1000(%ebx), %edi
leal 0x1007(%edi), %eax
movl $4, %ecx
1: movl %eax, 0x00(%edi)
addl $0x00001000, %eax
addl $8, %edi
decl %ecx
jnz 1b
Build 2048 page table entries with 2-MByte pages:
leal pgtable + 0x2000(%ebx), %edi
movl $0x00000183, %eax
movl $2048, %ecx
1: movl %eax, 0(%edi)
addl $0x00200000, %eax
addl $8, %edi
decl %ecx
jnz 1b
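Put in C, the three loops above fill a 24 KiB pgtable buffer roughly as follows. This is an illustrative sketch that mirrors, rather than reproduces, the assembly, assuming the buffer is page-aligned and later loaded into cr3:

#include <stdint.h>
#include <string.h>

#define PTRS_PER_TABLE 512

/* 6 page tables: 1 PML4 + 1 PDPT + 4 PDs, 4 KiB each (24 KiB total). */
static uint64_t pgtable[6 * PTRS_PER_TABLE] __attribute__((aligned(4096)));

static void build_identity_map(void)
{
	uint64_t base = (uint64_t)(uintptr_t)pgtable;
	uint64_t *pml4 = pgtable;                      /* offset 0x0000 */
	uint64_t *pdpt = pgtable + PTRS_PER_TABLE;     /* offset 0x1000 */
	uint64_t *pd   = pgtable + 2 * PTRS_PER_TABLE; /* offset 0x2000 */
	int i;

	memset(pgtable, 0, sizeof(pgtable));

	/* One PML4 entry pointing at the PDPT (flags 0x7: present, writable, user). */
	pml4[0] = (base + 0x1000) | 0x7;

	/* Four PDPT entries, one per Page Directory. */
	for (i = 0; i < 4; i++)
		pdpt[i] = (base + 0x2000 + i * 0x1000) | 0x7;

	/* 2048 PD entries of 2 MB each: 2048 * 2 MB = 4 GB identity-mapped. */
	for (i = 0; i < 2048; i++)
		pd[i] = ((uint64_t)i * 0x200000) | 0x183;  /* present, writable, 2M page */
}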
Put the address of the top level page table - PML4 - into the cr3 control register:
leal pgtable(%ebx), %eax
movl %eax, %cr3
Set the EFER.LME flag in the MSR at 0xC0000080:
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
wrmsr
Push __KERNEL_CS onto the stack:
pushl $__KERNEL_CS
Put the address of the startup_64 routine in eax:
leal startup_64(%ebp), %eax
Push eax onto the stack and enable paging:
pushl %eax
movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0
Jump to startup_64 with lret:
lret
.code64
.org 0x200
ENTRY(startup_64)
....
....
....
What is the significance of the KEEP_SEGMENTS flag in the Linux boot protocol, and what is its impact on segment register initialization during the boot process?
(a) It determines the boot protocol version
(b) It controls the loading of segment registers during boot
(c) It sets up heap memory allocation
startup_32 → startup_64
arch/x86/boot/compressed/head_64.S
pushl $__KERNEL_CS
leal startup_64(%ebp), %eax
...
...
...
pushl %eax
...
...
...
lret
ebp: the physical address of startup_32.
Why again? (the GDT and segment registers were already set up before entering 64-bit mode).
arch/x86/boot/compressed/head_64.S
.code64
.org 0x200
ENTRY(startup_64)
xorl %eax, %eax
movl %eax, %ds
movl %eax, %es
movl %eax, %ss
movl %eax, %fs
movl %eax, %gs
All segment registers besides the cs
register are now reset in long mode
Why again?
The bootloader may use the 64-bit boot protocol now, and startup_32 is then never executed

#ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip), %rbp
movl BP_kernel_alignment(%rsi), %eax
decl %eax
addq %rax, %rbp
notq %rax
andq %rax, %rbp
cmpq $LOAD_PHYSICAL_ADDR, %rbp
jge 1f
#endif
movq $LOAD_PHYSICAL_ADDR, %rbp
1:
movl BP_init_size(%rsi), %ebx
subl $_end, %ebx
addq %rbp, %rbx
rsi: pointer to the boot_params table
rbp: the decompressed kernel's start address
rbx: the address where the kernel code will be relocated to for decompression
sp, flags register and GDT
Set up the stack pointer
leaq boot_stack_end(%rbx), %rsp
...
.bss
.balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
Set up the Global Descriptor Table
leaq gdt(%rip), %rax
movq %rax, gdt64+2(%rip)
lgdt gdt64(%rip)
...
.data
gdt64:
.word gdt_end - gdt
.long 0
.word 0
.quad 0
gdt:
.word gdt_end - gdt
.long gdt
.word 0
.quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
.quad 0x0000000000000000 /* TS continued */
gdt_end:
Zero EFLAGS
pushq $0
popfq
Copy the compressed kernel to the end of our buffer where decompression in place becomes safe
pushq %rsi
leaq (_bss-8)(%rip), %rsi
leaq (_bss-8)(%rbx), %rdi
movq $_bss, %rcx
shrq $3, %rcx
std
rep movsq
cld
popq %rsi
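The std / rep movsq sequence is simply a backwards memory copy: starting at the last qword and walking toward the first is what makes an in-place move to an overlapping, higher destination safe. A small C illustration of the idea (not the kernel's code):

#include <stddef.h>
#include <stdint.h>

/* Copy n 8-byte words from src to dst, walking backwards.
 * Safe even when dst overlaps src, as long as dst >= src. */
static void copy_backwards(uint64_t *dst, const uint64_t *src, size_t n)
{
	while (n--)
		dst[n] = src[n];   /* last word first, like std + rep movsq */
}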
.text section
Jump to the relocated address
leaq relocated(%rbx), %rax
jmp *%rax
...
.text
relocated:
/* decompress the kernel */
.bss section
Clear BSS (stack is currently empty)
arch/x86/boot/compressed/head_64.S
.text
relocated:
xorl %eax, %eax
leaq _bss(%rip), %rdi
leaq _ebss(%rip), %rcx
subq %rdi, %rcx
shrq $3, %rcx
rep stosq
Call the extract_kernel function
pushq %rsi
movq %rsi, %rdi
leaq boot_heap(%rip), %rsi
leaq input_data(%rip), %rdx
movl $z_input_len, %ecx
movq %rbp, %r8
movq $z_output_len, %r9
call extract_kernel
popq %rsi
arch/x86/boot/compressed/misc.c
asmlinkage __visible void *extract_kernel(
void *rmode, memptr heap,
unsigned char *input_data,
unsigned long input_len,
unsigned char *output,
unsigned long output_len)
Arguments of extract_kernel
rmode: a pointer to the boot_params structure
heap: a pointer to boot_heap
input_data*: a pointer to the start of the compressed kernel
input_len*: the size of the compressed kernel
output: the start address of the decompressed kernel
output_len*: the size of the decompressed kernel
* generated by arch/x86/boot/compressed/mkpiggy.c
Initialize heap pointers
arch/x86/boot/compressed/misc.c
free_mem_ptr = heap;
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
heap
is the second parameter of the extract_kernel
function.
arch/x86/boot/compressed/head_64.S
leaq boot_heap(%rip), %rsi
...
.bss
.balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
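free_mem_ptr and free_mem_end_ptr simply delimit the region the decompression code may hand out memory from; conceptually it is a bump allocator with no free(). A minimal sketch of that idea (the decompressor's real allocator works on these same pointers but differs in detail):

#include <stddef.h>

static unsigned long free_mem_ptr;      /* set to boot_heap by extract_kernel */
static unsigned long free_mem_end_ptr;  /* boot_heap + BOOT_HEAP_SIZE */

/* Hand out size bytes from the boot heap. */
static void *bump_alloc(size_t size)
{
	unsigned long p = (free_mem_ptr + 3) & ~3UL;  /* 4-byte align */

	if (p + size > free_mem_end_ptr)
		return NULL;                          /* out of boot heap */

	free_mem_ptr = p + size;
	return (void *)p;
}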
Call the choose_random_location function
choose_random_location(
(unsigned long)input_data, input_len,
(unsigned long *)&output,
max(output_len, kernel_total_size),
&virt_addr);
defined in arch/x86/boot/compressed/kaslr.c
Randomization happens only if kASLR is enabled
Call the __decompress function
arch/x86/boot/compressed/misc.c
debug_putstr("\nDecompressing Linux... ");
__decompress(input_data, input_len, NULL,
NULL, output, output_len, NULL, error);
The __decompress function depends on what decompression algorithm was chosen during kernel compilation
parse_elf function
Call the parse_elf function
It moves the kernel to the output address we got from the choose_random_location function
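In outline, parse_elf validates the ELF header of the just-decompressed image and then moves each loadable segment to its place relative to the (possibly randomized) output address. A simplified sketch, 64-bit only, with error handling, the non-relocatable case and the example constants being assumptions rather than kernel code:

#include <elf.h>
#include <string.h>

#define LOAD_PHYSICAL_ADDR 0x1000000UL  /* example only; the real value is a config option */
#define MAX_PHDRS 64                    /* illustrative bound for the sketch */

static void parse_elf_sketch(unsigned char *output)
{
	Elf64_Ehdr ehdr;
	Elf64_Phdr phdrs[MAX_PHDRS];
	int i;

	memcpy(&ehdr, output, sizeof(ehdr));
	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || ehdr.e_phnum > MAX_PHDRS)
		return;                          /* not an ELF image we can handle */

	/* Copy the program headers first: the memmoves below may overwrite them. */
	memcpy(phdrs, output + ehdr.e_phoff, ehdr.e_phnum * sizeof(Elf64_Phdr));

	for (i = 0; i < ehdr.e_phnum; i++) {
		if (phdrs[i].p_type != PT_LOAD)
			continue;

		/* Place the segment at its paddr, rebased onto output. */
		memmove(output + (phdrs[i].p_paddr - LOAD_PHYSICAL_ADDR),
			output + phdrs[i].p_offset,
			phdrs[i].p_filesz);
	}
}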
handle_relocations function
Call the handle_relocations function
It only does something if CONFIG_X86_NEED_RELOCS is enabled
static void handle_relocations(void *output,
unsigned long output_len,
unsigned long virt_addr)
{
...
delta = min_addr - LOAD_PHYSICAL_ADDR
...
for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
...
*(uint32_t *) ptr += delta;
}
...
}
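Conceptually, the build appends a 0-terminated table of link-time addresses to the kernel image, and each referenced 32-bit slot is patched by delta, the difference between the address the kernel was linked for and where it actually ended up. The sketch below only illustrates that idea; the names and the address translation are assumptions, not the kernel's handle_relocations:

#include <stdint.h>

static void apply_relocations_sketch(const int32_t *table, uint8_t *image,
				     uint32_t link_base, uint32_t delta)
{
	/* table points at the last entry of the 0-terminated relocation table,
	 * which lives at the end of the image; walk it backwards. */
	for (const int32_t *reloc = table; *reloc; reloc--) {
		/* Translate the recorded link-time address into the loaded image. */
		uint32_t *slot = (uint32_t *)(image + ((uint32_t)*reloc - link_base));
		*slot += delta;
	}
}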
Return from extract_kernel and jump to the kernel
arch/x86/boot/compressed/misc.c
...
void *extract_kernel(...)
{
...
return output;
}
arch/x86/boot/compressed/head_64.S
relocated:
call extract_kernel
...
jmp *%rax
Why is the parse_elf function called during the Linux kernel decompression process?
Why randomization?
Security reasons - to make exploitation of memory corruption vulnerabilities harder
Enabled by:
CONFIG_RANDOMIZE_BASE
If the bootloader booted with the 16/32-bit boot protocol
→ We already have page tables
If the kernel decompressor selects a memory range which is valid only in a 64-bit context
→ Build new identity mapped page tables
Called by extract_kernel
function from arch/x86/boot/compressed/misc.c
void choose_random_location(unsigned long input,
			    unsigned long input_size,
			    unsigned long *output,
			    unsigned long output_size,
			    unsigned long *virt_addr)
if (cmdline_find_option_bool("nokaslr")) {
	warn("KASLR disabled: 'nokaslr' on cmdline.");
	return;
}
initialize_identity_maps()  // Called by the choose_random_location function

struct x86_mapping_info {
	void *(*alloc_pgt_page)(void *);
	void *context;
	unsigned long page_flag;
	unsigned long offset;
	bool direct_gbpages;
	unsigned long kernpg_flag;
};
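The alloc_pgt_page callback is how the generic identity-mapping code asks for a fresh, zeroed page-table page. A hedged sketch of such a callback handing out pages from a static pool; the kernel's version keeps its pool in a small structure reached through the context pointer, and all names below are illustrative:

#include <stddef.h>
#include <string.h>

#define PAGE_SIZE 4096
#define PGT_POOL_PAGES 32

/* Static pool the callback carves page-table pages out of. */
static unsigned char pgt_pool[PGT_POOL_PAGES * PAGE_SIZE]
	__attribute__((aligned(PAGE_SIZE)));
static size_t pgt_pool_used;

/* Matches the alloc_pgt_page slot in struct x86_mapping_info:
 * return one zeroed, page-aligned page or NULL when the pool is empty. */
static void *alloc_pgt_page_sketch(void *context)
{
	void *page;

	(void)context;  /* a real implementation would keep the pool here */

	if (pgt_pool_used >= PGT_POOL_PAGES)
		return NULL;

	page = pgt_pool + pgt_pool_used * PAGE_SIZE;
	pgt_pool_used++;
	memset(page, 0, PAGE_SIZE);
	return page;
}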
mem_avoid_init(input, input_size, *output);
// Unsafe memory regions will be collected in an array
struct mem_vector {
	unsigned long long start;
	unsigned long long size;
};

static struct mem_vector mem_avoid[MEM_AVOID_MAX];
enum mem_avoid_index {
	MEM_AVOID_ZO_RANGE = 0,
	MEM_AVOID_INITRD,
	MEM_AVOID_CMDLINE,
	MEM_AVOID_BOOTPARAMS,
	MEM_AVOID_MEMMAP_BEGIN,
	MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
	MEM_AVOID_MAX,
};
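The mem_avoid[] entries are later used to reject candidate load addresses that would overlap something important: the decompressor itself, the initrd, the command line, boot_params, and user-supplied memmap regions. The test itself is plain interval arithmetic; a sketch that mirrors the check the kernel performs:

#include <stdbool.h>

struct mem_vector {
	unsigned long long start;
	unsigned long long size;
};

/* Two half-open ranges [start, start + size) overlap unless one ends
 * before the other begins. */
static bool ranges_overlap(const struct mem_vector *a, const struct mem_vector *b)
{
	if (a->start + a->size <= b->start)
		return false;
	if (b->start + b->size <= a->start)
		return false;
	return true;
}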
void add_identity_map(unsigned long start, unsigned long size)
min_addr = min(*output, 512UL << 20);
random_addr = find_random_phys_addr(min_addr, output_size);
static unsigned long find_random_phys_addr(unsigned long minimum,
					   unsigned long image_size)
{
	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);

	if (process_efi_entries(minimum, image_size))
		return slots_fetch_random();

	process_e820_entries(minimum, image_size);
	return slots_fetch_random();
}
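Each suitable memory region is cut into CONFIG_PHYSICAL_ALIGN-sized slots large enough to hold the image, and one slot is then drawn at random (the kernel's kaslr_get_random_long mixes sources such as RDRAND, RDTSC and the i8254 timer). A simplified sketch of the slot arithmetic for a single region; the real code accumulates slots across all e820/EFI regions before drawing the random number:

/* How many aligned candidate positions for an image of image_size bytes
 * fit in [region_start, region_start + region_size)? */
static unsigned long count_slots(unsigned long region_start,
				 unsigned long region_size,
				 unsigned long image_size,
				 unsigned long align)
{
	unsigned long first = (region_start + align - 1) & ~(align - 1);
	unsigned long last_ok;

	if (region_size < image_size ||
	    first + image_size > region_start + region_size)
		return 0;

	last_ok = region_start + region_size - image_size;
	return (last_ok - first) / align + 1;
}

/* Address of the n-th slot (n comes from the random source). */
static unsigned long slot_address(unsigned long region_start,
				  unsigned long align, unsigned long n)
{
	unsigned long first = (region_start + align - 1) & ~(align - 1);

	return first + n * align;
}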
random_addr = find_random_phys_addr(min_addr, output_size);

if (*output != random_addr) {
	add_identity_map(random_addr, output_size);
	*output = random_addr;
}
if (IS_ENABLED(CONFIG_X86_64))
	random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
*virt_addr = random_addr;