Hi Kirill, Something is wrong in this patch. We regularly run CRIU tests on linux-next, and yesterday I found that a kernel didn't boot. We run these tests in Travis-CI, and we don't have access to kernel logs. I tried to reproduce the problem locally, but I failed.
In Travis-CI, we build the kernel, then dump the Travis daemon, boot the kernel with the help of kexec, and restore the Travis daemon. Here are the logs without this patch: https://travis-ci.org/avagin/linux/jobs/340820418 Here are the logs with this patch: https://travis-ci.org/avagin/linux/jobs/340820584 Thanks, Andrei On Sun, Feb 11, 2018 at 04:20:04AM -0800, tip-bot for Jacob Shin wrote: > Commit-ID: b4b56015ed1c98cbc9469e35ebbc4373a2844030 > Gitweb: > https://git.kernel.org/tip/b4b56015ed1c98cbc9469e35ebbc4373a2844030 > Author: Kirill A. Shutemov <kirill.shute...@linux.intel.com> > AuthorDate: Fri, 9 Feb 2018 17:22:28 +0300 > Committer: Ingo Molnar <mi...@kernel.org> > CommitDate: Sun, 11 Feb 2018 12:36:19 +0100 > > x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G > > This patch addresses a shortcoming in current boot process on machines > that supports 5-level paging. > > If a bootloader enables 64-bit mode with 4-level paging, we might need to > switch over to 5-level paging. The switching requires the disabling > paging. It works fine if kernel itself is loaded below 4G. > > But if the bootloader put the kernel above 4G (not sure if anybody does > this), we would lose control as soon as paging is disabled, because the > code becomes unreachable to the CPU. > > This patch implements a trampoline in lower memory to handle this > situation. > > We only need the memory for a very short time, until the main kernel > image sets up own page tables. > > We go through the trampoline even if we don't have to: if we're already > in 5-level paging mode or if we don't need to switch to it. This way the > trampoline gets tested on every boot. > > Signed-off-by: Kirill A. 
Shutemov <kirill.shute...@linux.intel.com> > Cc: Andy Lutomirski <l...@amacapital.net> > Cc: Borislav Petkov <b...@suse.de> > Cc: Cyrill Gorcunov <gorcu...@openvz.org> > Cc: Linus Torvalds <torva...@linux-foundation.org> > Cc: Matthew Wilcox <wi...@infradead.org> > Cc: Peter Zijlstra <pet...@infradead.org> > Cc: Thomas Gleixner <t...@linutronix.de> > Cc: linux...@kvack.org > Link: > http://lkml.kernel.org/r/20180209142228.21231-5-kirill.shute...@linux.intel.com > Signed-off-by: Ingo Molnar <mi...@kernel.org> > --- > arch/x86/boot/compressed/head_64.S | 127 > ++++++++++++++++++++++++++----------- > 1 file changed, 89 insertions(+), 38 deletions(-) > > diff --git a/arch/x86/boot/compressed/head_64.S > b/arch/x86/boot/compressed/head_64.S > index af9ffbd..70b30f2 100644 > --- a/arch/x86/boot/compressed/head_64.S > +++ b/arch/x86/boot/compressed/head_64.S > @@ -307,13 +307,34 @@ ENTRY(startup_64) > > /* > * At this point we are in long mode with 4-level paging enabled, > - * but we want to enable 5-level paging. > + * but we might want to enable 5-level paging or vice versa. > * > - * The problem is that we cannot do it directly. Setting LA57 in > - * long mode would trigger #GP. So we need to switch off long mode > - * first. > + * The problem is that we cannot do it directly. Setting or clearing > + * CR4.LA57 in long mode would trigger #GP. So we need to switch off > + * long mode and paging first. > + * > + * We also need a trampoline in lower memory to switch over from > + * 4- to 5-level paging for cases when the bootloader puts the kernel > + * above 4G, but didn't enable 5-level paging for us. > + * > + * The same trampoline can be used to switch from 5- to 4-level paging > + * mode, like when starting 4-level paging kernel via kexec() when > + * original kernel worked in 5-level paging mode. > + * > + * For the trampoline, we need the top page table to reside in lower > + * memory as we don't have a way to load 64-bit values into CR3 in > + * 32-bit mode. 
> + * > + * We go though the trampoline even if we don't have to: if we're > + * already in a desired paging mode. This way the trampoline code gets > + * tested on every boot. > */ > > + /* Make sure we have GDT with 32-bit code segment */ > + leaq gdt(%rip), %rax > + movl %eax, gdt64+2(%rip) > + lgdt gdt64(%rip) > + > /* > * paging_prepare() sets up the trampoline and checks if we need to > * enable 5-level paging. > @@ -331,30 +352,20 @@ ENTRY(startup_64) > /* Save the trampoline address in RCX */ > movq %rax, %rcx > > - /* Check if we need to enable 5-level paging */ > - cmpq $0, %rdx > - jz lvl5 > - > - /* Clear additional page table */ > - leaq lvl5_pgtable(%rbx), %rdi > - xorq %rax, %rax > - movq $(PAGE_SIZE/8), %rcx > - rep stosq > - > /* > - * Setup current CR3 as the first and only entry in a new top level > - * page table. > + * Load the address of trampoline_return() into RDI. > + * It will be used by the trampoline to return to the main code. > */ > - movq %cr3, %rdi > - leaq 0x7 (%rdi), %rax > - movq %rax, lvl5_pgtable(%rbx) > + leaq trampoline_return(%rip), %rdi > > /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ > pushq $__KERNEL32_CS > - leaq compatible_mode(%rip), %rax > + leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax > pushq %rax > lretq > -lvl5: > +trampoline_return: > + /* Restore the stack, the 32-bit trampoline uses its own stack */ > + leaq boot_stack_end(%rbx), %rsp > > /* > * cleanup_trampoline() would restore trampoline memory. > @@ -503,45 +514,82 @@ relocated: > jmp *%rax > > .code32 > +/* > + * This is the 32-bit trampoline that will be copied over to low memory. > + * > + * RDI contains the return address (might be above 4G). > + * ECX contains the base address of the trampoline memory. > + * Non zero RDX on return means we need to enable 5-level paging. 
> + */ > ENTRY(trampoline_32bit_src) > -compatible_mode: > /* Set up data and stack segments */ > movl $__KERNEL_DS, %eax > movl %eax, %ds > movl %eax, %ss > > + /* Setup new stack */ > + leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp > + > /* Disable paging */ > movl %cr0, %eax > btrl $X86_CR0_PG_BIT, %eax > movl %eax, %cr0 > > - /* Point CR3 to 5-level paging */ > - leal lvl5_pgtable(%ebx), %eax > - movl %eax, %cr3 > + /* Check what paging mode we want to be in after the trampoline */ > + cmpl $0, %edx > + jz 1f > > - /* Enable PAE and LA57 mode */ > + /* We want 5-level paging: don't touch CR3 if it already points to > 5-level page tables */ > movl %cr4, %eax > - orl $(X86_CR4_PAE | X86_CR4_LA57), %eax > + testl $X86_CR4_LA57, %eax > + jnz 3f > + jmp 2f > +1: > + /* We want 4-level paging: don't touch CR3 if it already points to > 4-level page tables */ > + movl %cr4, %eax > + testl $X86_CR4_LA57, %eax > + jz 3f > +2: > + /* Point CR3 to the trampoline's new top level page table */ > + leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax > + movl %eax, %cr3 > +3: > + /* Enable PAE and LA57 (if required) paging modes */ > + movl $X86_CR4_PAE, %eax > + cmpl $0, %edx > + jz 1f > + orl $X86_CR4_LA57, %eax > +1: > movl %eax, %cr4 > > - /* Calculate address we are running at */ > - call 1f > -1: popl %edi > - subl $1b, %edi > + /* Calculate address of paging_enabled() once we are executing in the > trampoline */ > + leal paging_enabled - trampoline_32bit_src + > TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax > > - /* Prepare stack for far return to Long Mode */ > + /* Prepare the stack for far return to Long Mode */ > pushl $__KERNEL_CS > - leal lvl5(%edi), %eax > - push %eax > + pushl %eax > > - /* Enable paging back */ > + /* Enable paging again */ > movl $(X86_CR0_PG | X86_CR0_PE), %eax > movl %eax, %cr0 > > lret > > + .code64 > +paging_enabled: > + /* Return from the trampoline */ > + jmp *%rdi > + > + /* > + * The trampoline code has a size limit. 
> + * Make sure we fail to compile if the trampoline code grows > + * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. > + */ > + .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE > + > + .code32 > no_longmode: > - /* This isn't an x86-64 CPU so hang */ > + /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue > */ > 1: > hlt > jmp 1b > @@ -549,6 +597,11 @@ no_longmode: > #include "../../kernel/verify_cpu.S" > > .data > +gdt64: > + .word gdt_end - gdt > + .long 0 > + .word 0 > + .quad 0 > gdt: > .word gdt_end - gdt > .long gdt > @@ -602,8 +655,6 @@ trampoline_save: > .balign 4096 > pgtable: > .fill BOOT_PGT_SIZE, 1, 0 > -lvl5_pgtable: > - .fill PAGE_SIZE, 1, 0 > > .global pgtable_trampoline > pgtable_trampoline: