commit:     48d5e762de93d7df88c6f216d9947f5ac8a8c9a0
Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Fri Aug 8 17:22:57 2014 +0000
Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Fri Aug 8 17:22:57 2014 +0000
URL:        http://git.overlays.gentoo.org/gitweb/?p=proj/linux-patches.git;a=commit;h=48d5e762
Linux patch 3.4.102

---
 0000_README              |    4 +
 1101_linux-3.4.102.patch | 1221 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1225 insertions(+)

diff --git a/0000_README b/0000_README
index 0f65113..4747723 100644
--- a/0000_README
+++ b/0000_README
@@ -443,6 +443,10 @@ Patch: 1100_linux-3.4.101.patch
 From: http://www.kernel.org
 Desc: Linux 3.4.101
 
+Patch: 1101_linux-3.4.102.patch
+From: http://www.kernel.org
+Desc: Linux 3.4.102
+
 Patch: 1500_XATTR_USER_PREFIX.patch
 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc: Support for namespace user.pax.* on tmpfs.
diff --git a/1101_linux-3.4.102.patch b/1101_linux-3.4.102.patch
new file mode 100644
index 0000000..c444b8d
--- /dev/null
+++ b/1101_linux-3.4.102.patch
@@ -0,0 +1,1221 @@
+diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
+index d6498e3cd713..f33a9369e35b 100644
+--- a/Documentation/x86/x86_64/mm.txt
++++ b/Documentation/x86/x86_64/mm.txt
+@@ -12,6 +12,8 @@ ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
+ ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
+ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
+ ... unused hole ...
++ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
++... unused hole ...
+ ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
+ ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
+ 
+diff --git a/Makefile b/Makefile
+index a22bcb567348..dd03fa5777a0 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,6 +1,6 @@
+ VERSION = 3
+ PATCHLEVEL = 4
+-SUBLEVEL = 101
++SUBLEVEL = 102
+ EXTRAVERSION =
+ NAME = Saber-toothed Squirrel
+ 
+diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
+index ab88ed4f8e08..ef8f2df02540 100644
+--- a/arch/arm/mm/idmap.c
++++ b/arch/arm/mm/idmap.c
+@@ -22,6 +22,13 @@ static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
+ 			pr_warning("Failed to allocate identity pmd.\n");
+ 			return;
+ 		}
++		/*
++		 * Copy the original PMD to ensure that the PMD entries for
++		 * the kernel image are preserved.
++		 */
++		if (!pud_none(*pud))
++			memcpy(pmd, pmd_offset(pud, 0),
++			       PTRS_PER_PMD * sizeof(pmd_t));
+ 		pud_populate(&init_mm, pud, pmd);
+ 		pmd += pmd_index(addr);
+ 	} else
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index 9df4ea1caaf1..917c1098775b 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -915,10 +915,27 @@ config VM86
+ 	default y
+ 	depends on X86_32
+ 	---help---
+-	  This option is required by programs like DOSEMU to run 16-bit legacy
+-	  code on X86 processors. It also may be needed by software like
+-	  XFree86 to initialize some video cards via BIOS. Disabling this
+-	  option saves about 6k.
++	  This option is required by programs like DOSEMU to run
++	  16-bit real mode legacy code on x86 processors. It also may
++	  be needed by software like XFree86 to initialize some video
++	  cards via BIOS. Disabling this option saves about 6K.
++
++config X86_16BIT
++	bool "Enable support for 16-bit segments" if EXPERT
++	default y
++	---help---
++	  This option is required by programs like Wine to run 16-bit
++	  protected mode legacy code on x86 processors. Disabling
++	  this option saves about 300 bytes on i386, or around 6K text
++	  plus 16K runtime memory on x86-64,
++
++config X86_ESPFIX32
++	def_bool y
++	depends on X86_16BIT && X86_32
++
++config X86_ESPFIX64
++	def_bool y
++	depends on X86_16BIT && X86_64
+ 
+ config TOSHIBA
+ 	tristate "Toshiba Laptop support"
+diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
+new file mode 100644
+index 000000000000..99efebb2f69d
+--- /dev/null
++++ b/arch/x86/include/asm/espfix.h
+@@ -0,0 +1,16 @@
++#ifndef _ASM_X86_ESPFIX_H
++#define _ASM_X86_ESPFIX_H
++
++#ifdef CONFIG_X86_64
++
++#include <asm/percpu.h>
++
++DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
++DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
++
++extern void init_espfix_bsp(void);
++extern void init_espfix_ap(void);
++
++#endif /* CONFIG_X86_64 */
++
++#endif /* _ASM_X86_ESPFIX_H */
+diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
+index bba3cf88e624..0a8b519226b8 100644
+--- a/arch/x86/include/asm/irqflags.h
++++ b/arch/x86/include/asm/irqflags.h
+@@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void)
+ 
+ #define PARAVIRT_ADJUST_EXCEPTION_FRAME	/*  */
+ 
+-#define INTERRUPT_RETURN	iretq
++#define INTERRUPT_RETURN	jmp native_iret
+ #define USERGS_SYSRET64				\
+ 	swapgs;					\
+ 	sysretq;
+diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
+index 766ea16fbbbd..51817fae7047 100644
+--- a/arch/x86/include/asm/pgtable_64_types.h
++++ b/arch/x86/include/asm/pgtable_64_types.h
+@@ -59,5 +59,7 @@ typedef struct { pteval_t pte; } pte_t;
+ #define MODULES_VADDR	 _AC(0xffffffffa0000000, UL)
+ #define MODULES_END	 _AC(0xffffffffff000000, UL)
+ #define MODULES_LEN	 (MODULES_END - MODULES_VADDR)
++#define ESPFIX_PGD_ENTRY _AC(-2, UL)
++#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
+ 
+ #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
+diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
+index d0f19f9fb846..16c7971457f8 100644
+--- a/arch/x86/include/asm/setup.h
++++ b/arch/x86/include/asm/setup.h
+@@ -61,6 +61,8 @@ static inline void x86_ce4100_early_setup(void) { }
+ 
+ #ifndef _SETUP
+ 
++#include <asm/espfix.h>
++
+ /*
+  * This is set up by the setup-routine at boot-time
+  */
+diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
+index 532d2e090e6f..9f15a46e9ccb 100644
+--- a/arch/x86/kernel/Makefile
++++ b/arch/x86/kernel/Makefile
+@@ -28,6 +28,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
+ obj-y			+= syscall_$(BITS).o
+ obj-$(CONFIG_X86_64)	+= vsyscall_64.o
+ obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
++obj-$(CONFIG_X86_ESPFIX64)	+= espfix_64.o
+ obj-y			+= bootflag.o e820.o
+ obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
+ obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
+diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
+index e36c5cf38fde..a0751249d3c1 100644
+--- a/arch/x86/kernel/entry_32.S
++++ b/arch/x86/kernel/entry_32.S
+@@ -525,6 +525,7 @@ syscall_exit:
+ restore_all:
+ 	TRACE_IRQS_IRET
+ restore_all_notrace:
++#ifdef CONFIG_X86_ESPFIX32
+ 	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS
+ 	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ 	# are returning to the kernel.
+@@ -535,6 +536,7 @@ restore_all_notrace:
+ 	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ 	CFI_REMEMBER_STATE
+ 	je ldt_ss			# returning to user-space with LDT SS
++#endif
+ restore_nocheck:
+ 	RESTORE_REGS 4			# skip orig_eax/error_code
+ irq_return:
+@@ -550,6 +552,7 @@ ENTRY(iret_exc)
+ 	.long irq_return,iret_exc
+ .previous
+ 
++#ifdef CONFIG_X86_ESPFIX32
+ 	CFI_RESTORE_STATE
+ ldt_ss:
+ #ifdef CONFIG_PARAVIRT
+@@ -593,6 +596,7 @@ ldt_ss:
+ 	lss (%esp), %esp		/* switch to espfix segment */
+ 	CFI_ADJUST_CFA_OFFSET -8
+ 	jmp restore_nocheck
++#endif
+ 	CFI_ENDPROC
+ ENDPROC(system_call)
+ 
+@@ -766,6 +770,7 @@ ENDPROC(ptregs_clone)
+  * the high word of the segment base from the GDT and swiches to the
+  * normal stack and adjusts ESP with the matching offset.
+  */
++#ifdef CONFIG_X86_ESPFIX32
+ 	/* fixup the stack */
+ 	mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+ 	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+@@ -775,8 +780,10 @@ ENDPROC(ptregs_clone)
+ 	pushl_cfi %eax
+ 	lss (%esp), %esp		/* switch to the normal stack segment */
+ 	CFI_ADJUST_CFA_OFFSET -8
++#endif
+ .endm
+ .macro UNWIND_ESPFIX_STACK
++#ifdef CONFIG_X86_ESPFIX32
+ 	movl %ss, %eax
+ 	/* see if on espfix stack */
+ 	cmpw $__ESPFIX_SS, %ax
+@@ -787,6 +794,7 @@ ENDPROC(ptregs_clone)
+ 	/* switch to normal stack */
+ 	FIXUP_ESPFIX_STACK
+ 27:
++#endif
+ .endm
+ 
+ /*
+@@ -1318,11 +1326,13 @@ END(debug)
+  */
+ ENTRY(nmi)
+ 	RING0_INT_FRAME
++#ifdef CONFIG_X86_ESPFIX32
+ 	pushl_cfi %eax
+ 	movl %ss, %eax
+ 	cmpw $__ESPFIX_SS, %ax
+ 	popl_cfi %eax
+ 	je nmi_espfix_stack
++#endif
+ 	cmpl $ia32_sysenter_target,(%esp)
+ 	je nmi_stack_fixup
+ 	pushl_cfi %eax
+@@ -1362,6 +1372,7 @@ nmi_debug_stack_check:
+ 	FIX_STACK 24, nmi_stack_correct, 1
+ 	jmp nmi_stack_correct
+ 
++#ifdef CONFIG_X86_ESPFIX32
+ nmi_espfix_stack:
+ 	/* We have a RING0_INT_FRAME here.
+ 	 *
+@@ -1383,6 +1394,7 @@ nmi_espfix_stack:
+ 	lss 12+4(%esp), %esp		# back to espfix stack
+ 	CFI_ADJUST_CFA_OFFSET -24
+ 	jmp irq_return
++#endif
+ 	CFI_ENDPROC
+ END(nmi)
+ 
+diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
+index bd6f59203b5f..42b055e24691 100644
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -55,6 +55,7 @@
+ #include <asm/paravirt.h>
+ #include <asm/ftrace.h>
+ #include <asm/percpu.h>
++#include <asm/pgtable_types.h>
+ #include <linux/err.h>
+ 
+ /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+@@ -901,17 +902,47 @@ restore_args:
+ irq_return:
+ 	INTERRUPT_RETURN
+ 
+-	.section __ex_table, "a"
+-	.quad irq_return, bad_iret
+-	.previous
+-
+-#ifdef CONFIG_PARAVIRT
+ ENTRY(native_iret)
++	/*
++	 * Are we returning to a stack segment from the LDT? Note: in
++	 * 64-bit mode SS:RSP on the exception stack is always valid.
++	 */
++#ifdef CONFIG_X86_ESPFIX64
++	testb $4,(SS-RIP)(%rsp)
++	jnz native_irq_return_ldt
++#endif
++
++native_irq_return_iret:
+ 	iretq
+ 
+ 	.section __ex_table,"a"
+-	.quad native_iret, bad_iret
++	.quad native_irq_return_iret, bad_iret
+ 	.previous
++
++#ifdef CONFIG_X86_ESPFIX64
++native_irq_return_ldt:
++	pushq_cfi %rax
++	pushq_cfi %rdi
++	SWAPGS
++	movq PER_CPU_VAR(espfix_waddr),%rdi
++	movq %rax,(0*8)(%rdi)	/* RAX */
++	movq (2*8)(%rsp),%rax	/* RIP */
++	movq %rax,(1*8)(%rdi)
++	movq (3*8)(%rsp),%rax	/* CS */
++	movq %rax,(2*8)(%rdi)
++	movq (4*8)(%rsp),%rax	/* RFLAGS */
++	movq %rax,(3*8)(%rdi)
++	movq (6*8)(%rsp),%rax	/* SS */
++	movq %rax,(5*8)(%rdi)
++	movq (5*8)(%rsp),%rax	/* RSP */
++	movq %rax,(4*8)(%rdi)
++	andl $0xffff0000,%eax
++	popq_cfi %rdi
++	orq PER_CPU_VAR(espfix_stack),%rax
++	SWAPGS
++	movq %rax,%rsp
++	popq_cfi %rax
++	jmp native_irq_return_iret
+ #endif
+ 
+ 	.section .fixup,"ax"
+@@ -977,9 +1008,40 @@ ENTRY(retint_kernel)
+ 	call preempt_schedule_irq
+ 	jmp exit_intr
+ #endif
+-
+ 	CFI_ENDPROC
+ END(common_interrupt)
++
++	/*
++	 * If IRET takes a fault on the espfix stack, then we
++	 * end up promoting it to a doublefault. In that case,
++	 * modify the stack to make it look like we just entered
++	 * the #GP handler from user space, similar to bad_iret.
++	 */
++#ifdef CONFIG_X86_ESPFIX64
++	ALIGN
++__do_double_fault:
++	XCPT_FRAME 1 RDI+8
++	movq RSP(%rdi),%rax		/* Trap on the espfix stack? */
++	sarq $PGDIR_SHIFT,%rax
++	cmpl $ESPFIX_PGD_ENTRY,%eax
++	jne do_double_fault		/* No, just deliver the fault */
++	cmpl $__KERNEL_CS,CS(%rdi)
++	jne do_double_fault
++	movq RIP(%rdi),%rax
++	cmpq $native_irq_return_iret,%rax
++	jne do_double_fault		/* This shouldn't happen... */
++	movq PER_CPU_VAR(kernel_stack),%rax
++	subq $(6*8-KERNEL_STACK_OFFSET),%rax	/* Reset to original stack */
++	movq %rax,RSP(%rdi)
++	movq $0,(%rax)			/* Missing (lost) #GP error code */
++	movq $general_protection,RIP(%rdi)
++	retq
++	CFI_ENDPROC
++END(__do_double_fault)
++#else
++# define __do_double_fault do_double_fault
++#endif
++
+ /*
+  * End of kprobes section
+  */
+@@ -1155,7 +1217,7 @@ zeroentry overflow do_overflow
+ zeroentry bounds do_bounds
+ zeroentry invalid_op do_invalid_op
+ zeroentry device_not_available do_device_not_available
+-paranoiderrorentry double_fault do_double_fault
++paranoiderrorentry double_fault __do_double_fault
+ zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
+ errorentry invalid_TSS do_invalid_TSS
+ errorentry segment_not_present do_segment_not_present
+@@ -1486,7 +1548,7 @@ error_sti:
+  */
+ error_kernelspace:
+ 	incl %ebx
+-	leaq irq_return(%rip),%rcx
++	leaq native_irq_return_iret(%rip),%rcx
+ 	cmpq %rcx,RIP+8(%rsp)
+ 	je error_swapgs
+ 	movl %ecx,%eax		/* zero extend */
+diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
+new file mode 100644
+index 000000000000..94d857fb1033
+--- /dev/null
++++ b/arch/x86/kernel/espfix_64.c
+@@ -0,0 +1,208 @@
++/* ----------------------------------------------------------------------- *
++ *
++ *   Copyright 2014 Intel Corporation; author: H. Peter Anvin
++ *
++ *   This program is free software; you can redistribute it and/or modify it
++ *   under the terms and conditions of the GNU General Public License,
++ *   version 2, as published by the Free Software Foundation.
++ *
++ *   This program is distributed in the hope it will be useful, but WITHOUT
++ *   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++ *   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
++ *   more details.
++ *
++ * ----------------------------------------------------------------------- */
++
++/*
++ * The IRET instruction, when returning to a 16-bit segment, only
++ * restores the bottom 16 bits of the user space stack pointer. This
++ * causes some 16-bit software to break, but it also leaks kernel state
++ * to user space.
++ *
++ * This works around this by creating percpu "ministacks", each of which
++ * is mapped 2^16 times 64K apart. When we detect that the return SS is
++ * on the LDT, we copy the IRET frame to the ministack and use the
++ * relevant alias to return to userspace. The ministacks are mapped
++ * readonly, so if the IRET fault we promote #GP to #DF which is an IST
++ * vector and thus has its own stack; we then do the fixup in the #DF
++ * handler.
++ *
++ * This file sets up the ministacks and the related page tables. The
++ * actual ministack invocation is in entry_64.S.
++ */
++
++#include <linux/init.h>
++#include <linux/init_task.h>
++#include <linux/kernel.h>
++#include <linux/percpu.h>
++#include <linux/gfp.h>
++#include <linux/random.h>
++#include <asm/pgtable.h>
++#include <asm/pgalloc.h>
++#include <asm/setup.h>
++#include <asm/espfix.h>
++
++/*
++ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
++ * it up to a cache line to avoid unnecessary sharing.
++ */
++#define ESPFIX_STACK_SIZE	(8*8UL)
++#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)
++
++/* There is address space for how many espfix pages? */
++#define ESPFIX_PAGE_SPACE	(1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
++
++#define ESPFIX_MAX_CPUS		(ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
++#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
++# error "Need more than one PGD for the ESPFIX hack"
++#endif
++
++#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
++
++/* This contains the *bottom* address of the espfix stack */
++DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
++DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
++
++/* Initialization mutex - should this be a spinlock? */
++static DEFINE_MUTEX(espfix_init_mutex);
++
++/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
++#define ESPFIX_MAX_PAGES  DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
++static void *espfix_pages[ESPFIX_MAX_PAGES];
++
++static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
++	__aligned(PAGE_SIZE);
++
++static unsigned int page_random, slot_random;
++
++/*
++ * This returns the bottom address of the espfix stack for a specific CPU.
++ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
++ * we have to account for some amount of padding at the end of each page.
++ */
++static inline unsigned long espfix_base_addr(unsigned int cpu)
++{
++	unsigned long page, slot;
++	unsigned long addr;
++
++	page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
++	slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
++	addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
++	addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
++	addr += ESPFIX_BASE_ADDR;
++	return addr;
++}
++
++#define PTE_STRIDE        (65536/PAGE_SIZE)
++#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
++#define ESPFIX_PMD_CLONES PTRS_PER_PMD
++#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
++
++#define PGTABLE_PROT	  ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
++
++static void init_espfix_random(void)
++{
++	unsigned long rand;
++
++	/*
++	 * This is run before the entropy pools are initialized,
++	 * but this is hopefully better than nothing.
++	 */
++	if (!arch_get_random_long(&rand)) {
++		/* The constant is an arbitrary large prime */
++		rdtscll(rand);
++		rand *= 0xc345c6b72fd16123UL;
++	}
++
++	slot_random = rand % ESPFIX_STACKS_PER_PAGE;
++	page_random = (rand / ESPFIX_STACKS_PER_PAGE)
++		& (ESPFIX_PAGE_SPACE - 1);
++}
++
++void __init init_espfix_bsp(void)
++{
++	pgd_t *pgd_p;
++	pteval_t ptemask;
++
++	ptemask = __supported_pte_mask;
++
++	/* Install the espfix pud into the kernel page directory */
++	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
++	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
++
++	/* Randomize the locations */
++	init_espfix_random();
++
++	/* The rest is the same as for any other processor */
++	init_espfix_ap();
++}
++
++void init_espfix_ap(void)
++{
++	unsigned int cpu, page;
++	unsigned long addr;
++	pud_t pud, *pud_p;
++	pmd_t pmd, *pmd_p;
++	pte_t pte, *pte_p;
++	int n;
++	void *stack_page;
++	pteval_t ptemask;
++
++	/* We only have to do this once... */
++	if (likely(this_cpu_read(espfix_stack)))
++		return;		/* Already initialized */
++
++	cpu = smp_processor_id();
++	addr = espfix_base_addr(cpu);
++	page = cpu/ESPFIX_STACKS_PER_PAGE;
++
++	/* Did another CPU already set this up? */
++	stack_page = ACCESS_ONCE(espfix_pages[page]);
++	if (likely(stack_page))
++		goto done;
++
++	mutex_lock(&espfix_init_mutex);
++
++	/* Did we race on the lock? */
++	stack_page = ACCESS_ONCE(espfix_pages[page]);
++	if (stack_page)
++		goto unlock_done;
++
++	ptemask = __supported_pte_mask;
++
++	pud_p = &espfix_pud_page[pud_index(addr)];
++	pud = *pud_p;
++	if (!pud_present(pud)) {
++		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
++		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
++		paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
++		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
++			set_pud(&pud_p[n], pud);
++	}
++
++	pmd_p = pmd_offset(&pud, addr);
++	pmd = *pmd_p;
++	if (!pmd_present(pmd)) {
++		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
++		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
++		paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
++		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
++			set_pmd(&pmd_p[n], pmd);
++	}
++
++	pte_p = pte_offset_kernel(&pmd, addr);
++	stack_page = (void *)__get_free_page(GFP_KERNEL);
++	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
++	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
++		set_pte(&pte_p[n*PTE_STRIDE], pte);
++
++	/* Job is done for this CPU and any CPU which shares this page */
++	ACCESS_ONCE(espfix_pages[page]) = stack_page;
++
++unlock_done:
++	mutex_unlock(&espfix_init_mutex);
++done:
++	this_cpu_write(espfix_stack, addr);
++	this_cpu_write(espfix_waddr, (unsigned long)stack_page
++		       + (addr & ~PAGE_MASK));
++}
+diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
+index dcbbaa165bde..c37886d759cc 100644
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -20,8 +20,6 @@
+ #include <asm/mmu_context.h>
+ #include <asm/syscalls.h>
+ 
+-int sysctl_ldt16 = 0;
+-
+ #ifdef CONFIG_SMP
+ static void flush_ldt(void *current_mm)
+ {
+@@ -231,16 +229,10 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
+ 		}
+ 	}
+ 
+-	/*
+-	 * On x86-64 we do not support 16-bit segments due to
+-	 * IRET leaking the high bits of the kernel stack address.
+-	 */
+-#ifdef CONFIG_X86_64
+-	if (!ldt_info.seg_32bit && !sysctl_ldt16) {
++	if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ 		error = -EINVAL;
+ 		goto out_unlock;
+ 	}
+-#endif
+ 
+ 	fill_ldt(&ldt, &ldt_info);
+ 	if (oldmode)
+diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
+index 3f08f34f93eb..a1da6737ba5b 100644
+--- a/arch/x86/kernel/paravirt_patch_64.c
++++ b/arch/x86/kernel/paravirt_patch_64.c
+@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+ DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+ DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
+ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
+-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
+ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
+ DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
+ DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
+@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ 		PATCH_SITE(pv_irq_ops, save_fl);
+ 		PATCH_SITE(pv_irq_ops, irq_enable);
+ 		PATCH_SITE(pv_irq_ops, irq_disable);
+-		PATCH_SITE(pv_cpu_ops, iret);
+ 		PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
+ 		PATCH_SITE(pv_cpu_ops, usergs_sysret32);
+ 		PATCH_SITE(pv_cpu_ops, usergs_sysret64);
+diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
+index 849cdcf2e76a..d28c59588ad3 100644
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -271,6 +271,13 @@ notrace static void __cpuinit start_secondary(void *unused)
+ 	check_tsc_sync_target();
+ 
+ 	/*
++	 * Enable the espfix hack for this CPU
++	 */
++#ifdef CONFIG_X86_ESPFIX64
++	init_espfix_ap();
++#endif
++
++	/*
+ 	 * We need to hold call_lock, so there is no inconsistency
+ 	 * between the time smp_call_function() determines number of
+ 	 * IPI recipients, and the time when the determination is made
+diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
+index 0002a3a33081..e04e67753238 100644
+--- a/arch/x86/mm/dump_pagetables.c
++++ b/arch/x86/mm/dump_pagetables.c
+@@ -30,11 +30,13 @@ struct pg_state {
+ 	unsigned long start_address;
+ 	unsigned long current_address;
+ 	const struct addr_marker *marker;
++	unsigned long lines;
+ };
+ 
+ struct addr_marker {
+ 	unsigned long start_address;
+ 	const char *name;
++	unsigned long max_lines;
+ };
+ 
+ /* indices for address_markers; keep sync'd w/ address_markers below */
+@@ -45,6 +47,7 @@ enum address_markers_idx {
+ 	LOW_KERNEL_NR,
+ 	VMALLOC_START_NR,
+ 	VMEMMAP_START_NR,
++	ESPFIX_START_NR,
+ 	HIGH_KERNEL_NR,
+ 	MODULES_VADDR_NR,
+ 	MODULES_END_NR,
+@@ -67,6 +70,7 @@ static struct addr_marker address_markers[] = {
+ 	{ PAGE_OFFSET,		"Low Kernel Mapping" },
+ 	{ VMALLOC_START,	"vmalloc() Area" },
+ 	{ VMEMMAP_START,	"Vmemmap" },
++	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
+ 	{ __START_KERNEL_map,	"High Kernel Mapping" },
+ 	{ MODULES_VADDR,	"Modules" },
+ 	{ MODULES_END,		"End Modules" },
+@@ -163,7 +167,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
+ 		      pgprot_t new_prot, int level)
+ {
+ 	pgprotval_t prot, cur;
+-	static const char units[] = "KMGTPE";
++	static const char units[] = "BKMGTPE";
+ 
+ 	/*
+ 	 * If we have a "break" in the series, we need to flush the state that
+@@ -178,6 +182,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
+ 		st->current_prot = new_prot;
+ 		st->level = level;
+ 		st->marker = address_markers;
++		st->lines = 0;
+ 		seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ 	} else if (prot != cur || level != st->level ||
+ 		   st->current_address >= st->marker[1].start_address) {
+@@ -188,17 +193,21 @@ static void note_page(struct seq_file *m, struct pg_state *st,
+ 		/*
+ 		 * Now print the actual finished series
+ 		 */
+-		seq_printf(m, "0x%0*lx-0x%0*lx   ",
+-			   width, st->start_address,
+-			   width, st->current_address);
+-
+-		delta = (st->current_address - st->start_address) >> 10;
+-		while (!(delta & 1023) && unit[1]) {
+-			delta >>= 10;
+-			unit++;
++		if (!st->marker->max_lines ||
++		    st->lines < st->marker->max_lines) {
++			seq_printf(m, "0x%0*lx-0x%0*lx   ",
++				   width, st->start_address,
++				   width, st->current_address);
++
++			delta = (st->current_address - st->start_address);
++			while (!(delta & 1023) && unit[1]) {
++				delta >>= 10;
++				unit++;
++			}
++			seq_printf(m, "%9lu%c ", delta, *unit);
++			printk_prot(m, st->current_prot, st->level);
+ 		}
+-		seq_printf(m, "%9lu%c ", delta, *unit);
+-		printk_prot(m, st->current_prot, st->level);
++		st->lines++;
+ 
+ 		/*
+ 		 * We print markers for special areas of address space,
+@@ -206,7 +215,15 @@ static void note_page(struct seq_file *m, struct pg_state *st,
+ 		 * This helps in the interpretation.
+ 		 */
+ 		if (st->current_address >= st->marker[1].start_address) {
++			if (st->marker->max_lines &&
++			    st->lines > st->marker->max_lines) {
++				unsigned long nskip =
++					st->lines - st->marker->max_lines;
++				seq_printf(m, "... %lu entr%s skipped ... \n",
++					   nskip, nskip == 1 ? "y" : "ies");
++			}
+ 			st->marker++;
++			st->lines = 0;
+ 			seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ 		}
+ 
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
+index c734408d55e4..66e6d9359826 100644
+--- a/arch/x86/vdso/vdso32-setup.c
++++ b/arch/x86/vdso/vdso32-setup.c
+@@ -41,7 +41,6 @@ enum {
+ #ifdef CONFIG_X86_64
+ #define vdso_enabled			sysctl_vsyscall32
+ #define arch_setup_additional_pages	syscall32_setup_pages
+-extern int sysctl_ldt16;
+ #endif
+ 
+ /*
+@@ -381,13 +380,6 @@ static ctl_table abi_table2[] = {
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec
+ 	},
+-	{
+-		.procname	= "ldt16",
+-		.data		= &sysctl_ldt16,
+-		.maxlen		= sizeof(int),
+-		.mode		= 0644,
+-		.proc_handler	= proc_dointvec
+-	},
+ 	{}
+ };
+ 
+diff --git a/crypto/af_alg.c b/crypto/af_alg.c
+index ac33d5f30778..bf948e134981 100644
+--- a/crypto/af_alg.c
++++ b/crypto/af_alg.c
+@@ -21,6 +21,7 @@
+ #include <linux/module.h>
+ #include <linux/net.h>
+ #include <linux/rwsem.h>
++#include <linux/security.h>
+ 
+ struct alg_type_list {
+ 	const struct af_alg_type *type;
+@@ -243,6 +244,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
+ 
+ 	sock_init_data(newsock, sk2);
+ 	sock_graft(sk2, newsock);
++	security_sk_clone(sk, sk2);
+ 
+ 	err = type->accept(ask->private, sk2);
+ 	if (err) {
+diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
+index 2bc036292d9d..ac99b46dc4a4 100644
+--- a/drivers/scsi/scsi_lib.c
++++ b/drivers/scsi/scsi_lib.c
+@@ -795,6 +795,14 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
+ 			scsi_next_command(cmd);
+ 			return;
+ 		}
++	} else if (blk_rq_bytes(req) == 0 && result && !sense_deferred) {
++		/*
++		 * Certain non BLOCK_PC requests are commands that don't
++		 * actually transfer anything (FLUSH), so cannot use
++		 * good_bytes != blk_rq_bytes(req) as the signal for an error.
++		 * This sets the error explicitly for the problem case.
++		 */
++		error = __scsi_error_from_host_byte(cmd, result);
+ 	}
+ 
+ 	/* no bidi support for !REQ_TYPE_BLOCK_PC yet */
+diff --git a/include/linux/printk.h b/include/linux/printk.h
+index 0525927f203f..d633115489d1 100644
+--- a/include/linux/printk.h
++++ b/include/linux/printk.h
+@@ -101,9 +101,9 @@ asmlinkage __printf(1, 2) __cold
+ int printk(const char *fmt, ...);
+ 
+ /*
+- * Special printk facility for scheduler use only, _DO_NOT_USE_ !
++ * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
+  */
+-__printf(1, 2) __cold int printk_sched(const char *fmt, ...);
++__printf(1, 2) __cold int printk_deferred(const char *fmt, ...);
+ 
+ /*
+  * Please don't use printk_ratelimit(), because it shares ratelimiting state
+@@ -133,7 +133,7 @@ int printk(const char *s, ...)
+ 	return 0;
+ }
+ static inline __printf(1, 2) __cold
+-int printk_sched(const char *s, ...)
++int printk_deferred(const char *s, ...)
+ {
+ 	return 0;
+ }
+diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
+index e22df7a4f1ab..4424db24396d 100644
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -2608,22 +2608,5 @@ static inline bool skb_is_recycleable(const struct sk_buff *skb, int skb_size)
+ 
+ 	return true;
+ }
+-
+-/**
+- * skb_gso_network_seglen - Return length of individual segments of a gso packet
+- *
+- * @skb: GSO skb
+- *
+- * skb_gso_network_seglen is used to determine the real size of the
+- * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
+- *
+- * The MAC/L2 header is not accounted for.
+- */
+-static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+-{
+-	unsigned int hdr_len = skb_transport_header(skb) -
+-			       skb_network_header(skb);
+-	return hdr_len + skb_gso_transport_seglen(skb);
+-}
+ #endif	/* __KERNEL__ */
+ #endif	/* _LINUX_SKBUFF_H */
+diff --git a/init/main.c b/init/main.c
+index db8e381877ad..f30baf52f80b 100644
+--- a/init/main.c
++++ b/init/main.c
+@@ -606,6 +606,10 @@ asmlinkage void __init start_kernel(void)
+ 	if (efi_enabled(EFI_RUNTIME_SERVICES))
+ 		efi_enter_virtual_mode();
+ #endif
++#ifdef CONFIG_X86_ESPFIX64
++	/* Should be run before the first non-init thread is created */
++	init_espfix_bsp();
++#endif
+ 	thread_info_cache_init();
+ 	cred_init();
+ 	fork_init(totalram_pages);
+diff --git a/kernel/printk.c b/kernel/printk.c
+index e39adc13f5f5..544c0215939a 100644
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -1653,7 +1653,7 @@ late_initcall(printk_late_init);
+ 
+ #if defined CONFIG_PRINTK
+ 
+-int printk_sched(const char *fmt, ...)
++int printk_deferred(const char *fmt, ...)
+ {
+ 	unsigned long flags;
+ 	va_list args;
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index af837d82634e..5701cb9a673f 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1313,7 +1313,7 @@ out:
+ 		 * leave kernel.
+ 		 */
+ 		if (p->mm && printk_ratelimit()) {
+-			printk_sched("process %d (%s) no longer affine to cpu%d\n",
++			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
+ 					task_pid_nr(p), p->comm, cpu);
+ 		}
+ 	}
+diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
+index 2b0131835714..526c77d1b655 100644
+--- a/kernel/sched/rt.c
++++ b/kernel/sched/rt.c
+@@ -884,7 +884,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
+ 
+ 			if (!once) {
+ 				once = true;
+-				printk_sched("sched: RT throttling activated\n");
++				printk_deferred("sched: RT throttling activated\n");
+ 			}
+ 		} else {
+ 			/*
+diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
+index 0d37a6fd18af..4d7ee79ec469 100644
+--- a/kernel/time/clockevents.c
++++ b/kernel/time/clockevents.c
+@@ -143,7 +143,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
+ {
+ 	/* Nothing to do if we already reached the limit */
+ 	if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
+-		printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
++		printk_deferred(KERN_WARNING
++				"CE: Reprogramming failure. Giving up\n");
+ 		dev->next_event.tv64 = KTIME_MAX;
+ 		return -ETIME;
+ 	}
+@@ -156,9 +157,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
+ 	if (dev->min_delta_ns > MIN_DELTA_LIMIT)
+ 		dev->min_delta_ns = MIN_DELTA_LIMIT;
+ 
+-	printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
+-	       dev->name ? dev->name : "?",
+-	       (unsigned long long) dev->min_delta_ns);
++	printk_deferred(KERN_WARNING
++			"CE: %s increased min_delta_ns to %llu nsec\n",
++			dev->name ? dev->name : "?",
++			(unsigned long long) dev->min_delta_ns);
+ 	return 0;
+ }
+ 
+diff --git a/lib/btree.c b/lib/btree.c
+index 5cf9e74ec3f3..53a773ecf57c 100644
+--- a/lib/btree.c
++++ b/lib/btree.c
+@@ -198,6 +198,7 @@ EXPORT_SYMBOL_GPL(btree_init);
+ 
+ void btree_destroy(struct btree_head *head)
+ {
++	mempool_free(head->node, head->mempool);
+ 	mempool_destroy(head->mempool);
+ 	head->mempool = NULL;
+ }
+diff --git a/mm/mlock.c b/mm/mlock.c
+index ef726e8aa8e9..7a84dd1f5ec6 100644
+--- a/mm/mlock.c
++++ b/mm/mlock.c
+@@ -78,6 +78,7 @@ void __clear_page_mlock(struct page *page)
+  */
+ void mlock_vma_page(struct page *page)
+ {
++	/* Serialize with page migration */
+ 	BUG_ON(!PageLocked(page));
+ 
+ 	if (!TestSetPageMlocked(page)) {
+@@ -105,6 +106,7 @@ void mlock_vma_page(struct page *page)
+  */
+ void munlock_vma_page(struct page *page)
+ {
++	/* For try_to_munlock() and to serialize with page migration */
+ 	BUG_ON(!PageLocked(page));
+ 
+ 	if (TestClearPageMlocked(page)) {
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 39d530a425b5..ff0b0997b953 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2187,7 +2187,7 @@ static inline int
+ gfp_to_alloc_flags(gfp_t gfp_mask)
+ {
+ 	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+-	const gfp_t wait = gfp_mask & __GFP_WAIT;
++	const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
+ 
+ 	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
+ 	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
+@@ -2196,20 +2196,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
+ 	 * The caller may dip into page reserves a bit more if the caller
+ 	 * cannot run direct reclaim, or if the caller has realtime scheduling
+ 	 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
+-	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
++	 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
+ 	 */
+ 	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
+ 
+-	if (!wait) {
++	if (atomic) {
+ 		/*
+-		 * Not worth trying to allocate harder for
+-		 * __GFP_NOMEMALLOC even if it can't schedule.
++		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
++		 * if it can't schedule.
+ 		 */
+-		if (!(gfp_mask & __GFP_NOMEMALLOC))
++		if (!(gfp_mask & __GFP_NOMEMALLOC))
+ 			alloc_flags |= ALLOC_HARDER;
+ 		/*
+-		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+-		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
++		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
++		 * comment for __cpuset_node_allowed_softwall().
+ 		 */
+ 		alloc_flags &= ~ALLOC_CPUSET;
+ 	} else if (unlikely(rt_task(current)) && !in_interrupt())
+diff --git a/mm/rmap.c b/mm/rmap.c
+index 6dc46f345dba..695eaff55d77 100644
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -1449,9 +1449,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
+ 		BUG_ON(!page || PageAnon(page));
+ 
+ 		if (locked_vma) {
+-			mlock_vma_page(page);	/* no-op if already mlocked */
+-			if (page == check_page)
++			if (page == check_page) {
++				/* we know we have check_page locked */
++				mlock_vma_page(page);
+ 				ret = SWAP_MLOCK;
++			} else if (trylock_page(page)) {
++				/*
++				 * If we can lock the page, perform mlock.
++				 * Otherwise leave the page alone, it will be
++				 * eventually encountered again later.
++				 */
++				mlock_vma_page(page);
++				unlock_page(page);
++			}
+ 			continue;	/* don't unmap */
+ 		}
+ 
+diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
+index 7593f3a46035..29a07b6c7168 100644
+--- a/net/ipv4/ip_forward.c
++++ b/net/ipv4/ip_forward.c
+@@ -39,68 +39,6 @@
+ #include <net/route.h>
+ #include <net/xfrm.h>
+ 
+-static bool ip_may_fragment(const struct sk_buff *skb)
+-{
+-	return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
+-	       skb->local_df;
+-}
+-
+-static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+-{
+-	if (skb->len <= mtu)
+-		return false;
+-
+-	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+-		return false;
+-
+-	return true;
+-}
+-
+-static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb)
+-{
+-	unsigned int mtu;
+-
+-	if (skb->local_df || !skb_is_gso(skb))
+-		return false;
+-
+-	mtu = dst_mtu(skb_dst(skb));
+-
+-	/* if seglen > mtu, do software segmentation for IP fragmentation on
+-	 * output. DF bit cannot be set since ip_forward would have sent
+-	 * icmp error.
+-	 */
+-	return skb_gso_network_seglen(skb) > mtu;
+-}
+-
+-/* called if GSO skb needs to be fragmented on forward */
+-static int ip_forward_finish_gso(struct sk_buff *skb)
+-{
+-	struct sk_buff *segs;
+-	int ret = 0;
+-
+-	segs = skb_gso_segment(skb, 0);
+-	if (IS_ERR(segs)) {
+-		kfree_skb(skb);
+-		return -ENOMEM;
+-	}
+-
+-	consume_skb(skb);
+-
+-	do {
+-		struct sk_buff *nskb = segs->next;
+-		int err;
+-
+-		segs->next = NULL;
+-		err = dst_output(segs);
+-
+-		if (err && ret == 0)
+-			ret = err;
+-		segs = nskb;
+-	} while (segs);
+-
+-	return ret;
+-}
+-
+ static int ip_forward_finish(struct sk_buff *skb)
+ {
+ 	struct ip_options * opt	= &(IPCB(skb)->opt);
+@@ -110,9 +48,6 @@ static int ip_forward_finish(struct sk_buff *skb)
+ 	if (unlikely(opt->optlen))
+ 		ip_forward_options(skb);
+ 
+-	if (ip_gso_exceeds_dst_mtu(skb))
+-		return ip_forward_finish_gso(skb);
+-
+ 	return dst_output(skb);
+ }
+ 
+@@ -152,7 +87,8 @@ int ip_forward(struct sk_buff *skb)
+ 	if (opt->is_strictroute && opt->nexthop != rt->rt_gateway)
+ 		goto sr_failed;
+ 
+-	if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, dst_mtu(&rt->dst))) {
++	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
++		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
+ 		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
+ 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ 			  htonl(dst_mtu(&rt->dst)));
+diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
+index f976c3858df0..be5876079a8e 100644
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -2438,8 +2438,18 @@ static void init_loopback(struct net_device *dev)
+ 			if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
+ 				continue;
+ 
+-			if (sp_ifa->rt)
+-				continue;
++			if (sp_ifa->rt) {
++				/* This dst has been added to garbage list when
++				 * lo device down, release this obsolete dst and
++				 * reallocate a new router for ifa.
++				 */
++				if (sp_ifa->rt->dst.obsolete > 0) {
++					dst_release(&sp_ifa->rt->dst);
++					sp_ifa->rt = NULL;
++				} else {
++					continue;
++				}
++			}
+ 
+ 			sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, 0);
+ 
+diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
+index 077b9a3f8d25..6225b7cf12b9 100644
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -382,17 +382,6 @@ static inline int ip6_forward_finish(struct sk_buff *skb)
+ 	return dst_output(skb);
+ }
+ 
+-static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+-{
+-	if (skb->len <= mtu || skb->local_df)
+-		return false;
+-
+-	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+-		return false;
+-
+-	return true;
+-}
+-
+ int ip6_forward(struct sk_buff *skb)
+ {
+ 	struct dst_entry *dst = skb_dst(skb);
+@@ -514,7 +503,7 @@ int ip6_forward(struct sk_buff *skb)
+ 	if (mtu < IPV6_MIN_MTU)
+ 		mtu = IPV6_MIN_MTU;
+ 
+-	if (ip6_pkt_too_big(skb, mtu)) {
++	if (skb->len > mtu && !skb_is_gso(skb)) {
+ 		/* Again, force OUTPUT device used as source address */
+ 		skb->dev = dst->dev;
+ 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
+index 4e38a81e48ee..82ed7dfb7b80 100644
+--- a/net/l2tp/l2tp_ppp.c
++++ b/net/l2tp/l2tp_ppp.c
+@@ -1351,7 +1351,7 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
+ 	int err;
+ 
+ 	if (level != SOL_PPPOL2TP)
+-		return udp_prot.setsockopt(sk, level, optname, optval, optlen);
++		return -EINVAL;
+ 
+ 	if (optlen < sizeof(int))
+ 		return -EINVAL;
+@@ -1477,7 +1477,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level,
+ 	struct pppol2tp_session *ps;
+ 
+ 	if (level != SOL_PPPOL2TP)
+-		return udp_prot.getsockopt(sk, level, optname, optval, optlen);
++		return -EINVAL;
+ 
+ 	if (get_user(len, (int __user *) optlen))
+ 		return -EFAULT;
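
Editorial sketch, not part of the commit or the patch it carries: the heart of the
espfix64 change above is the address arithmetic in espfix_base_addr(). Each percpu
ministack page is aliased 2^16 times at 64K strides, and the spread
(addr & 0xffff) | ((addr & ~0xffff) << 16) keeps bits 16..31 of every ministack
address clear, so entry_64.S can merge in exactly those bits of the user RSP with
"andl $0xffff0000,%eax; orq PER_CPU_VAR(espfix_stack),%rax". The standalone
user-space C program below re-derives that math for inspection only; the constants
mirror the patch (PAGE_SHIFT 12, PGDIR_SHIFT 39, 64-byte stack slots) and the
per-boot randomization from init_espfix_random() is assumed to be zero.

	/*
	 * Editorial sketch only -- not from the patch. Constants mirror
	 * espfix_64.c above; slot_random/page_random are taken as zero.
	 */
	#include <stdio.h>

	#define PAGE_SHIFT		12
	#define PAGE_SIZE		(1UL << PAGE_SHIFT)
	#define PGDIR_SHIFT		39
	#define ESPFIX_STACK_SIZE	(8*8UL)				/* 64 bytes */
	#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)	/* 64 */
	#define ESPFIX_BASE_ADDR	(-2UL << PGDIR_SHIFT)	/* 0xffffff0000000000 */

	static unsigned long espfix_base_addr(unsigned int cpu)
	{
		unsigned long page = cpu / ESPFIX_STACKS_PER_PAGE;
		unsigned long slot = cpu % ESPFIX_STACKS_PER_PAGE;
		unsigned long addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);

		/*
		 * Spread the address so that bits 16..31 of the result stay
		 * zero; entry_64.S later fills them from the user RSP to pick
		 * the matching 64K-spaced alias of the ministack page.
		 */
		addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
		return addr + ESPFIX_BASE_ADDR;
	}

	int main(void)
	{
		unsigned int cpu;

		for (cpu = 0; cpu < 4; cpu++)
			printf("cpu %u -> ministack base 0x%016lx\n",
			       cpu, espfix_base_addr(cpu));
		return 0;
	}

With the randomization zeroed, CPU 0 prints 0xffffff0000000000 and CPU 1 prints
0xffffff0000000040 (CPUs sharing a page sit 64 bytes apart); in the real kernel,
init_espfix_random() perturbs both the page and slot choice so the leaked alias
address is not predictable.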