This patch, by Jake Moilanen with some further hacking from me, adds a real execute permission bit to the Linux PTEs on PPC64, and connects that into the kernel infrastructure for implementing non-executable stacks and heaps. This means that on any PPC64 CPU since the POWER4 (i.e. POWER4, PPC970, PPC970FX, POWER4+, POWER5) you will get a segfault if you try to execute instructions from a region that doesn't have PROT_EXEC permission. The patch also marks the pages of the linear mapping that aren't part of the kernel text as non-executable.
Andrew and Linus, could you try this on your G5s? I have tried it here on a Debian system and a SLES9 system and everything runs fine, but I haven't been able to try it on YDL, FC or RHEL4. With this patch we default to executable stack and read-implies-exec behaviour when there is no PT_GNU_STACK program header entry, or when there is one and it indicates the stack is executable. For 32-bit processes, the heap is always executable, because the PLT contains instructions and it ends up in the bss segment. Signed-off-by: Jake Moilanen <[EMAIL PROTECTED]> Signed-off-by: Paul Mackerras <[EMAIL PROTECTED]> diff -urN linux-2.5/arch/ppc64/kernel/head.S g5-ppc64/arch/ppc64/kernel/head.S --- linux-2.5/arch/ppc64/kernel/head.S 2005-03-07 08:21:53.000000000 +1100 +++ g5-ppc64/arch/ppc64/kernel/head.S 2005-03-15 17:14:44.000000000 +1100 @@ -950,11 +950,12 @@ * accessing a userspace segment (even from the kernel). We assume * kernel addresses always have the high bit set. */ - rlwinm r4,r4,32-23,29,29 /* DSISR_STORE -> _PAGE_RW */ + rlwinm r4,r4,32-25+9,31-9,31-9 /* DSISR_STORE -> _PAGE_RW */ rotldi r0,r3,15 /* Move high bit into MSR_PR posn */ orc r0,r12,r0 /* MSR_PR | ~high_bit */ rlwimi r4,r0,32-13,30,30 /* becomes _PAGE_USER access bit */ ori r4,r4,1 /* add _PAGE_PRESENT */ + rlwimi r4,r5,22+2,31-2,31-2 /* Set _PAGE_EXEC if trap is 0x400 */ /* * On iSeries, we soft-disable interrupts here, then diff -urN linux-2.5/arch/ppc64/kernel/iSeries_htab.c g5-ppc64/arch/ppc64/kernel/iSeries_htab.c --- linux-2.5/arch/ppc64/kernel/iSeries_htab.c 2004-09-24 15:23:06.000000000 +1000 +++ g5-ppc64/arch/ppc64/kernel/iSeries_htab.c 2005-03-15 17:15:36.000000000 +1100 @@ -144,6 +144,10 @@ HvCallHpt_get(&hpte, slot); if ((hpte.dw0.dw0.avpn == avpn) && (hpte.dw0.dw0.v)) { + /* + * Hypervisor expects bits as NPPP, which is + * different from how they are mapped in our PP. 
+ */ HvCallHpt_setPp(slot, (newpp & 0x3) | ((newpp & 0x4) << 1)); iSeries_hunlock(slot); return 0; diff -urN linux-2.5/arch/ppc64/kernel/iSeries_setup.c g5-ppc64/arch/ppc64/kernel/iSeries_setup.c --- linux-2.5/arch/ppc64/kernel/iSeries_setup.c 2005-03-07 08:21:53.000000000 +1100 +++ g5-ppc64/arch/ppc64/kernel/iSeries_setup.c 2005-03-15 16:55:05.000000000 +1100 @@ -633,6 +633,10 @@ unsigned long vpn = va >> PAGE_SHIFT; unsigned long slot = HvCallHpt_findValid(&hpte, vpn); + /* Make non-kernel text non-executable */ + if (!in_kernel_text(ea)) + mode_rw |= HW_NO_EXEC; + if (hpte.dw0.dw0.v) { /* HPTE exists, so just bolt it */ HvCallHpt_setSwBits(slot, 0x10, 0); diff -urN linux-2.5/arch/ppc64/kernel/module.c g5-ppc64/arch/ppc64/kernel/module.c --- linux-2.5/arch/ppc64/kernel/module.c 2004-05-11 07:53:04.000000000 +1000 +++ g5-ppc64/arch/ppc64/kernel/module.c 2005-03-15 16:55:05.000000000 +1100 @@ -102,7 +102,8 @@ { if (size == 0) return NULL; - return vmalloc(size); + + return vmalloc_exec(size); } /* Free memory returned from module_alloc */ diff -urN linux-2.5/arch/ppc64/kernel/pSeries_lpar.c g5-ppc64/arch/ppc64/kernel/pSeries_lpar.c --- linux-2.5/arch/ppc64/kernel/pSeries_lpar.c 2005-03-07 08:21:53.000000000 +1100 +++ g5-ppc64/arch/ppc64/kernel/pSeries_lpar.c 2005-03-15 16:55:02.000000000 +1100 @@ -470,7 +470,7 @@ slot = pSeries_lpar_hpte_find(vpn); BUG_ON(slot == -1); - flags = newpp & 3; + flags = newpp & 7; lpar_rc = plpar_pte_protect(flags, slot, 0); BUG_ON(lpar_rc != H_Success); diff -urN linux-2.5/arch/ppc64/mm/fault.c g5-ppc64/arch/ppc64/mm/fault.c --- linux-2.5/arch/ppc64/mm/fault.c 2005-01-04 17:25:05.000000000 +1100 +++ g5-ppc64/arch/ppc64/mm/fault.c 2005-03-15 17:13:05.000000000 +1100 @@ -91,8 +91,9 @@ struct mm_struct *mm = current->mm; siginfo_t info; unsigned long code = SEGV_MAPERR; - unsigned long is_write = error_code & 0x02000000; + unsigned long is_write = error_code & DSISR_ISSTORE; unsigned long trap = TRAP(regs); + unsigned long is_exec = trap 
== 0x400; BUG_ON((trap == 0x380) || (trap == 0x480)); @@ -109,7 +110,7 @@ if (!user_mode(regs) && (address >= TASK_SIZE)) return SIGSEGV; - if (error_code & 0x00400000) { + if (error_code & DSISR_DABRMATCH) { if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, 11, SIGSEGV) == NOTIFY_STOP) return 0; @@ -199,16 +200,19 @@ good_area: code = SEGV_ACCERR; + if (is_exec) { + /* protection fault */ + if (error_code & DSISR_PROTFAULT) + goto bad_area; + if (!(vma->vm_flags & VM_EXEC)) + goto bad_area; /* a write */ - if (is_write) { + } else if (is_write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; /* a read */ } else { - /* protection fault */ - if (error_code & 0x08000000) - goto bad_area; - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & VM_READ)) goto bad_area; } @@ -251,6 +255,12 @@ return 0; } + if (trap == 0x400 && (error_code & DSISR_PROTFAULT) + && printk_ratelimit()) + printk(KERN_CRIT "kernel tried to execute NX-protected" + " page (%lx) - exploit attempt? (uid: %d)\n", + address, current->uid); + return SIGSEGV; /* diff -urN linux-2.5/arch/ppc64/mm/hash_low.S g5-ppc64/arch/ppc64/mm/hash_low.S --- linux-2.5/arch/ppc64/mm/hash_low.S 2005-01-06 13:13:08.000000000 +1100 +++ g5-ppc64/arch/ppc64/mm/hash_low.S 2005-03-15 16:55:02.000000000 +1100 @@ -89,7 +89,7 @@ /* Prepare new PTE value (turn access RW into DIRTY, then * add BUSY,HASHPTE and ACCESSED) */ - rlwinm r30,r4,5,24,24 /* _PAGE_RW -> _PAGE_DIRTY */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ or r30,r30,r31 ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE /* Write the linux PTE atomically (setting busy) */ @@ -112,11 +112,11 @@ rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */ xor r28,r5,r0 - - /* Convert linux PTE bits into HW equivalents - */ - andi. 
r3,r30,0x1fa /* Get basic set of flags */ - rlwinm r0,r30,32-2+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ + + /* Convert linux PTE bits into HW equivalents */ + andi. r3,r30,0x1fe /* Get basic set of flags */ + xori r3,r3,HW_NO_EXEC /* _PAGE_EXEC -> NOEXEC */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */ andc r0,r30,r0 /* r0 = pte & ~r0 */ diff -urN linux-2.5/arch/ppc64/mm/hash_utils.c g5-ppc64/arch/ppc64/mm/hash_utils.c --- linux-2.5/arch/ppc64/mm/hash_utils.c 2005-03-07 08:21:53.000000000 +1100 +++ g5-ppc64/arch/ppc64/mm/hash_utils.c 2005-03-15 17:20:35.000000000 +1100 @@ -51,6 +51,7 @@ #include <asm/cacheflush.h> #include <asm/cputable.h> #include <asm/abs_addr.h> +#include <asm/sections.h> #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -95,6 +96,7 @@ { unsigned long addr; unsigned int step; + unsigned long tmp_mode; if (large) step = 16*MB; @@ -112,6 +114,13 @@ else vpn = va >> PAGE_SHIFT; + + tmp_mode = mode; + + /* Make non-kernel text non-executable */ + if (!in_kernel_text(addr)) + tmp_mode = mode | HW_NO_EXEC; + hash = hpt_hash(vpn, large); hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); @@ -120,12 +129,12 @@ if (systemcfg->platform & PLATFORM_LPAR) ret = pSeries_lpar_hpte_insert(hpteg, va, virt_to_abs(addr) >> PAGE_SHIFT, - 0, mode, 1, large); + 0, tmp_mode, 1, large); else #endif /* CONFIG_PPC_PSERIES */ ret = native_hpte_insert(hpteg, va, virt_to_abs(addr) >> PAGE_SHIFT, - 0, mode, 1, large); + 0, tmp_mode, 1, large); if (ret == -1) { ppc64_terminate_msg(0x20, "create_pte_mapping"); @@ -238,8 +247,6 @@ { struct page *page; -#define PPC64_HWNOEXEC (1 << 2) - if (!pfn_valid(pte_pfn(pte))) return pp; @@ -251,7 +258,7 @@ __flush_dcache_icache(page_address(page)); set_bit(PG_arch_1, &page->flags); } else - pp |= PPC64_HWNOEXEC; + pp |= HW_NO_EXEC; } return pp; } diff -urN linux-2.5/arch/ppc64/mm/hugetlbpage.c 
g5-ppc64/arch/ppc64/mm/hugetlbpage.c --- linux-2.5/arch/ppc64/mm/hugetlbpage.c 2005-03-07 17:58:40.000000000 +1100 +++ g5-ppc64/arch/ppc64/mm/hugetlbpage.c 2005-03-15 17:27:33.000000000 +1100 @@ -782,7 +782,6 @@ { pte_t *ptep; unsigned long va, vpn; - int is_write; pte_t old_pte, new_pte; unsigned long hpteflags, prpn; long slot; @@ -809,8 +808,7 @@ * Check the user's access rights to the page. If access should be * prevented then send the problem up to do_page_fault. */ - is_write = access & _PAGE_RW; - if (unlikely(is_write && !(pte_val(*ptep) & _PAGE_RW))) + if (unlikely(access & ~pte_val(*ptep))) goto out; /* * At this point, we have a pte (old_pte) which can be used to build @@ -829,6 +827,8 @@ new_pte = old_pte; hpteflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW)); + /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ + hpteflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC); /* Check if pte already has an hpte (case 2) */ if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) { diff -urN linux-2.5/include/asm-ppc64/elf.h g5-ppc64/include/asm-ppc64/elf.h --- linux-2.5/include/asm-ppc64/elf.h 2005-03-07 08:21:53.000000000 +1100 +++ g5-ppc64/include/asm-ppc64/elf.h 2005-03-17 19:34:20.000000000 +1100 @@ -226,6 +226,13 @@ else if (current->personality != PER_LINUX32) \ set_personality(PER_LINUX); \ } while (0) + +/* + * An executable for which elf_read_implies_exec() returns TRUE will + * have the READ_IMPLIES_EXEC personality flag set automatically. 
+ */ +#define elf_read_implies_exec(ex, exec_stk) (exec_stk != EXSTACK_DISABLE_X) + #endif /* diff -urN linux-2.5/include/asm-ppc64/page.h g5-ppc64/include/asm-ppc64/page.h --- linux-2.5/include/asm-ppc64/page.h 2005-03-07 08:21:54.000000000 +1100 +++ g5-ppc64/include/asm-ppc64/page.h 2005-03-17 19:31:09.000000000 +1100 @@ -235,7 +235,26 @@ #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ +/* + * Unfortunately the PLT is in the BSS in the PPC32 ELF ABI, + * and needs to be executable. This means the whole heap ends + * up being executable. + */ +#define VM_DATA_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#define VM_DATA_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#define VM_DATA_DEFAULT_FLAGS \ + (test_thread_flag(TIF_32BIT) ? \ + VM_DATA_DEFAULT_FLAGS32 : VM_DATA_DEFAULT_FLAGS64) + +/* + * This is the default if a program doesn't have a PT_GNU_STACK + * program header entry. 
+ */ +#define VM_STACK_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #endif /* __KERNEL__ */ diff -urN linux-2.5/include/asm-ppc64/pgtable.h g5-ppc64/include/asm-ppc64/pgtable.h --- linux-2.5/include/asm-ppc64/pgtable.h 2005-03-07 17:58:40.000000000 +1100 +++ g5-ppc64/include/asm-ppc64/pgtable.h 2005-03-17 09:04:49.000000000 +1100 @@ -82,14 +82,14 @@ #define _PAGE_PRESENT 0x0001 /* software: pte contains a translation */ #define _PAGE_USER 0x0002 /* matches one of the PP bits */ #define _PAGE_FILE 0x0002 /* (!present only) software: pte holds file offset */ -#define _PAGE_RW 0x0004 /* software: user write access allowed */ +#define _PAGE_EXEC 0x0004 /* No execute on POWER4 and newer (we invert) */ #define _PAGE_GUARDED 0x0008 #define _PAGE_COHERENT 0x0010 /* M: enforce memory coherence (SMP systems) */ #define _PAGE_NO_CACHE 0x0020 /* I: cache inhibit */ #define _PAGE_WRITETHRU 0x0040 /* W: cache write-through */ #define _PAGE_DIRTY 0x0080 /* C: page changed */ #define _PAGE_ACCESSED 0x0100 /* R: page referenced */ -#define _PAGE_EXEC 0x0200 /* software: i-cache coherence required */ +#define _PAGE_RW 0x0200 /* software: user write access allowed */ #define _PAGE_HASHPTE 0x0400 /* software: pte has an associated HPTE */ #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */ #define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */ @@ -118,29 +118,38 @@ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_WRENABLE) #define PAGE_KERNEL_CI __pgprot(_PAGE_PRESENT | _PAGE_ACCESSED | \ _PAGE_WRENABLE | _PAGE_NO_CACHE | _PAGE_GUARDED) +#define PAGE_KERNEL_EXEC __pgprot(_PAGE_BASE | _PAGE_WRENABLE | _PAGE_EXEC) /* - * The PowerPC can only do execute protection on a segment (256MB) basis, - * not on a page basis. So we consider execute permission the same as read. + * This bit in a hardware PTE indicates that the page is *not* executable. 
+ */ +#define HW_NO_EXEC _PAGE_EXEC + +/* + * POWER4 and newer have per page execute protection, older chips can only + * do this on a segment (256MB) basis. + * * Also, write permissions imply read permissions. * This is the closest we can get.. + * + * Note due to the way vm flags are laid out, the bits are XWR */ #define __P000 PAGE_NONE -#define __P001 PAGE_READONLY_X +#define __P001 PAGE_READONLY #define __P010 PAGE_COPY -#define __P011 PAGE_COPY_X -#define __P100 PAGE_READONLY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_X #define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY +#define __P110 PAGE_COPY_X #define __P111 PAGE_COPY_X #define __S000 PAGE_NONE -#define __S001 PAGE_READONLY_X +#define __S001 PAGE_READONLY #define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED_X -#define __S100 PAGE_READONLY +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_X #define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED +#define __S110 PAGE_SHARED_X #define __S111 PAGE_SHARED_X #ifndef __ASSEMBLY__ @@ -438,7 +447,7 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry, int dirty) { unsigned long bits = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW); + (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); unsigned long old, tmp; __asm__ __volatile__( diff -urN linux-2.5/include/asm-ppc64/processor.h g5-ppc64/include/asm-ppc64/processor.h --- linux-2.5/include/asm-ppc64/processor.h 2005-03-07 08:21:54.000000000 +1100 +++ g5-ppc64/include/asm-ppc64/processor.h 2005-03-15 17:08:21.000000000 +1100 @@ -173,6 +173,11 @@ #define SPRN_DEC 0x016 /* Decrement Register */ #define SPRN_DMISS 0x3D0 /* Data TLB Miss Register */ #define SPRN_DSISR 0x012 /* Data Storage Interrupt Status Register */ +#define DSISR_NOHPTE 0x40000000 /* no translation found */ +#define DSISR_PROTFAULT 0x08000000 /* protection fault */ +#define DSISR_ISSTORE 0x02000000 /* access was a store */ +#define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint 
*/ +#define DSISR_NOSEGMENT 0x00200000 /* STAB/SLB miss */ #define SPRN_EAR 0x11A /* External Address Register */ #define SPRN_ESR 0x3D4 /* Exception Syndrome Register */ #define ESR_IMCP 0x80000000 /* Instr. Machine Check - Protection */ diff -urN linux-2.5/include/asm-ppc64/sections.h g5-ppc64/include/asm-ppc64/sections.h --- linux-2.5/include/asm-ppc64/sections.h 2004-02-12 18:17:28.000000000 +1100 +++ g5-ppc64/include/asm-ppc64/sections.h 2005-03-15 16:55:05.000000000 +1100 @@ -17,4 +17,13 @@ #define __openfirmware #define __openfirmwaredata + +static inline int in_kernel_text(unsigned long addr) +{ + if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end) + return 1; + + return 0; +} + #endif - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/