On 2020/09/23 10:22, Theo de Raadt wrote:
> kettenis and I have spent some time trying to track this down.
>
> There is an obscure MP-unsafe / thread-unsafe in the W^X
> line-in-the-sand mechanism, which is done using GDT CS manipulation.
> It is related to sleeps in the trap function.
>
> On PAE NX systems, the GDT CS manipulation is not neccessary, so here is
> a diff to bypass all that.
>
> The pile of KERNEL_LOCK changes mechanically remove the need for the NX
> codepath to grab the lock -- they just get moved inside the non-NX fixup
> function.
>
> We will need to fix the non-NX case seperately. Such systems are
> quite rare now. The bug probably doesn't affect single-cpu i386.
Seems to work, I have been building sysutils/beats/packetbeat in a loop
for 3 hours with this version of the diff with no failure. Would have
been lucky to make it 5 minutes without.
>
> Index: i386/machdep.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/i386/i386/machdep.c,v
> retrieving revision 1.639
> diff -u -p -u -r1.639 machdep.c
> --- i386/machdep.c 13 Sep 2020 05:57:28 -0000 1.639
> +++ i386/machdep.c 23 Sep 2020 13:06:57 -0000
> @@ -2955,15 +2955,21 @@ setregs(struct proc *p, struct exec_pack
> p->p_md.md_flags &= ~MDP_USEDFPU;
> #endif
>
> - /*
> - * Reset the code segment limit to I386_MAX_EXE_ADDR in the pmap;
> - * this gets copied into the GDT for GUCODE_SEL by pmap_activate().
> - * Similarly, reset the base of each of the two thread data
> - * segments to zero in the pcb; they'll get copied into the
> - * GDT for GUFS_SEL and GUGS_SEL.
> - */
> - setsegment(&pmap->pm_codeseg, 0, atop(I386_MAX_EXE_ADDR) - 1,
> - SDT_MEMERA, SEL_UPL, 1, 1);
> + if (cpu_pae) {
> + setsegment(&pmap->pm_codeseg, 0, atop(VM_MAXUSER_ADDRESS - 1),
> + SDT_MEMERA, SEL_UPL, 1, 1);
> + } else {
> + /*
> + * For pmap.c's non-PAE/NX line-in-the-sand execution,
> + * reset the code segment limit to I386_MAX_EXE_ADDR in the
> pmap;
> + * this gets copied into the GDT for GUCODE_SEL by
> pmap_activate().
> + * Similarly, reset the base of each of the two thread data
> + * segments to zero in the pcb; they'll get copied into the
> + * GDT for GUFS_SEL and GUGS_SEL.
> + */
> + setsegment(&pmap->pm_codeseg, 0, atop(I386_MAX_EXE_ADDR - 1),
> + SDT_MEMERA, SEL_UPL, 1, 1);
> + }
> setsegment(&pcb->pcb_threadsegs[TSEG_FS], 0,
> atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 1);
> setsegment(&pcb->pcb_threadsegs[TSEG_GS], 0,
> Index: i386/pmap.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
> retrieving revision 1.207
> diff -u -p -u -r1.207 pmap.c
> --- i386/pmap.c 23 Sep 2020 15:13:26 -0000 1.207
> +++ i386/pmap.c 23 Sep 2020 16:15:22 -0000
> @@ -582,6 +582,9 @@ pmap_exec_account(struct pmap *pm, vaddr
> if ((opte ^ npte) & PG_X)
> pmap_tlb_shootpage(pm, va);
>
> + if (cpu_pae)
> + return;
> +
> /*
> * Executability was removed on the last executable change.
> * Reset the code segment to something conservative and
> @@ -614,6 +617,8 @@ pmap_exec_fixup(struct vm_map *map, stru
> vaddr_t va = 0;
> vaddr_t pm_cs, gdt_cs;
>
> + KERNEL_LOCK();
> +
> vm_map_lock(map);
> RBT_FOREACH_REVERSE(ent, uvm_map_addr, &map->addr) {
> if (ent->protection & PROT_EXEC)
> @@ -640,6 +645,7 @@ pmap_exec_fixup(struct vm_map *map, stru
> */
> if (va <= pm->pm_hiexec && pm_cs == pm->pm_hiexec &&
> gdt_cs == pm->pm_hiexec) {
> + KERNEL_UNLOCK();
> return (0);
> }
>
> @@ -652,6 +658,7 @@ pmap_exec_fixup(struct vm_map *map, stru
> */
> setcslimit(pm, tf, pcb, va);
>
> + KERNEL_UNLOCK();
> return (1);
> }
>
> @@ -1344,8 +1351,12 @@ pmap_create(void)
> pmap->pm_hiexec = 0;
> pmap->pm_flags = 0;
>
> - setsegment(&pmap->pm_codeseg, 0, atop(I386_MAX_EXE_ADDR) - 1,
> - SDT_MEMERA, SEL_UPL, 1, 1);
> + if (cpu_pae)
> + setsegment(&pmap->pm_codeseg, 0, atop(VM_MAXUSER_ADDRESS - 1),
> + SDT_MEMERA, SEL_UPL, 1, 1);
> + else
> + setsegment(&pmap->pm_codeseg, 0, atop(I386_MAX_EXE_ADDR - 1),
> + SDT_MEMERA, SEL_UPL, 1, 1);
>
> pmap_pinit_pd(pmap);
> return (pmap);
> Index: i386/trap.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/i386/i386/trap.c,v
> retrieving revision 1.145
> diff -u -p -u -r1.145 trap.c
> --- i386/trap.c 15 Sep 2020 09:30:31 -0000 1.145
> +++ i386/trap.c 23 Sep 2020 16:14:15 -0000
> @@ -258,18 +258,14 @@ trap(struct trapframe *frame)
> return;
>
> case T_PROTFLT|T_USER: /* protection fault */
> - KERNEL_LOCK();
> -
> /* If pmap_exec_fixup does something, let's retry the trap. */
> - if (pmap_exec_fixup(&p->p_vmspace->vm_map, frame,
> - &p->p_addr->u_pcb)) {
> - KERNEL_UNLOCK();
> + if (cpu_pae == 0 &&
> + pmap_exec_fixup(&p->p_vmspace->vm_map, frame,
> + &p->p_addr->u_pcb))
> goto out;
> - }
>
> sv.sival_int = frame->tf_eip;
> trapsignal(p, SIGSEGV, type &~ T_USER, SEGV_MAPERR, sv);
> - KERNEL_UNLOCK();
> goto out;
>
> case T_TSSFLT|T_USER:
>