Hi, Ludovic Courtès <l...@gnu.org> skribis:
> What if ‘scm_i_vm_mark_stack’ walks the stack right before the ‘vp->fp’ > assignment? It can determine that one of the just-assigned ‘sp[nargs]’ > is a dead slot, and thus set it to SCM_UNSPECIFIED. Later, when we set > ‘vp->fp’, that stack slot that we just initialized has been overwritten > by ‘scm_i_vm_mark_stack’. Down the road, we get something like: > > Wrong type to apply: #<unspecified> I believe the patch below solves this problem. Also attached is a small program that reproduces the bug (without the patch) after a couple of minutes. Thoughts? The full grafting test case exposes another VM stack-corruption-looking crash that I’m still investigating. Ludo’.
diff --git a/libguile/vm.c b/libguile/vm.c index b20c6eb5f..5cc934e99 100644 --- a/libguile/vm.c +++ b/libguile/vm.c @@ -1351,7 +1351,8 @@ scm_i_vm_emergency_abort (SCM *tag_and_argv, size_t n) scm_t_bits *prompt; scm_t_dynstack_prompt_flags flags; ptrdiff_t fp_offset, sp_offset; - union scm_vm_stack_element *fp, *sp; + union scm_vm_stack_element *fp; + volatile union scm_vm_stack_element *sp; SCM *argv; uint32_t *vra; uint8_t *mra; @@ -1392,16 +1393,19 @@ scm_i_vm_emergency_abort (SCM *tag_and_argv, size_t n) to be jumping to an older part of the stack. */ if (sp < vp->sp) abort (); - sp[nargs].as_scm = cont; - - while (nargs--) - sp[nargs].as_scm = *argv++; /* Restore VM regs */ vp->fp = fp; - vp->sp = sp; + vp->sp = (union scm_vm_stack_element *) sp; vp->ip = vra; + /* Restore the arguments on SP. This must be done after 'vp->fp' has + been set so that a concurrent 'scm_i_vm_mark_stack' does not + overwrite it (see <https://bugs.gnu.org/28211>). */ + sp[nargs].as_scm = cont; + while (nargs--) + sp[nargs].as_scm = *argv++; + /* Jump! */ vp->mra_after_abort = mra; longjmp (*registers, 1); @@ -1417,7 +1421,8 @@ abort_to_prompt (scm_thread *thread, uint8_t *saved_mra) scm_t_bits *prompt; scm_t_dynstack_prompt_flags flags; ptrdiff_t fp_offset, sp_offset; - union scm_vm_stack_element *fp, *sp; + union scm_vm_stack_element *fp, *orig_sp; + volatile union scm_vm_stack_element *sp; uint32_t *vra; uint8_t *mra; jmp_buf *registers; @@ -1452,6 +1457,7 @@ abort_to_prompt (scm_thread *thread, uint8_t *saved_mra) /* Recompute FP, as scm_dynstack_unwind may have expanded the stack. */ fp = vp->stack_top - fp_offset; sp = vp->stack_top - sp_offset; + orig_sp = vp->sp; /* Continuation gets nargs+1 values: the one more is for the cont. */ sp = sp - nargs - 1; @@ -1460,15 +1466,19 @@ abort_to_prompt (scm_thread *thread, uint8_t *saved_mra) to be jumping to an older part of the stack. */ if (sp < vp->sp) abort (); - sp[nargs].as_scm = cont; - while (nargs--) - sp[nargs] = vp->sp[nargs]; /* Restore VM regs */ vp->fp = fp; - vp->sp = sp; + vp->sp = (union scm_vm_stack_element *) sp; vp->ip = vra; + /* Restore the arguments on SP. This must be done after 'vp->fp' has + been set so that a concurrent 'scm_i_vm_mark_stack' does not + overwrite it (see <https://bugs.gnu.org/28211>). */ + sp[nargs].as_scm = cont; + while (nargs--) + sp[nargs] = orig_sp[nargs]; + /* If there are intervening C frames, then jump over them, making a nonlocal exit. Otherwise fall through and let the VM pick up where it left off. */
;; https://issues.guix.gnu.org/issue/28211 (use-modules (ice-9 threads)) (define (thunk) (catch 'foo (lambda () (throw 'foo (iota 10))) (lambda (key lst) (unless (equal? lst (iota 10)) (primitive-_exit 42))))) (n-par-for-each (* 2 (current-processor-count)) (lambda _ (let loop () (thunk) (loop))) (iota 1000))