Andrew Gallatin writes:
>
> I have an older AlphaStation 600 5/266 running -current (cvsupped
> last week) which is setup as a router between 2 100mb networks. When
> the machine is pushed fairly hard (like running a netperf -tUDP_STREAM
> -- -m 100 across the router, eg about 10-20k 100byte packets/sec ) the
> alpha falls over almost instantly. I have not enabled any NAT or
> firewall functionality, just ip forwarding.
<...>
>
> This might be a red herring, but I've found that if I run the entire
> ip_input path under splnet() (added splnet() around the call to
> ip_input() in ipintr().), things get a hell of a lot more stable.
> Rather than crashing in a few seconds, it sometimes takes minutes.
> And rather than an illegal access, I tend to run out of kernel stack
> space ( either a panic("possible stack overflow\n"); in
> alpha/alpha/interrupt.c, or I end up in the SRM console after calling
> halt from a PC which isn't in the kernel, which smells like an overrun
> stack to me). I'm not sure if this is related, or if it is a separate
> problem entirely.
That was it.
The problem is that the interrupt handler returns through
exception_return, like the trap handler does. Exception_return checks
to see if the last ipl the system was at was 0. If it was, it
eventually lowers the ipl to zero and checks for a pending ast. This
was the problem. If you're getting interrupts quickly enough, there's
a large window when you're still running on the interrupt stack where
you're sitting at ipl0 and you can get another interrupt & build onto
that stack. If you're getting 40,000 interrupts per second
(forwarding 20,000 packets/sec), this can build up & rapidly run you
out of stack space.
I've found the system can forward 70,000 packets per second & remain
perfectly stable with the appended patch. I'm not terribly good at
assembler, so rather than try to be tricky & check to see if the
current ipl is >= 4 (handling a device interrupt), I simply copied
exception_return & skipped the ipl lowering & the check for an ast
since I don't think you're ever going to need to check for an ast
after an interrupt.
I have NFC why mclfree was getting trashed, but it must have been
caused by running out of stack space as the appended patch seems to
take care of everything.
Doug -- should I commit this as-is, or do you want to take a more
refined approach?
Drew
------------------------------------------------------------------------------
Andrew Gallatin, Sr Systems Programmer http://www.cs.duke.edu/~gallatin
Duke University Email: [EMAIL PROTECTED]
Department of Computer Science Phone: (919) 660-6590
Index: exception.s
===================================================================
RCS file: /home/ncvs/src/sys/alpha/alpha/exception.s,v
retrieving revision 1.3
diff -u -r1.3 exception.s
--- exception.s 1999/08/28 00:38:26 1.3
+++ exception.s 1999/10/28 19:17:26
@@ -76,7 +76,7 @@
/* a0, a1, & a2 already set up */
mov sp, a3 ; .loc 1 __LINE__
CALL(interrupt)
- jmp zero, exception_return
+ jmp zero, interrupt_return
END(XentInt)
/**************************************************************************/
Index: swtch.s
===================================================================
RCS file: /home/ncvs/src/sys/alpha/alpha/swtch.s,v
retrieving revision 1.11
diff -u -r1.11 swtch.s
--- swtch.s 1999/08/28 00:38:32 1.11
+++ swtch.s 1999/10/28 20:08:24
@@ -308,6 +308,61 @@
.set at
END(exception_return)
+/* interrupt_return: return path used by XentInt; modelled on exception_return, */
+/* but without the explicit drop to IPL 0 and without the pending-AST check. */
+LEAF(interrupt_return, 1) /* XXX should be NESTED */
+ br pv, Lintr_er1
+Lintr_er1: LDGP(pv)
+
+ ldq s1, (FRAME_PS * 8)(sp) /* s1 = PS saved in the trap frame */
+ and s1, ALPHA_PSL_IPL_MASK, t0 /* t0 = IPL bits of the saved PS */
+ bne t0, Lintr_restoreregs /* != 0: can't do SIR; this path never does an AST */
+
+ /* saved IPL was 0: see if we can do an SIR */
+ ldl t1, ipending /* SIR pending? */
+ beq t1, Lintr_chkast /* no: skip do_sir */
+
+ /* We've got a SIR. */
+ CALL(do_sir) /* do the SIR; lowers IPL */
+
+Lintr_chkast:
+ /* NB: despite the label name, no IPL drop and no AST check happen here */
+ and s1, ALPHA_PSL_USERMODE, t0 /* are we returning to user? */
+ beq t0, Lintr_restoreregs /* no: skip the FP-enable setup */
+
+Lintr_setfpenable:
+ /* enable FPU based on whether the current proc is fpcurproc */
+ ldq t0, curproc /* t0 = curproc */
+ ldq t1, fpcurproc /* t1 = fpcurproc */
+ cmpeq t0, t1, t0 /* t0 = (curproc == fpcurproc) */
+ mov zero, a0 /* a0 = 0 by default */
+ cmovne t0, 1, a0 /* a0 = 1 when curproc == fpcurproc */
+ call_pal PAL_OSF1_wrfen /* wrfen PAL call with a0 as its argument */
+
+Lintr_restoreregs:
+ /* set the hae register if this process has specified a value */
+ ldq t0, curproc /* t0 = curproc */
+ beq t0, Lintr_nohae /* curproc is 0: nothing to do */
+ ldq t1, P_MD_FLAGS(t0) /* t1 = flags word at P_MD_FLAGS(curproc) */
+ and t1, MDP_HAEUSED /* isolate the MDP_HAEUSED bit */
+ beq t1, Lintr_nohae /* bit clear: skip the HAE write */
+ ldq a0, P_MD_HAE(t0) /* a0 = value at P_MD_HAE(curproc) */
+ ldq pv, chipset + CHIPSET_WRITE_HAE /* pv = chipset's write-HAE function pointer */
+ CALL((pv)) /* call it with a0 as the argument */
+Lintr_nohae:
+
+ /* restore the registers, and return */
+ bsr ra, exception_restore_regs /* jmp/CALL trashes pv/t12 */
+ ldq ra,(FRAME_RA*8)(sp) /* ra was used by the bsr above; reload it from the frame */
+ .set noat /* at_reg is loaded explicitly on the next line */
+ ldq at_reg,(FRAME_AT*8)(sp)
+
+ lda sp,(FRAME_SW_SIZE*8)(sp) /* sp += FRAME_SW_SIZE*8 */
+ call_pal PAL_OSF1_rti /* rti PAL call; NOTE(review): presumably never returns here */
+ .set at
+ END(interrupt_return)
+
+
LEAF(exception_save_regs, 0)
stq v0,(FRAME_V0*8)(sp)
stq a3,(FRAME_A3*8)(sp)
To Unsubscribe: send mail to [EMAIL PROTECTED]
with "unsubscribe freebsd-hackers" in the body of the message