Kenneth R Westerback wrote:
On Fri, Nov 04, 2005 at 03:22:33PM +0100, per engelbrecht wrote:

Kenneth R Westerback wrote:

On Fri, Nov 04, 2005 at 07:14:05AM +0100, per engelbrecht wrote:


K WESTERBACK wrote:



I'm interested.

.... Ken


Hi again Ken

If you find anything of value it would be nice to know.
(putting the box into production real soon)
Thank you.

/per
[EMAIL PROTECTED]






I hope to be able to investigate this weekend. I had a look at the
code and, well, it looked pretty weird. :-).

.... Ken


Hi Ken

When you say weird I get the same sensation as when my dentist says 'Oops' :-S

That would be just brilliant if you could. If not, fine too.
I just appreciate having you "on it".

The best
/per
[EMAIL PROTECTED]



The ahd timeout code is definitely and completely borked. Thanks
very much for finding a program that proved this.

Hi Ken
(damn, you move fast)

I think of it as more of a coincidence, but you're welcome :)


This diff puts ahd back to the primitive 'timeout == bus reset' that
most other drivers use. Now I can 'smartctl -a /dev/sd1c' many times
without crashing or hanging the machine.

Sounds like it's heading in the right direction.


In addition I suppress a lot of useless verbiage so that you can
actually read the program output.

Nice.


I'll be investigating further as to how much of this will be committed,
and trying to figure out why it's timing out in the first place, and
why the results are inconsistent. The inconsistency is that
sometimes commands fail, sometimes 'SMART Health Status: OK' is
displayed.

A few times I've also seen 'SMART Health Status: OK' randomly displayed among lots of dump output. Unable to "catch" it though.


Let me know if this helps you.

I sure will. Can't do it right now, but I'll give it a go around 1800 CEST and give you the result.
Thank you for your time so far Ken.

/per
[EMAIL PROTECTED]





.... Ken


Index: aic79xx.c
===================================================================
RCS file: /cvs/src/sys/dev/ic/aic79xx.c,v
retrieving revision 1.28
diff -u -p -r1.28 aic79xx.c
--- aic79xx.c   4 Oct 2005 23:52:04 -0000       1.28
+++ aic79xx.c   5 Nov 2005 19:12:57 -0000
@@ -253,9 +253,6 @@ u_int               ahd_resolve_seqaddr(struct ahd_so
 void           ahd_download_instr(struct ahd_softc *ahd,
                                           u_int instrptr, uint8_t *dconsts);
 int            ahd_probe_stack_size(struct ahd_softc *ahd);
-int            ahd_other_scb_timeout(struct ahd_softc *ahd,
-                                             struct scb *scb,
-                                             struct scb *other_scb);
 int            ahd_scb_active_in_fifo(struct ahd_softc *ahd,
                                               struct scb *scb);
 void           ahd_run_data_fifo(struct ahd_softc *ahd,
@@ -3124,7 +3121,7 @@ ahd_set_syncrate(struct ahd_softc *ahd, 
                ahd_send_async(ahd, devinfo->channel, devinfo->target,
                               CAM_LUN_WILDCARD, AC_TRANSFER_NEG, NULL);
 #endif
-               if (1 /*bootverbose*/) {
+               if (bootverbose) {
                        if (offset != 0) {
                                int options;
@@ -9148,305 +9145,41 @@ ahd_timeout(void *arg)
 {
        struct scb *scb = (struct scb *)arg;
        struct ahd_softc *ahd;
+       char channel;
+       long s;
+       int found;
+#ifdef AHD_DEBUG
+       int was_paused;
+#endif
        ahd = scb->ahd_softc;
-       if ((scb->flags & SCB_ACTIVE) != 0) {
-               if ((scb->flags & SCB_TIMEDOUT) == 0) {
-                       LIST_INSERT_HEAD(&ahd->timedout_scbs, scb,
-                                        timedout_links);
-                       scb->flags |= SCB_TIMEDOUT;
-               }
-               ahd_recover_commands(ahd);
-       }
-}
-
-/*
- * ahd_recover_commands determines if any of the commands that have currently
- * timedout are the root cause for this timeout.  Innocent commands are given
- * a new timeout while we wait for the command executing on the bus to timeout.
- * This routine is invoked from a thread context so we are allowed to sleep.
- * Our lock is not held on entry.
- */
-void
-ahd_recover_commands(struct ahd_softc *ahd)
-{
-       struct  scb *scb;
-       struct  scb *active_scb;
-       long    s;
-       int     found;
-       int     was_paused;
-       u_int   active_scbptr;
-       u_int   last_phase;
-
        ahd_lock(ahd, &s);
+#ifdef AHD_DEBUG
+       was_paused = ahd_is_paused(ahd);
+       printf("%s: SCB %d timed out - Card was %spaused\n", ahd_name(ahd),
+           SCB_GET_TAG(scb), was_paused ? "" : "not ");
+       ahd_dump_card_state(ahd);
+#endif
+
        /*
         * Pause the controller and manually flush any
         * commands that have just completed but that our
         * interrupt handler has yet to see.
         */
-       was_paused = ahd_is_paused(ahd);
-
-       printf("%s: Recovery Initiated - Card was %spaused\n", ahd_name(ahd),
-              was_paused ? "" : "not ");
-       ahd_dump_card_state(ahd);
        ahd_pause_and_flushwork(ahd);
+       aic_set_transaction_status(scb, CAM_CMD_TIMEOUT);
+       channel = SCB_GET_CHANNEL(ahd, scb);
+       found = ahd_reset_channel(ahd, channel, /*Initiate Reset*/TRUE);
-       if (LIST_EMPTY(&ahd->timedout_scbs) != 0) {
-               /*
-                * The timedout commands have already
-                * completed.  This typically means
-                * that either the timeout value was on
-                * the hairy edge of what the device
-                * requires or - more likely - interrupts
-                * are not happening.
-                */
-               printf("%s: Timedout SCBs already complete. "
-                      "Interrupts may not be functioning.\n", ahd_name(ahd));
-               ahd_unpause(ahd);
-               ahd_unlock(ahd, &s);
-               return;
-       }
-
-       /*
-        * Determine identity of SCB acting on the bus.
-        * This test only catches non-packetized transactions.
-        * Due to the fleeting nature of packetized operations,
-        * we can't easily determine that a packetized operation
-        * is on the bus.
-        */
-       ahd_set_modes(ahd, AHD_MODE_SCSI, AHD_MODE_SCSI);
-       last_phase = ahd_inb(ahd, LASTPHASE);
-       active_scbptr = ahd_get_scbptr(ahd);
-       active_scb = NULL;
-       if (last_phase != P_BUSFREE
-        || (ahd_inb(ahd, SEQ_FLAGS) & NOT_IDENTIFIED) == 0)
-               active_scb = ahd_lookup_scb(ahd, active_scbptr);
-
-       while ((scb = LIST_FIRST(&ahd->timedout_scbs)) != NULL) {
-               int     target;
-               int     lun;
-               char    channel;
-
-               target = SCB_GET_TARGET(ahd, scb);
-               channel = SCB_GET_CHANNEL(ahd, scb);
-               lun = SCB_GET_LUN(scb);
-
-               ahd_print_path(ahd, scb);
-               printf("SCB %d - timed out\n", SCB_GET_TAG(scb));
-
-               if (scb->flags & (SCB_DEVICE_RESET|SCB_ABORT)) {
-                       /*
-                        * Been down this road before.
-                        * Do a full bus reset.
-                        */
-                       aic_set_transaction_status(scb, CAM_CMD_TIMEOUT);
-bus_reset:
-                       found = ahd_reset_channel(ahd, channel,
-                                                 /*Initiate Reset*/TRUE);
-                       printf("%s: Issued Channel %c Bus Reset. "
-                              "%d SCBs aborted\n", ahd_name(ahd), channel,
-                              found);
-                       continue;
-               }
-
-               /*
-                * Remove the command from the timedout list in
-                * preparation for requeing it.
-                */
-               LIST_REMOVE(scb, timedout_links);
-               scb->flags &= ~SCB_TIMEDOUT;
-
-               if (active_scb != NULL) {
-
-                       if (active_scb != scb) {
-
-                               /*
-                                * If the active SCB is not us, assume that
-                                * the active SCB has a longer timeout than
-                                * the timedout SCB, and wait for the active
-                                * SCB to timeout.  As a safeguard, only
-                                * allow this deferral to continue if some
-                                * untimed-out command is outstanding.
-                                */
-                               if (ahd_other_scb_timeout(ahd, scb,
-                                                         active_scb) == 0)
-                                       goto bus_reset;
-                               continue;
-                       }
-
-                       /*
-                        * We're active on the bus, so assert ATN
-                        * and hope that the target responds.
-                        */
-                       ahd_set_recoveryscb(ahd, active_scb);
-                       active_scb->flags |= SCB_RECOVERY_SCB|SCB_DEVICE_RESET;
-                       ahd_outb(ahd, MSG_OUT, HOST_MSG);
-                       ahd_outb(ahd, SCSISIGO, last_phase|ATNO);
-                       ahd_print_path(ahd, active_scb);
-                       printf("BDR message in message buffer\n");
-                       aic_scb_timer_reset(scb, 2 * 1000);
-                       break;
-               } else if (last_phase != P_BUSFREE
-                       && ahd_inb(ahd, SCSIPHASE) == 0) {
-                       /*
-                        * SCB is not identified, there
-                        * is no pending REQ, and the sequencer
-                        * has not seen a busfree.  Looks like
-                        * a stuck connection waiting to
-                        * go busfree.  Reset the bus.
-                        */
-                       printf("%s: Connection stuck awaiting busfree or "
-                              "Identify Msg.\n", ahd_name(ahd));
-                       goto bus_reset;
-               } else if (ahd_search_qinfifo(ahd, target, channel, lun,
-                                             SCB_GET_TAG(scb), ROLE_INITIATOR,
-                                             /*status*/0, SEARCH_COUNT) > 0) {
-
-                       /*
-                        * We haven't even gone out on the bus
-                        * yet, so the timeout must be due to
-                        * some other command.  Reset the timer
-                        * and go on.
-                        */
-                       if (ahd_other_scb_timeout(ahd, scb, NULL) == 0)
-                               goto bus_reset;
-               } else {
-                       /*
-                        * This SCB is for a disconnected transaction
-                        * and we haven't found a better candidate on
-                        * the bus to explain this timeout.
-                        */
-                       ahd_set_recoveryscb(ahd, scb);
-
-                       /*
-                        * Actually re-queue this SCB in an attempt
-                        * to select the device before it reconnects.
-                        * In either case (selection or reselection),
-                        * we will now issue a target reset to the
-                        * timed-out device.
-                        */
-                       scb->flags |= SCB_DEVICE_RESET;
-                       scb->hscb->cdb_len = 0;
-                       scb->hscb->task_attribute = 0;
-                       scb->hscb->task_management = SIU_TASKMGMT_ABORT_TASK;
-
-                       ahd_set_scbptr(ahd, SCB_GET_TAG(scb));
-                       if ((scb->flags & SCB_PACKETIZED) != 0) {
-                               /*
-                                * Mark the SCB has having an outstanding
-                                * task management function.  Should the command
-                                * complete normally before the task management
-                                * function can be sent, the host will be
-                                * notified to abort our requeued SCB.
-                                */
-                               ahd_outb(ahd, SCB_TASK_MANAGEMENT,
-                                        scb->hscb->task_management);
-                       } else {
-                               /*
-                                * If non-packetized, set the MK_MESSAGE control
-                                * bit indicating that we desire to send a
-                                * message.  We also set the disconnected flag
-                                * since there is no guarantee that our SCB
-                                * control byte matches the version on the
-                                * card.  We don't want the sequencer to abort
-                                * the command thinking an unsolicited
-                                * reselection occurred.
-                                */
-                               scb->hscb->control |= MK_MESSAGE|DISCONNECTED;
-
-                               /*
-                                * The sequencer will never re-reference the
-                                * in-core SCB.  To make sure we are notified
-                                * during reslection, set the MK_MESSAGE flag in
-                                * the card's copy of the SCB.
-                                */
-                               ahd_outb(ahd, SCB_CONTROL,
-                                        ahd_inb(ahd, SCB_CONTROL)|MK_MESSAGE);
-                       }
-
-                       /*
-                        * Clear out any entries in the QINFIFO first
-                        * so we are the next SCB for this target
-                        * to run.
-                        */
-                       ahd_search_qinfifo(ahd, target, channel, lun,
-                                          SCB_LIST_NULL, ROLE_INITIATOR,
-                                          CAM_REQUEUE_REQ, SEARCH_COMPLETE);
-                       ahd_qinfifo_requeue_tail(ahd, scb);
-                       ahd_set_scbptr(ahd, active_scbptr);
-                       ahd_print_path(ahd, scb);
-                       printf("Queuing a BDR SCB\n");
-                       aic_scb_timer_reset(scb, 2 * 1000);
-                       break;
-               }
-       }
-       
-       /*
-        * Any remaining SCBs were not the "culprit", so remove
-        * them from the timeout list.  The timer for these commands
-        * will be reset once the recovery SCB completes.
-        */
-       while ((scb = LIST_FIRST(&ahd->timedout_scbs)) != NULL) {
-
-               LIST_REMOVE(scb, timedout_links);
-               scb->flags &= ~SCB_TIMEDOUT;
-       }
+#ifdef AHD_DEBUG
+       printf("%s: Issued Channel %c Bus Reset. %d SCBs aborted\n",
+           ahd_name(ahd), channel, found);
+#endif
        ahd_unpause(ahd);
        ahd_unlock(ahd, &s);
-}
-
-/*
- * Re-schedule a timeout for the passed in SCB if we determine that some
- * other SCB is in the process of recovery or an SCB with a longer
- * timeout is still pending.  Limit our search to just "other_scb"
- * if it is non-NULL.
- */
-int
-ahd_other_scb_timeout(struct ahd_softc *ahd, struct scb *scb,
-                     struct scb *other_scb)
-{
-       u_int   newtimeout;
-       int     found;
-
-       ahd_print_path(ahd, scb);
-       printf("Other SCB Timeout%s",
-              (scb->flags & SCB_OTHERTCL_TIMEOUT) != 0
-              ? " again\n" : "\n");
-
-       newtimeout = aic_get_timeout(scb);
-       scb->flags |= SCB_OTHERTCL_TIMEOUT;
-       found = 0;
-       if (other_scb != NULL) {
-               if ((other_scb->flags
-                  & (SCB_OTHERTCL_TIMEOUT|SCB_TIMEDOUT)) == 0
-                || (other_scb->flags & SCB_RECOVERY_SCB) != 0) {
-                       found++;
-                       newtimeout = MAX(aic_get_timeout(other_scb),
-                                        newtimeout);
-               }
-       } else {
-               LIST_FOREACH(other_scb, &ahd->pending_scbs, pending_links) {
-                       if ((other_scb->flags
-                          & (SCB_OTHERTCL_TIMEOUT|SCB_TIMEDOUT)) == 0
-                        || (other_scb->flags & SCB_RECOVERY_SCB) != 0) {
-                               found++;
-                               newtimeout = MAX(aic_get_timeout(other_scb),
-                                                newtimeout);
-                       }
-               }
-       }
-
-       if (found != 0)
-               aic_scb_timer_reset(scb, newtimeout);
-       else {
-               ahd_print_path(ahd, scb);
-               printf("No other SCB worth waiting for...\n");
-       }
-
-       return (found != 0);
 }
 /**************************** Flexport Logic **********************************/
Index: aic79xx.h
===================================================================
RCS file: /cvs/src/sys/dev/ic/aic79xx.h,v
retrieving revision 1.18
diff -u -p -r1.18 aic79xx.h
--- aic79xx.h   30 Dec 2004 17:29:55 -0000      1.18
+++ aic79xx.h   5 Nov 2005 19:12:58 -0000
@@ -1484,7 +1484,6 @@ void                      ahd_handle_scsi_status(struct ahd
 void                   ahd_calc_residual(struct ahd_softc *ahd,
                                          struct scb *scb);
 void                   ahd_timeout(void *);
-void                   ahd_recover_commands(struct ahd_softc *ahd);
 /*************************** Utility Functions ********************************/
 struct ahd_phase_table_entry*
                        ahd_lookup_phase_entry(int phase);

Reply via email to