I have six 500 GB SATA drives in a RAID 5 array. When I pulled one to switch from a 32-bit machine to a 64-bit one, it went bad -- no problem, right? (;-)>
Anyways, to do a "raidctl -R /dev/wd5d raid1", I got a crash in raidframe malloc call. Googling got me patch from June 2008 (?) in NetBSD. Of course, OpenBSD has nicely cleaned up the source [and fixed things!] -- making diff's interesting. Anyways, I have it rebuilding now, and hope I didn't blow the patch (attached, but it needs much more cleanup!). Index: rf_reconmap.c =================================================================== RCS file: /cvs/src/sys/dev/raidframe/rf_reconmap.c,v retrieving revision 1.4 diff -U5 -r1.4 rf_reconmap.c --- rf_reconmap.c 16 Dec 2002 07:01:05 -0000 1.4 +++ rf_reconmap.c 6 Feb 2009 03:40:55 -0000 @@ -53,17 +53,16 @@ /* Used to mark the end of the list. */ #define RU_NIL ((RF_ReconMapListElem_t *) 0) -void rf_compact_stat_entry(RF_Raid_t *, RF_ReconMap_t *, int); +void rf_compact_stat_entry(RF_Raid_t *, RF_ReconMap_t *, int, int); void rf_crunch_list(RF_ReconMap_t *, RF_ReconMapListElem_t *); RF_ReconMapListElem_t * rf_MakeReconMapListElem(RF_SectorNum_t, RF_SectorNum_t, RF_ReconMapListElem_t *); void rf_FreeReconMapListElem(RF_ReconMap_t *, RF_ReconMapListElem_t *); void rf_update_size(RF_ReconMap_t *, int); -void rf_PrintList(RF_ReconMapListElem_t *); /***************************************************************************** * * Creates and initializes new Reconstruction map. 
@@ -95,16 +94,20 @@ p->sectorsInDisk = disk_sectors; p->totalRUs = num_rus; p->spareRUs = spareUnitsPerDisk; p->unitsLeft = num_rus - spareUnitsPerDisk; + p->low_ru = 0; + p->status_size = RF_RECONMAP_SIZE; + p->high_ru = p->status_size - 1; + p->head = 0; - RF_Malloc(p->status, num_rus * sizeof(RF_ReconMapListElem_t *), + RF_Malloc(p->status, p->status_size * sizeof(RF_ReconMapListElem_t *), (RF_ReconMapListElem_t **)); RF_ASSERT(p->status != (RF_ReconMapListElem_t **) NULL); - (void) bzero((char *) p->status, num_rus * + (void) bzero((char *) p->status, p->status_size * sizeof(RF_ReconMapListElem_t *)); p->size = sizeof(RF_ReconMap_t) + num_rus * sizeof(RF_ReconMapListElem_t *); p->maxSize = p->size; @@ -139,27 +142,68 @@ void rf_ReconMapUpdate(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector, RF_SectorNum_t stopSector) { RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit; - RF_SectorNum_t i, first_in_RU, last_in_RU; + RF_SectorNum_t i, first_in_RU, last_in_RU, ru; RF_ReconMapListElem_t *p, *pt; RF_LOCK_MUTEX(mapPtr->mutex); RF_ASSERT(startSector >= 0 && stopSector < mapPtr->sectorsInDisk && stopSector >= startSector); while (startSector <= stopSector) { i = startSector / mapPtr->sectorsPerReconUnit; first_in_RU = i * sectorsPerReconUnit; last_in_RU = first_in_RU + sectorsPerReconUnit - 1; - p = mapPtr->status[i]; +// p = mapPtr->status[i]; + + /* do we need to move the queue? 
*/ + while (i > mapPtr->high_ru) { +#ifdef DIAGNOSTIC + if (mapPtr->status[mapPtr->head]!=RU_ALL) { + printf("\nraid%d: reconmap incorrect -- working on i %llu\n", + raidPtr->raidid, i); + printf("raid%d: ru %llu not completed!!!\n", + raidPtr->raidid, mapPtr->head); + + printf("raid%d: low: %llu high: %llu\n", + raidPtr->raidid, mapPtr->low_ru, mapPtr->high_ru); + + panic("reconmap incorrect"); + } +#endif + mapPtr->low_ru++; + mapPtr->high_ru++; + /* initialize "highest" RU status entry, which + will take over the current head postion */ + mapPtr->status[mapPtr->head]=RU_NOTHING; + + /* move head too */ + mapPtr->head++; + if (mapPtr->head >= mapPtr->status_size) + { + mapPtr->head = 0; + } + } + + ru = i - mapPtr->low_ru + mapPtr->head; + if (ru >= mapPtr->status_size) + ru = ru - mapPtr->status_size; + + if ((ru < 0) || (ru >= mapPtr->status_size)) { + printf("raid%d: ru is bogus %llu %llu %llu %llu %llu\n", + raidPtr->raidid, i, ru, mapPtr->head, mapPtr->low_ru, mapPtr->high_ru); + panic("bogus ru in reconmap"); + } + + p = mapPtr->status[ru]; if (p != RU_ALL) { if (p == RU_NOTHING || p->startSector > startSector) { /* Insert at front of list. */ - mapPtr->status[i] = + mapPtr->status[ru] = rf_MakeReconMapListElem(startSector, RF_MIN(stopSector, last_in_RU), (p == RU_NOTHING) ? NULL : p); rf_update_size(mapPtr, sizeof(RF_ReconMapListElem_t)); @@ -172,11 +216,11 @@ pt->next = rf_MakeReconMapListElem(startSector, RF_MIN(stopSector, last_in_RU), p); rf_update_size(mapPtr, sizeof(RF_ReconMapListElem_t)); } - rf_compact_stat_entry(raidPtr, mapPtr, i); + rf_compact_stat_entry(raidPtr, mapPtr, i, ru); } startSector = RF_MIN(stopSector, last_in_RU) + 1; } RF_UNLOCK_MUTEX(mapPtr->mutex); } @@ -197,21 +241,21 @@ * code, but necessary when called from the user-write code. 
* *****************************************************************************/ void -rf_compact_stat_entry(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, int i) +rf_compact_stat_entry(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, int i, int j) { RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit; - RF_ReconMapListElem_t *p = mapPtr->status[i]; + RF_ReconMapListElem_t *p = mapPtr->status[j]; rf_crunch_list(mapPtr, p); if ((p->startSector == i * sectorsPerReconUnit) && (p->stopSector == i * sectorsPerReconUnit + sectorsPerReconUnit - 1)) { - mapPtr->status[i] = RU_ALL; + mapPtr->status[j] = RU_ALL; mapPtr->unitsLeft--; rf_FreeReconMapListElem(mapPtr, p); } } @@ -295,11 +339,11 @@ numRUs = mapPtr->sectorsInDisk / mapPtr->sectorsPerReconUnit; if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit) numRUs++; - for (i = 0; i < numRUs; i++) { + for (i = 0; i < mapPtr->status_size; i++) { p = mapPtr->status[i]; while (p != RU_NOTHING && p != RU_ALL) { q = p; p = p->next; RF_Free(q, sizeof(*q)); @@ -319,16 +363,30 @@ *****************************************************************************/ int rf_CheckRUReconstructed(RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector) { - RF_ReconMapListElem_t *l; /* Used for searching. */ RF_ReconUnitNum_t i; + int rv; i = startSector / mapPtr->sectorsPerReconUnit; - l = mapPtr->status[i]; - return ((l == RU_ALL) ? 1 : 0); +// l = mapPtr->status[i]; +// return ((l == RU_ALL) ? 
1 : 0); + if (i < mapPtr->low_ru) + rv = 1; + else if (i > mapPtr->high_ru) + rv = 0; + else { + i = i - mapPtr->low_ru + mapPtr->head; + if (i >= mapPtr->status_size) + i = i - mapPtr->status_size; + if (mapPtr->status[i] == RU_ALL) + rv = 1; + else + rv = 0; + } + return rv; } RF_ReconUnitCount_t rf_UnitsLeftToReconstruct(RF_ReconMap_t *mapPtr) { @@ -340,47 +398,10 @@ void rf_update_size(RF_ReconMap_t *mapPtr, int size) { mapPtr->size += size; mapPtr->maxSize = RF_MAX(mapPtr->size, mapPtr->maxSize); -} - -void -rf_PrintList(RF_ReconMapListElem_t *listPtr) -{ - while (listPtr) { - printf("%d,%d -> ", (int) listPtr->startSector, - (int) listPtr->stopSector); - listPtr = listPtr->next; - } - printf("\n"); -} - -void -rf_PrintReconMap(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, RF_RowCol_t frow, - RF_RowCol_t fcol) -{ - RF_ReconUnitCount_t numRUs; - RF_ReconMapListElem_t *p; - RF_ReconUnitNum_t i; - - numRUs = mapPtr->totalRUs; - if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit) - numRUs++; - - for (i = 0; i < numRUs; i++) { - p = mapPtr->status[i]; - if (p == RU_ALL) - /* printf("[%d] ALL.\n", i) */; - else - if (p == RU_NOTHING) { - printf("%d: Unreconstructed.\n", i); - } else { - printf("%d: ", i); - rf_PrintList(p); - } - } } void rf_PrintReconSchedule(RF_ReconMap_t *mapPtr, struct timeval *starttime) { Index: rf_reconmap.h =================================================================== RCS file: /cvs/src/sys/dev/raidframe/rf_reconmap.h,v retrieving revision 1.3 diff -U5 -r1.3 rf_reconmap.h --- rf_reconmap.h 16 Dec 2002 07:01:05 -0000 1.3 +++ rf_reconmap.h 6 Feb 2009 03:40:55 -0000 @@ -38,10 +38,13 @@ #define _RF__RF_RECONMAP_H_ #include "rf_types.h" #include "rf_threadstuff.h" +/* the number of recon units in the status table. */ +#define RF_RECONMAP_SIZE 32 + /* * Main reconstruction status descriptor; size and maxsize are used for * monitoring only: they have no function for reconstruction. 
*/ struct RF_ReconMap_s { @@ -55,10 +58,17 @@ RF_ReconUnitCount_t totalRUs; /* Total recon units on disk. */ RF_ReconUnitCount_t spareRUs; /* * Total number of spare RUs on * failed disk. */ + RF_ReconUnitCount_t low_ru; /* lowest reconstruction unit number in + *the status array */ + RF_ReconUnitCount_t high_ru; /* highest reconstruction unit number + * in the status array */ + RF_ReconUnitCount_t head; /* the position in the array where + * low_ru is found */ + RF_ReconUnitCount_t status_size; /* number of recon units in status */ RF_StripeCount_t totalParityStripes; /* * Total number of parity * stripes in array. */ Index: rf_reconstruct.c =================================================================== RCS file: /cvs/src/sys/dev/raidframe/rf_reconstruct.c,v retrieving revision 1.16 diff -U5 -r1.16 rf_reconstruct.c --- rf_reconstruct.c 5 Jun 2007 00:38:22 -0000 1.16 +++ rf_reconstruct.c 6 Feb 2009 03:40:55 -0000 @@ -162,11 +162,18 @@ (void *)((unsigned long)a), \ (void *)((unsigned long)b), \ NULL, NULL, NULL, NULL, NULL, NULL); \ } while (0) +#define RF_RECON_DONE_READS 1 +#define RF_RECON_READ_ERROR 2 +#define RF_RECON_WRITE_ERROR 3 +#define RF_RECON_READ_STOPPED 4 +#define RF_RECON_WRITE_DONE 5 + static RF_FreeList_t *rf_recond_freelist; + #define RF_MAX_FREE_RECOND 4 #define RF_RECOND_INC 1 RF_RaidReconDesc_t *rf_AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t, RF_RaidDisk_t *, int, @@ -714,13 +721,17 @@ RF_RowCol_t srow = reconDesc->srow; RF_RowCol_t scol = reconDesc->scol; RF_ReconMap_t *mapPtr; RF_ReconEvent_t *event; + RF_StripeCount_t incPSID,lastPSID,num_writes,pending_writes,prev; + RF_ReconUnitCount_t RUsPerPU; struct timeval etime, elpsd; unsigned long xor_s, xor_resid_us; int retcode, i, ds; + int status, done; + int recon_error, write_error; switch (reconDesc->state) { case 0: raidPtr->accumXorTimeUs = 0; @@ -760,68 +771,174 @@ RF_UNLOCK_MUTEX(raidPtr->mutex); RF_GETTIME(raidPtr->reconControl[row]->starttime); - /* - * Now start up the 
actual reconstruction: issue a read for - * each surviving disk. - */ - - reconDesc->numDisksDone = 0; - for (i = 0; i < raidPtr->numCol; i++) { - if (i != col) { - /* - * Find and issue the next I/O on the - * indicated disk. - */ - if (rf_IssueNextReadRequest(raidPtr, row, i)) { - Dprintf2("RECON: done issuing for r%d" - " c%d.\n", row, i); - reconDesc->numDisksDone++; - } - } - } - +// /* +// * Now start up the actual reconstruction: issue a read for +// * each surviving disk. +// */ +// +// reconDesc->numDisksDone = 0; +// for (i = 0; i < raidPtr->numCol; i++) { +// if (i != col) { +// /* +// * Find and issue the next I/O on the +// * indicated disk. +// */ +// if (rf_IssueNextReadRequest(raidPtr, row, i)) { +// Dprintf2("RECON: done issuing for r%d" +// " c%d.\n", row, i); +// reconDesc->numDisksDone++; +// } +// } +// } +// reconDesc->state = 2; case 2: Dprintf("RECON: resume requests.\n"); rf_ResumeNewRequests(raidPtr); reconDesc->state = 3; case 3: - /* - * Process reconstruction events until all disks report that - * they've completed all work. - */ +// /* +// * Process reconstruction events until all disks report that +// * they've completed all work. 
+// */ mapPtr = raidPtr->reconControl[row]->reconMap; - while (reconDesc->numDisksDone < raidPtr->numCol - 1) { +// while (reconDesc->numDisksDone < raidPtr->numCol - 1) { + + incPSID = RF_RECONMAP_SIZE; + lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU; + RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU; + recon_error = 0; + write_error = 0; + pending_writes = incPSID; + raidPtr->reconControl[row]->lastPSID = incPSID; + done = 0; + while (!done) { + num_writes = 0; + /* issue a read for each surviving disk */ + reconDesc->numDisksDone = 0; + for (i = 0; i < raidPtr->numCol; i++) { + if (i != col) { + /* find and issue the next I/O on the + * indicated disk */ + if (rf_IssueNextReadRequest(raidPtr, row, i)) { + Dprintf2("RECON: done issuing for r%d\n" + " c%d.\n", row, i); + reconDesc->numDisksDone++; + } + } + } - event = rf_GetNextReconEvent(reconDesc, row, - (void (*) (void *)) rf_ContinueReconstructFailedDisk, - reconDesc); - RF_ASSERT(event); + /* process reconstruction events until all disks report that + * they've completed all work */ - if (rf_ProcessReconEvent(raidPtr, row, event)) - reconDesc->numDisksDone++; - raidPtr->reconControl[row]->numRUsTotal = - mapPtr->totalRUs; - raidPtr->reconControl[row]->numRUsComplete = - mapPtr->totalRUs - - rf_UnitsLeftToReconstruct(mapPtr); + while (reconDesc->numDisksDone < raidPtr->numCol - 1) { + + event = rf_GetNextReconEvent(reconDesc, row, + (void (*) (void *)) rf_ContinueReconstructFailedDisk, + reconDesc); + status = rf_ProcessReconEvent(raidPtr, row, event); + + /* the normal case is that a read completes, and all is well. */ + if (status == RF_RECON_DONE_READS) { + reconDesc->numDisksDone++; + } else if ((status == RF_RECON_READ_ERROR) || + (status == RF_RECON_WRITE_ERROR)) { + /* an error was encountered while reconstructing... + Pretend we've finished this disk. 
+ */ + recon_error = 1; + raidPtr->reconControl[row]->error = 1; + + /* bump the numDisksDone count for reads, + but not for writes */ + if (status == RF_RECON_READ_ERROR) + reconDesc->numDisksDone++; + + /* write errors are special -- when we are + done dealing with the reads that are + finished, we don't want to wait for any + writes */ + if (status == RF_RECON_WRITE_ERROR) + write_error = 1; + + } else if (status == RF_RECON_READ_STOPPED) { + /* count this component as being "done" */ + reconDesc->numDisksDone++; + } else if (status == RF_RECON_WRITE_DONE) { + num_writes++; + } + + if (recon_error) { + /* make sure any stragglers are woken up so that + their theads will complete, and we can get out + of here with all IO processed */ - raidPtr->reconControl[row]->percentComplete = - (raidPtr->reconControl[row]->numRUsComplete * 100 / - raidPtr->reconControl[row]->numRUsTotal); - if (rf_prReconSched) { - rf_PrintReconSchedule( - raidPtr->reconControl[row]->reconMap, - &(raidPtr->reconControl[row]->starttime)); + rf_WakeupHeadSepCBWaiters(raidPtr, row); + } + + raidPtr->reconControl[row]->numRUsTotal = + mapPtr->totalRUs; + raidPtr->reconControl[row]->numRUsComplete = + mapPtr->totalRUs - + rf_UnitsLeftToReconstruct(mapPtr); + +#if RF_DEBUG_RECON + raidPtr->reconControl[row]->percentComplete = + (raidPtr->reconControl[row]->numRUsComplete * 100 / raidPtr->reconControl[row]->numRUsTot + al); + if (rf_prReconSched) { + rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime)); + } + #endif + } + + /* reads done, wakup any waiters, and then wait for writes */ + + rf_WakeupHeadSepCBWaiters(raidPtr, row); + + while (!recon_error && (num_writes < pending_writes)) { + event = rf_GetNextReconEvent(reconDesc, row, + (void (*) (void *)) rf_ContinueReconstructFailedDisk, + reconDesc); + status = rf_ProcessReconEvent(raidPtr, row, event); + + if (status == RF_RECON_WRITE_ERROR) { + recon_error = 1; + raidPtr->reconControl[row]->error 
= 1; + /* an error was encountered at the very end... bail */ + } else if (status == RF_RECON_WRITE_DONE) { + num_writes++; + } + } + if (recon_error || + (raidPtr->reconControl[row]->lastPSID == lastPSID)) { + done = 1; + break; + } + + prev = raidPtr->reconControl[row]->lastPSID; + raidPtr->reconControl[row]->lastPSID += incPSID; + + if (raidPtr->reconControl[row]->lastPSID > lastPSID) { + pending_writes = lastPSID - prev; + raidPtr->reconControl[row]->lastPSID = lastPSID; + } + + /* back down curPSID to get ready for the next round... */ + for (i = 0; i < raidPtr->numCol; i++) { + if (i != col) { + raidPtr->reconControl[row]->perDiskInfo[i].curPSID--; + raidPtr->reconControl[row]->perDiskInfo[i].ru_count = RUsPerPU - 1; + } } } reconDesc->state = 4; @@ -833,11 +950,11 @@ /* * At this point all the reads have completed. We now wait * for any pending writes to complete, and then we're done. */ - while (rf_UnitsLeftToReconstruct( + while (!recon_error && rf_UnitsLeftToReconstruct( raidPtr->reconControl[row]->reconMap) > 0) { event = rf_GetNextReconEvent(reconDesc, row, (void (*) (void *)) rf_ContinueReconstructFailedDisk, reconDesc); @@ -992,10 +1109,11 @@ } else if (rbuf->type == RF_RBUF_TYPE_FORCED) rf_FreeReconBuffer(rbuf); else RF_ASSERT(0); + retcode = RF_RECON_WRITE_DONE; break; /* A buffer-stall condition has been cleared. 
*/ case RF_REVENT_BUFCLEAR: Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d.\n", frow, @@ -1267,10 +1385,38 @@ pssPtr->issued[col] = 1; out: RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); return (0); +} + + +void +rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr, RF_RowCol_t row) +{ + RF_CallbackDesc_t *p; + + RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); +// while(raidPtr->reconControl[row]->rb_lock) { +// ltsleep(&raidPtr->reconControl[row]->rb_lock, PRIBIO, +// "rf_wakeuphscbw", 0, &raidPtr->reconControl[row]->rb_mutex); +// } + + raidPtr->reconControl[row]->rb_lock = 1; + RF_UNLOCK_MUTEX(raidPtr->reconControl[row]->rb_mutex); + + while (raidPtr->reconControl[row]->headSepCBList) { + p = raidPtr->reconControl[row]->headSepCBList; + raidPtr->reconControl[row]->headSepCBList = p->next; + p->next = NULL; + rf_CauseReconEvent(raidPtr, row, p->col, NULL, RF_REVENT_HEADSEPCLEAR); + rf_FreeCallbackDesc(p); + } + RF_LOCK_MUTEX(raidPtr->reconControl[row]->rb_mutex); + raidPtr->reconControl[row]->rb_lock = 0; + wakeup(&raidPtr->reconControl[row]->rb_lock); + RF_UNLOCK_MUTEX(raidPtr->reconControl[row]->rb_mutex); } /* * Given a parity stripe ID, we want to find out whether both the Index: rf_reconstruct.h =================================================================== RCS file: /cvs/src/sys/dev/raidframe/rf_reconstruct.h,v retrieving revision 1.5 diff -U5 -r1.5 rf_reconstruct.h --- rf_reconstruct.h 16 Dec 2002 07:01:05 -0000 1.5 +++ rf_reconstruct.h 6 Feb 2009 03:40:55 -0000 @@ -216,10 +216,14 @@ */ int numRUsTotal; /* * Total number of * Reconstruction Units. */ + int error; /* non-0 indicates that an error has + * occurred during the reconstruction, and + * the reconstruction is in the process of + * bailing out. */ /* Reconstruction event queue. */ RF_ReconEvent_t *eventQueue; /* * Queue of pending * reconstruction events. @@ -237,10 +241,14 @@ /* Reconstruction buffer management. */ RF_DECLARE_MUTEX (rb_mutex); /* * Mutex for messing around * with recon buffers. 
*/ + int rb_lock; /* 1 if someone is mucking + * with recon buffers, + * 0 otherwise */ + RF_ReconBuffer_t *floatingRbufs; /* * Available floating * reconstruction buffers. */ RF_ReconBuffer_t *committedRbufs;/* @@ -299,9 +307,10 @@ int rf_ReconstructInPlace(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t); int rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *); int rf_ForceOrBlockRecon(RF_Raid_t *, RF_AccessStripeMap_t *, void (*) (RF_Raid_t *, void *), void *); int rf_UnblockRecon(RF_Raid_t *, RF_AccessStripeMap_t *); +void rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr, RF_RowCol_t row); int rf_RegisterReconDoneProc(RF_Raid_t *, void (*) (RF_Raid_t *, void *), void *, RF_ReconDoneProc_t **); #endif /* !_RF__RF_RECONSTRUCT_H_ */ [demime 1.01d removed an attachment of type application/octet-stream which had a name of A.cvs.diff.raidframe]