Hello ... Please note: I will send the Async IO stuff later, see below.
This Patch: Here is a very simple patch to ide.c that does not change the controller type and that lets an existing win2003 installation use multi-sector IO and/or DMA. There is another patch for BSD (sorry, I am not sure about the author's name - is it John?). That patch changes the controller, modifies the BIOS and adds a simple way for async writes. MY PATCH IS VERY DIFFERENT: (1) The controller type is not changed. In qemu 8.0.0 some bits got set wrong; windows therefore disabled DMA. (2) I have no documentation about the controller. WHO CAN SEND ME THE IDE CONTROLLER DOCUMENTATION? I took some ideas from John and used the linux kernel source. UNFORTUNATELY not all of my bits make the linux kernel happy - it complains and disables DMA. BUT IT SHOULD BE EASY TO SORT THIS OUT. (3) My way to async IO is different, read below. The implementation does async read and write. My plans are to make the SDL async too. (4) The async IO layer integrates well with Fabrice Bellard's IOHandlers. In fact vl.c needs no modifications, the code uses a pipe to signal that IO got ready. The modifications on the block driver layer are minimal. To say this again: the async IO stuff itself is about 600 lines of code (in one file) but it needs several small changes in qemu. 
Here are some fake definitions that might help you to get a first impression: 8<---8<---8<---8<---8<---8<---8<---8<---8<---8<---8<---8<---8<---8<---8< #ifdef QEMU_TOOL // Global: start the background thread int qaio_initialize(IoAsyncInst* inst, IoAsyncCall* spy) { return 0; } // Global: terminate the background thread int qaio_terminate(IoAsyncInst* inst) { return 0; } // Global: IOHandler used to run the completion callback void qaio_poll(void* opaque) {} // Client: this function is only called once int qaio_register(IoAsyncInst* i, IoAsyncItem** iptr, void* o, int file) { int* pfile = (int*)iptr; *pfile = file; return 0; } // Client: this function should be called to flush pending requests int qaio_unregister(IoAsyncItem** iptr) { return 0; } // Client: make this a child int qaio_parent(IoAsyncItem* item, IoAsyncItem* parent) { return 0; } // Client: begin a new request (flush or commit must follow) int qaio_begin(IoAsyncItem* item, IoAsyncCall* cbf, void* info) { return 1; } // Client: commit any pending request int qaio_commit(IoAsyncItem* item) { return 1; } // Client: flush any pending request synchronously int qaio_flush(IoAsyncItem* item) { return 1; } // Client: queue a write request int qaio_write(IoAsyncItem* item, const void* pdat, uint32_t count, uint64_t offs) { return qemu_write_at((int)item, pdat, count, offs); } // Client: queue a read request int qaio_read(IoAsyncItem* item, void* pdat, uint32_t count, uint64_t offs) { return qemu_read_at((int)item, pdat, count, offs); } #endif >8--->8--->8--->8--->8--->8--->8--->8--->8--->8--->8--->8--->8--->8--->8 The DMA stuff is very important for windows. 
With synchronous IO you roughly get the following performance on a simple P4/2.4 GHz single disk machine: 6 MByte/s No Multi-Sector, no DMA (qemu 8.0.0 default) 9 MByte/s Multi-Sector, no DMA (this Patch) 12 MByte/s with DMA (this Patch) With async IO the throughput can go up further but this saturates my workstation disk, so I will not quote any numbers here. I should also mention that I use a faster block layer driver (called bkf) that I will submit after it has matured for a while. Whereas the DMA stuff fixes more or less a bug in qemu 8.0.0, the async IO is really important. Here is why: (1) The Windows GUI now works "smoothly". The emulated machine is more responsive and the mouse cursor does not "hang". Please remember that this also requires the hack to make VGA faster that I submitted a while ago. (2) The windows clock loses far fewer timer ticks and now mostly shows the correct time. Perfmon now becomes really usable. (3) Even under heavy IO load the CPU-usage now falls below 100% so that CPU cycles are available for other activity on the host or the emulated PC. (4) Async IO does not generally speed up batch operations, but the emulated PC behaves more like a real one. Example: the animated boot logo of windows. Yours Juergen
Index: hw/ide.c =================================================================== RCS file: /home/Cvsroot/qemu/hw/ide.c,v retrieving revision 1.1.1.1 retrieving revision 1.9 diff -B -b -U3 -r1.1.1.1 -r1.9 --- hw/ide.c 14 Jan 2006 13:19:59 -0000 1.1.1.1 +++ hw/ide.c 1 Feb 2006 10:17:51 -0000 1.9 @@ -185,7 +185,7 @@ #define DISABLE_SEAGATE 0xFB /* set to 1 set disable mult support */ -#define MAX_MULT_SECTORS 16 +#define MAX_MULT_SECTORS 256 /* ATAPI defines */ @@ -289,6 +289,15 @@ typedef void EndTransferFunc(struct IDEState *); +typedef struct physCopyParam +{ + target_phys_addr_t phys; + uint8_t* buff; + int len; +} physCopyParam; + +#define PHYS_MAX MAX_MULT_SECTORS + /* NOTE: IDEState represents in fact one drive */ typedef struct IDEState { /* ide config */ @@ -334,6 +343,11 @@ uint8_t *data_end; uint8_t io_buffer[MAX_MULT_SECTORS*512 + 4]; QEMUTimer *sector_write_timer; /* only used for win2k instal hack */ + + uint phys_count; + physCopyParam phys_param[PHYS_MAX]; + + } IDEState; #define BM_STATUS_DMAING 0x01 @@ -436,7 +450,11 @@ put_le16(p + 49, 1 << 9 | 1 << 8); /* DMA and LBA supported */ put_le16(p + 51, 0x200); /* PIO transfer cycle */ put_le16(p + 52, 0x200); /* DMA transfer cycle */ - put_le16(p + 53, 1 | 1 << 2); /* words 54-58,88 are valid */ + +// put_le16(p + 53, 1 | 1 << 2); /* words 54-58,88 are valid */ +// Windows wants this - OK for LINUX + put_le16(p + 53, 1 | 1 << 1 | 1 << 2); /* words 54-58,64-70,88 are valid */ + put_le16(p + 54, s->cylinders); put_le16(p + 55, s->heads); put_le16(p + 56, s->sectors); @@ -447,6 +465,14 @@ put_le16(p + 59, 0x100 | s->mult_sectors); put_le16(p + 60, s->nb_sectors); put_le16(p + 61, s->nb_sectors >> 16); + +// Windows needs it to activate MWORD DMA +// Linux kernel says: MWORD DMA and UDMA are exclusive! 
+ put_le16(p + 63, 0x7 | (0x7 << 8)); /* Multiword DMA supported/selected */ + +// Windows wants this for MULTI-SECTOR - OK for LINUX + put_le16(p + 64, 0x3f); /* PIO modes supported */ + put_le16(p + 80, (1 << 1) | (1 << 2)); put_le16(p + 82, (1 << 14)); put_le16(p + 83, (1 << 14)); @@ -454,7 +480,11 @@ put_le16(p + 85, (1 << 14)); put_le16(p + 86, 0); put_le16(p + 87, (1 << 14)); + put_le16(p + 88, 0x1f | (1 << 13)); +// neither Window nor LINUX need this +// put_le16(p + 88, 0x3f | (0x04 << 8)); /* UMDA supported/selected */ + put_le16(p + 93, 1 | (1 << 14) | 0x2000 | 0x4000); } @@ -602,12 +632,45 @@ } } +// The asyncio completion routine (mainly for qaio_async) ... +static void ide_set_irq_cb(void *opaque) +{ + IDEState *s = opaque; + if(s->phys_count > 0) { // play-back logged copy cmds + uint urun = 0; + while(urun < s->phys_count) { + physCopyParam* pars = s->phys_param + urun++; + cpu_physical_memory_write(pars->phys, pars->buff, pars->len); + } + s->phys_count = 0; + } + + s->status = READY_STAT | SEEK_STAT; // DMA completion + ide_set_irq(s); +#ifdef DEBUG_IDE_ATAPI + printf("dma status=0x%x\n", s->status); +#endif +} + +// This helper logs cpu_physical_memory_write args to a buffer so that the +// actual copying can be done in the asyncio completion routine ... 
+static inline void ide_push_phys_addr(IDEState *s, target_phys_addr_t phys, uint8_t* buff, int len) +{ + if(s->phys_count > PHYS_MAX) + printf("ide_push_phys_addr: overflow\n"); + else { + physCopyParam* pars = s->phys_param + s->phys_count++; + pars->phys = phys; pars->buff = buff; pars->len = len; + } +} + static int ide_read_dma_cb(IDEState *s, target_phys_addr_t phys_addr, int transfer_size1) { int len, transfer_size, n; int64_t sector_num; + int iasy = bdrv_async(s->bs, ide_set_irq_cb, s); // try to enable async IO transfer_size = transfer_size1; while (transfer_size > 0) { @@ -620,6 +683,7 @@ if (n > MAX_MULT_SECTORS) n = MAX_MULT_SECTORS; sector_num = ide_get_sector(s); + // XXX: error handling is missing! bdrv_read(s->bs, sector_num, s->io_buffer, n); s->io_buffer_index = 0; s->io_buffer_size = n * 512; @@ -630,7 +694,11 @@ } if (len > transfer_size) len = transfer_size; - cpu_physical_memory_write(phys_addr, + if(iasy >= 0) // log if IO is async + ide_push_phys_addr(s, phys_addr, + s->io_buffer + s->io_buffer_index, len); + else + cpu_physical_memory_write(phys_addr, // direct IO s->io_buffer + s->io_buffer_index, len); s->io_buffer_index += len; transfer_size -= len; @@ -636,12 +704,10 @@ transfer_size -= len; phys_addr += len; } + if (s->io_buffer_index >= s->io_buffer_size && s->nsector == 0) { - s->status = READY_STAT | SEEK_STAT; - ide_set_irq(s); -#ifdef DEBUG_IDE_ATAPI - printf("dma status=0x%x\n", s->status); -#endif + if(bdrv_async(s->bs, NULL, BDRV_ASYNC_COMMIT)) // if direct IO ... 
+ ide_set_irq_cb(s); return 0; } return transfer_size1 - transfer_size; @@ -655,12 +721,6 @@ ide_dma_start(s, ide_read_dma_cb); } -static void ide_sector_write_timer_cb(void *opaque) -{ - IDEState *s = opaque; - ide_set_irq(s); -} - static void ide_sector_write(IDEState *s) { int64_t sector_num; @@ -704,6 +764,7 @@ } } +// called from ide_dma_loop until it returns 0 static int ide_write_dma_cb(IDEState *s, target_phys_addr_t phys_addr, int transfer_size1) @@ -710,23 +771,26 @@ { int len, transfer_size, n; int64_t sector_num; - transfer_size = transfer_size1; + bdrv_async(s->bs, ide_set_irq_cb, s); // try to enable async IO for(;;) { + // flush to file if the buffer gets full ... len = s->io_buffer_size - s->io_buffer_index; if (len == 0) { n = s->io_buffer_size >> 9; sector_num = ide_get_sector(s); - bdrv_write(s->bs, sector_num, s->io_buffer, - s->io_buffer_size >> 9); - sector_num += n; + // XXX: error handling is missing! + bdrv_write(s->bs, sector_num, s->io_buffer, n); ide_set_sector(s, sector_num); + sector_num += n; s->nsector -= n; n = s->nsector; if (n == 0) { - /* end of transfer */ + // end of transfer, commit async IO + if(bdrv_async(s->bs, NULL, BDRV_ASYNC_COMMIT)) { s->status = READY_STAT | SEEK_STAT; - ide_set_irq(s); + ide_set_irq(s); // fallback to sync IO + } return 0; } if (n > MAX_MULT_SECTORS) @@ -1976,7 +2040,7 @@ s->irq_opaque = irq_opaque; s->irq = irq; s->sector_write_timer = qemu_new_timer(vm_clock, - ide_sector_write_timer_cb, s); + ide_set_irq_cb, s); ide_reset(s); } } @@ -2231,6 +2295,7 @@ register_ioport_read(addr + 4, 4, 4, bmdma_addr_readl, bm); addr += 8; } + d->bmdma[0].status = 0x40 | 0x20; /* ATA0: drive1/0 DMA capable */ } /* XXX: call it also when the MRDMODE is changed from the PCI config
_______________________________________________ Qemu-devel mailing list Qemu-devel@nongnu.org http://lists.nongnu.org/mailman/listinfo/qemu-devel