Module Name:	src
Committed By:	martin
Date:		Sat Jun 22 18:30:13 UTC 2024
Modified Files:
	src/sys/arch/xen/xen [netbsd-10]: xbdback_xenbus.c

Log Message:
Pull up the following, requested by bouyer in ticket #726:

	sys/arch/xen/xen/xbdback_xenbus.c	up to 1.107

Restore "sparse" segments support, which was lost in rev 1.83 and caused
VBD corruption with Linux guests.

The segments in a single request are not always contiguous in VA; that is,
the last sector of a segment is not always 7 (VBD_MAXSECT) and the first
sector of the next one is not always 0.  When this happens, a contiguous
chunk of data from disk has to be dispatched to various non-contiguous
VAs, in chunks of VBD_BSIZE bytes (or the other way round for writes).
The Linux I/O subsystem seems to support this natively.  To emulate it,
allocate a MAXPHYS-sized bounce buffer, do the I/O through it, and then
memcpy() the data from/to the segments as requested.  If the request is
contiguous, do the I/O directly to the mapped VA.

This means that the segment details have to be kept until iodone(), so
move the blkif_request_segment array from xbdback_instance to xbdback_io.
The array is allocated separately to guarantee proper page alignment.

Non-contiguous segments seem to be rare, so allocate one bounce buffer
per xbdback_instance and stall the ring if the bounce buffer is already
in use.  For this, add back a mechanism to restart an I/O at a specific
point after a thread sleep/wakeup.

While there, guard some more printfs with ratecheck() and add more checks
on segment bounds.

Tested with an HVM Scientific Linux install from an iso image; the
install used to fail with an xfs corruption when installing grub.

(Plus mostly cosmetic/minor changes.)


To generate a diff of this commit:
cvs rdiff -u -r1.101.4.1 -r1.101.4.2 src/sys/arch/xen/xen/xbdback_xenbus.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
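The bounce-buffer logic at the heart of this fix is easier to follow in
isolation than in the hunks below.  The following user-space sketch models
it under the assumption of 4 KiB pages and 512-byte basic blocks (so
VBD_MAXSECT is 7); struct seg, need_bounce() and bounce_copy() are
illustrative names, not the driver's own, and locking, grant mapping and
error handling are deliberately omitted.

/*
 * Standalone sketch (not the committed code) of the sparse-segment
 * detection and bounce-buffer copy described in the log message above.
 * "guest_va" stands for the VA range where the guest's pages are
 * mapped, one page per segment; "bounce" is the dense MAXPHYS buffer.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define PAGE_SIZE	4096
#define VBD_BSIZE	512
#define VBD_MAXSECT	((PAGE_SIZE / VBD_BSIZE) - 1)	/* 7 */

struct seg {
	uint8_t first_sect;	/* first 512-byte sector used in the page */
	uint8_t last_sect;	/* last 512-byte sector used in the page */
};

/* A request is "sparse" if consecutive segments do not abut in VA. */
static bool
need_bounce(const struct seg *segs, int nsegs)
{
	for (int i = 1; i < nsegs; i++) {
		if (segs[i - 1].last_sect != VBD_MAXSECT ||
		    segs[i].first_sect != 0)
			return true;
	}
	return false;
}

/*
 * For a READ that completed into the contiguous bounce buffer, scatter
 * the data back into the non-contiguous mapped segments (to_guest ==
 * true).  to_guest == false does the opposite gather for a WRITE before
 * the I/O is started.
 */
static void
bounce_copy(char *guest_va, char *bounce, const struct seg *segs,
    int nsegs, bool to_guest)
{
	size_t boffset = 0;

	for (int i = 0; i < nsegs; i++) {
		size_t segoffset = (size_t)i * PAGE_SIZE +
		    (size_t)segs[i].first_sect * VBD_BSIZE;
		size_t segbcount =
		    (segs[i].last_sect - segs[i].first_sect + 1) * VBD_BSIZE;

		if (to_guest)
			memcpy(guest_va + segoffset, bounce + boffset,
			    segbcount);
		else
			memcpy(bounce + boffset, guest_va + segoffset,
			    segbcount);
		boffset += segbcount;	/* the bounce buffer stays dense */
	}
}

For example, for a two-segment request { {0,7}, {2,5} }, need_bounce()
returns true, and after a read bounce_copy(guest_va, bounce, segs, 2, true)
copies 4096 bytes to page 0 at offset 0 and 2048 bytes to page 1 at offset
1024 -- the same per-segment arithmetic as the iodone() path in the diff.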
Modified files:

Index: src/sys/arch/xen/xen/xbdback_xenbus.c
diff -u src/sys/arch/xen/xen/xbdback_xenbus.c:1.101.4.1 src/sys/arch/xen/xen/xbdback_xenbus.c:1.101.4.2
--- src/sys/arch/xen/xen/xbdback_xenbus.c:1.101.4.1	Mon Jul 31 15:23:02 2023
+++ src/sys/arch/xen/xen/xbdback_xenbus.c	Sat Jun 22 18:30:13 2024
@@ -1,7 +1,7 @@
-/* $NetBSD: xbdback_xenbus.c,v 1.101.4.1 2023/07/31 15:23:02 martin Exp $ */
+/* $NetBSD: xbdback_xenbus.c,v 1.101.4.2 2024/06/22 18:30:13 martin Exp $ */
 
 /*
- * Copyright (c) 2006 Manuel Bouyer.
+ * Copyright (c) 2006,2024 Manuel Bouyer.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.101.4.1 2023/07/31 15:23:02 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.101.4.2 2024/06/22 18:30:13 martin Exp $");
 
 #include <sys/buf.h>
 #include <sys/condvar.h>
@@ -73,7 +73,7 @@ __KERNEL_RCSID(0, "$NetBSD: xbdback_xenb
 #define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)
 
 #define VBD_VA_SIZE	MAXPHYS
-#define VBD_MAX_INDIRECT_SEGMENTS	VBD_VA_SIZE >> PAGE_SHIFT
+#define VBD_MAX_INDIRECT_SEGMENTS	(VBD_VA_SIZE >> PAGE_SHIFT)
 
 CTASSERT(XENSHM_MAX_PAGES_PER_REQUEST >= VBD_MAX_INDIRECT_SEGMENTS);
 
@@ -100,6 +100,10 @@ typedef enum {WAITING, RUN, DISCONNECTIN
  * condition before it starts processing requests again from where it left.
  * Continuation state is "stored" in the xbdback instance (xbdi_cont),
  * and should only be manipulated by the instance thread.
+ * If a continuation has to be restarted from a specific point,
+ * the callback and argument can be stored in xbdi_cont_restart and
+ * xbdi_cont_restart_obj.
+ *
  *
  * As xbdback(4) has to handle different sort of asynchronous events (Xen
  * event channels, biointr() soft interrupts, xenbus commands), the xbdi_lock
@@ -111,9 +115,7 @@ typedef enum {WAITING, RUN, DISCONNECTIN
  * xbdback_co_main()
  *        |              --> xbdback_co_cache_flush()
  *        |              |    |
- *        |              |    -> xbdback_co_cache_doflush() or NULL
- *        |              |        |
- *        |              |        -> xbdback_co_do_io()
+ *        |              |    -> xbdback_co_do_io() or NULL
  * xbdback_co_main_loop()-|
  *        |              |-> xbdback_co_main_done2() or NULL
  *        |              |
@@ -121,9 +123,7 @@ typedef enum {WAITING, RUN, DISCONNECTIN
  * xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
  *        |
- * xbdback_co_io_gotio() -> xbdback_map_shm()
- *        |                      |
- *        |                      xbdback_co_main_incr() -> xbdback_co_main_loop()
+ * xbdback_co_io_gotio() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
  *        |
  * xbdback_co_do_io()
@@ -152,8 +152,12 @@ struct xbdback_io {
 	SLIST_ENTRY(xbdback_io) xio_next;
 	/* The instance pointer is duplicated for convenience. */
 	struct xbdback_instance *xio_xbdi; /* our xbd instance */
-	uint8_t xio_operation;
-	uint64_t xio_id;
+	/* _request state: track requests fetched from ring */
+	blkif_request_t xio_xen_req;
+	/* array of segments[VBD_MAX_INDIRECT_SEGMENTS] allocated separately */
+	struct blkif_request_segment *xio_seg;
+	bus_dmamap_t xio_seg_dmamap;
+	/* internal states */
 	union {
 		struct {
 			struct buf xio_buf; /* our I/O */
@@ -165,7 +169,7 @@ struct xbdback_io {
 			grant_ref_t xio_gref[VBD_MAX_INDIRECT_SEGMENTS];
 			/* grants release */
 			grant_handle_t xio_gh[VBD_MAX_INDIRECT_SEGMENTS];
-			uint16_t xio_nrma; /* number of guest pages */
+			bool xio_need_bounce; /* request is not contiguous */
 		} xio_rw;
 	} u;
 };
@@ -175,7 +179,7 @@ struct xbdback_io {
 #define xio_xv		u.xio_rw.xio_xv
 #define xio_gref	u.xio_rw.xio_gref
 #define xio_gh		u.xio_rw.xio_gh
-#define xio_nrma	u.xio_rw.xio_nrma
+#define xio_need_bounce	u.xio_rw.xio_need_bounce
 
 /* we keep the xbdback instances in a linked list */
 struct xbdback_instance {
@@ -194,6 +198,11 @@ struct xbdback_instance {
 	SLIST_HEAD(, xbdback_io) xbdi_io_free;
 	struct xbdback_va xbdi_va[BLKIF_RING_SIZE];
 	SLIST_HEAD(, xbdback_va) xbdi_va_free;
+	/* segments structure allocated in page-aligned chunks */
+	struct blkif_request_segment *xbdi_segs;
+	/* bounce buffer in case a transfer is not contiguous */
+	vaddr_t xbdi_bouncebuf;
+	int xbdi_bouncebuf_use; /* is bounce buffer in use? */
 	/* backing device parameters */
 	dev_t xbdi_dev;
 	const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */
@@ -217,22 +226,26 @@ struct xbdback_instance {
 	 */
 	RING_IDX xbdi_req_prod; /* limit on request indices */
 	xbdback_cont_t xbdi_cont;
-	/* _request state: track requests fetched from ring */
-	blkif_request_t xbdi_xen_req;
-	struct blkif_request_segment xbdi_seg[VBD_MAX_INDIRECT_SEGMENTS];
-	bus_dmamap_t xbdi_seg_dmamap;
-	grant_ref_t xbdi_in_gntref;
+	/* if not NULL, will restart here after thread wakes up */
+	xbdback_cont_t xbdi_cont_restart;
+	void *xbdi_cont_restart_obj;
 	/* other state */
 	uint xbdi_pendingreqs; /* number of I/O in fly */
 	struct timeval xbdi_lasterr_time;    /* error time tracking */
 };
 /* Manipulation of the above reference count. */
-#define xbdi_get(xbdip) (xbdip)->xbdi_refcnt++
+#define xbdi_get(xbdip) \
+do { \
+	KASSERT(mutex_owned(&xbdip->xbdi_lock)); \
+	(xbdip)->xbdi_refcnt++; \
+} while (0)
+
 #define xbdi_put(xbdip) \
 do { \
+	KASSERT(mutex_owned(&xbdip->xbdi_lock)); \
 	if (--((xbdip)->xbdi_refcnt) == 0) \
 		xbdback_finish_disconnect(xbdip); \
-} while (/* CONSTCOND */ 0)
+} while (0)
 
 static SLIST_HEAD(, xbdback_instance) xbdback_instances;
 static kmutex_t xbdback_lock;
@@ -260,7 +273,6 @@ static void *xbdback_co_main_incr(struct
 static void *xbdback_co_main_done2(struct xbdback_instance *, void *);
 
 static void *xbdback_co_cache_flush(struct xbdback_instance *, void *);
-static void *xbdback_co_cache_doflush(struct xbdback_instance *, void *);
 
 static void *xbdback_co_io(struct xbdback_instance *, void *);
 static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
@@ -273,7 +285,7 @@ static void xbdback_iodone_locked(struct
     struct xbdback_io *, struct buf *);
 static void xbdback_send_reply(struct xbdback_instance *, uint64_t , int , int);
 
-static void *xbdback_map_shm(struct xbdback_io *);
+static int xbdback_map_shm(struct xbdback_io *);
 static void xbdback_unmap_shm(struct xbdback_io *);
 
 static struct xbdback_io *xbdback_io_get(struct xbdback_instance *);
@@ -308,6 +320,7 @@ xbdback_xenbus_create(struct xenbus_devi
 	struct xbdback_instance *xbdi;
 	long domid, handle;
 	int error, i;
+	int segalloc = 0;
 	char *ep;
 
 	if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path,
@@ -356,32 +369,19 @@ xbdback_xenbus_create(struct xenbus_devi
 
 	/* initialize status and reference counter */
 	xbdi->xbdi_status = DISCONNECTED;
-	xbdi_get(xbdi);
 
 	mutex_init(&xbdi->xbdi_lock, MUTEX_DEFAULT, IPL_BIO);
 	cv_init(&xbdi->xbdi_cv, xbdi->xbdi_name);
 
+	mutex_enter(&xbdi->xbdi_lock);
+	xbdi_get(xbdi);
+	mutex_exit(&xbdi->xbdi_lock);
+
 	xbusd->xbusd_u.b.b_cookie = xbdi;
 	xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy;
 	xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
 	xbdi->xbdi_xbusd = xbusd;
 
-	if (bus_dmamap_create(xbdi->xbdi_xbusd->xbusd_dmat, PAGE_SIZE,
-	    1, PAGE_SIZE, PAGE_SIZE, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
-	    &xbdi->xbdi_seg_dmamap) != 0) {
-		printf("%s: can't create dma map for indirect segments\n",
-		    xbdi->xbdi_name);
-		goto fail;
-	}
-	if (bus_dmamap_load(xbdi->xbdi_xbusd->xbusd_dmat,
-	    xbdi->xbdi_seg_dmamap, xbdi->xbdi_seg,
-	    sizeof(xbdi->xbdi_seg), NULL, BUS_DMA_WAITOK) != 0) {
-		printf("%s: can't load dma map for indirect segments\n",
-		    xbdi->xbdi_name);
-		goto fail;
-	}
-	KASSERT(xbdi->xbdi_seg_dmamap->dm_nsegs == 1);
-
 	SLIST_INIT(&xbdi->xbdi_va_free);
 	for (i = 0; i < BLKIF_RING_SIZE; i++) {
 		xbdi->xbdi_va[i].xv_vaddr = uvm_km_alloc(kernel_map,
@@ -390,10 +390,46 @@ xbdback_xenbus_create(struct xenbus_devi
 		    xv_next);
 	}
 
+	/*
+	 * allocate page-aligned memory for segments, so that for each
+	 * xbdback_io its segments are in a single page.
+	 * sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS
+	 * is 128, so this helps us avoid a page boundary within a
+	 * block of VBD_MAX_INDIRECT_SEGMENTS segments.
+	 */
+	CTASSERT(sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS == 128);
+	xbdi->xbdi_segs = (void *)uvm_km_alloc(kernel_map, round_page(
+	    sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS * BLKIF_RING_SIZE),
+	    PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_WAITVA);
+
 	SLIST_INIT(&xbdi->xbdi_io_free);
 	for (i = 0; i < BLKIF_RING_SIZE; i++) {
-		SLIST_INSERT_HEAD(&xbdi->xbdi_io_free, &xbdi->xbdi_io[i],
-		    xio_next);
+		struct xbdback_io *xbd_io = &xbdi->xbdi_io[i];
+		xbd_io->xio_seg =
+		    &xbdi->xbdi_segs[i * VBD_MAX_INDIRECT_SEGMENTS];
+		error = bus_dmamap_create(xbdi->xbdi_xbusd->xbusd_dmat,
+		    PAGE_SIZE, 1, PAGE_SIZE, PAGE_SIZE,
+		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
+		    &xbd_io->xio_seg_dmamap);
+		if (error != 0) {
+			printf("%s: can't create dma map for indirect segments %d\n",
+			    xbdi->xbdi_name, i);
+			goto fail;
+		}
+		error = bus_dmamap_load(xbdi->xbdi_xbusd->xbusd_dmat,
+		    xbd_io->xio_seg_dmamap, xbd_io->xio_seg,
+		    sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS,
+		    NULL, BUS_DMA_WAITOK);
+		if (error != 0) {
+			printf("%s: can't load dma map for indirect segments %d @%p (%d, %zu)\n",
+			    xbdi->xbdi_name, i, xbd_io->xio_seg, error, sizeof(xbd_io->xio_seg));
+			bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat,
+			    xbd_io->xio_seg_dmamap);
+			goto fail;
+		}
+		KASSERT(xbd_io->xio_seg_dmamap->dm_nsegs == 1);
+		segalloc = i;
+		SLIST_INSERT_HEAD(&xbdi->xbdi_io_free, xbd_io, xio_next);
 	}
 
 	error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device",
@@ -410,10 +446,23 @@ xbdback_xenbus_create(struct xenbus_devi
 		    xbusd->xbusd_path, error);
 		goto fail2;
 	}
+
+	xbdi->xbdi_bouncebuf = uvm_km_alloc(kernel_map, MAXPHYS, PAGE_SIZE,
+	    UVM_KMF_WIRED | UVM_KMF_WAITVA);
 	return 0;
 fail2:
 	unregister_xenbus_watch(&xbdi->xbdi_watch);
 fail:
+	for (i = 0; i < segalloc; i++) {
+		struct xbdback_io *xbd_io = &xbdi->xbdi_io[i];
+		bus_dmamap_unload(xbdi->xbdi_xbusd->xbusd_dmat,
+		    xbd_io->xio_seg_dmamap);
+		bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat,
+		    xbd_io->xio_seg_dmamap);
+	}
+	mutex_enter(&xbdback_lock);
+	SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
+	mutex_exit(&xbdback_lock);
 	kmem_free(xbdi, sizeof(*xbdi));
 	return error;
 }
@@ -457,6 +506,11 @@ xbdback_xenbus_destroy(void *arg)
 	mutex_exit(&xbdback_lock);
 
 	for (int i = 0; i < BLKIF_RING_SIZE; i++) {
+		struct xbdback_io *xbd_io = &xbdi->xbdi_io[i];
+		bus_dmamap_unload(xbdi->xbdi_xbusd->xbusd_dmat,
+		    xbd_io->xio_seg_dmamap);
+		bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat,
+		    xbd_io->xio_seg_dmamap);
 		if (xbdi->xbdi_va[i].xv_vaddr != 0) {
 			uvm_km_free(kernel_map, xbdi->xbdi_va[i].xv_vaddr,
 			    VBD_VA_SIZE, UVM_KMF_VAONLY);
@@ -464,8 +518,6 @@ xbdback_xenbus_destroy(void *arg)
 		}
 	}
 
-	bus_dmamap_unload(xbdi->xbdi_xbusd->xbusd_dmat, xbdi->xbdi_seg_dmamap);
-	bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat, xbdi->xbdi_seg_dmamap);
 	mutex_destroy(&xbdi->xbdi_lock);
 	cv_destroy(&xbdi->xbdi_cv);
@@ -879,6 +931,7 @@ static void
 xbdback_thread(void *arg)
 {
 	struct xbdback_instance *xbdi = arg;
+	void *obj;
 
 	mutex_enter(&xbdi->xbdi_lock);
 	for (;;) {
@@ -888,12 +941,19 @@ xbdback_thread(void *arg)
 			break;
 		case RUN:
 			xbdi->xbdi_status = WAITING; /* reset state */
-
+			obj = xbdi;
+			if (xbdi->xbdi_cont_restart != NULL) {
+				KASSERT(xbdi->xbdi_cont == NULL);
+				xbdi->xbdi_cont = xbdi->xbdi_cont_restart;
+				obj = xbdi->xbdi_cont_restart_obj;
+				xbdi->xbdi_cont_restart = NULL;
+				xbdi->xbdi_cont_restart_obj = NULL;
+			}
 			if (xbdi->xbdi_cont == NULL) {
 				xbdi->xbdi_cont = xbdback_co_main;
 			}
-			xbdback_trampoline(xbdi, xbdi);
+			xbdback_trampoline(xbdi, obj);
 			break;
 		case DISCONNECTING:
 			if (xbdi->xbdi_pendingreqs > 0) {
@@ -951,9 +1011,20 @@ xbdback_co_main_loop(struct xbdback_inst
 	blkif_x86_64_request_indirect_t *rin64;
 
 	if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
+		struct xbdback_io *xbd_io = xbdback_io_get(xbdi);
 		uint8_t real_op = 0xff;
 
-		req = &xbdi->xbdi_xen_req;
+		if (xbd_io == NULL) {
+			/* retry after iodone */
+			xbdi->xbdi_cont = NULL;
+			return NULL;
+		}
+		memset(&xbd_io->u, 0, sizeof(xbd_io->u));
+
+		buf_init(&xbd_io->xio_buf);
+		xbd_io->xio_xbdi = xbdi;
+
+		req = &xbd_io->xio_xen_req;
 		memset(req, 0, sizeof(*req));
 
 		switch(xbdi->xbdi_proto) {
@@ -1012,11 +1083,10 @@ xbdback_co_main_loop(struct xbdback_inst
 		case BLKIF_OP_READ:
 		case BLKIF_OP_WRITE:
 			xbdi->xbdi_cont = xbdback_co_io;
-			break;
+			return xbd_io;
 		case BLKIF_OP_FLUSH_DISKCACHE:
-			xbdi_get(xbdi);
 			xbdi->xbdi_cont = xbdback_co_cache_flush;
-			break;
+			return xbd_io;
 		default:
 			if (ratecheck(&xbdi->xbdi_lasterr_time,
 			    &xbdback_err_intvl)) {
@@ -1027,12 +1097,12 @@ fail:
 			xbdback_send_reply(xbdi, req->id, real_op,
 			    BLKIF_RSP_ERROR);
 			xbdi->xbdi_cont = xbdback_co_main_incr;
-			break;
+			return xbdi;
 		}
 	} else {
 		xbdi->xbdi_cont = xbdback_co_main_done2;
+		return xbdi;
 	}
-	return xbdi;
 }
@@ -1079,32 +1149,21 @@ xbdback_co_main_done2(struct xbdback_ins
  * Frontend requested a cache flush operation.
  */
 static void *
-xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj __unused)
+xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj)
 {
+	struct xbdback_io *xbd_io = obj;
+	KASSERT(xbd_io->xio_xen_req.operation == BLKIF_OP_FLUSH_DISKCACHE);
 	if (xbdi->xbdi_pendingreqs > 0) {
 		/*
 		 * There are pending requests.
 		 * Event or iodone() will restart processing
		 */
+		xbdi->xbdi_cont_restart = xbdback_co_cache_flush;
+		xbdi->xbdi_cont_restart_obj = xbd_io;
 		xbdi->xbdi_cont = NULL;
-		xbdi_put(xbdi);
 		return NULL;
 	}
-	xbdi->xbdi_cont = xbdback_co_cache_doflush;
-	return xbdback_io_get(xbdi);
-}
-
-/* Start the flush work */
-static void *
-xbdback_co_cache_doflush(struct xbdback_instance *xbdi, void *obj)
-{
-	struct xbdback_io *xbd_io;
-
-	XENPRINTF(("xbdback_co_cache_doflush %p %p\n", xbdi, obj));
-	xbd_io = obj;
-	xbd_io->xio_xbdi = xbdi;
-	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
-	xbd_io->xio_id = xbdi->xbdi_xen_req.id;
+	xbdi_get(xbdi);
 	xbdi->xbdi_cont = xbdback_co_do_io;
 	return xbd_io;
 }
@@ -1114,7 +1173,7 @@ xbdback_co_cache_doflush(struct xbdback_
  * then get the segment information directly from the ring request.
  */
 static void *
-xbdback_co_io(struct xbdback_instance *xbdi, void *obj __unused)
+xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
 {
 	int i, error;
 	blkif_request_t *req, *reqn;
@@ -1123,8 +1182,11 @@ xbdback_co_io(struct xbdback_instance *x
 	blkif_request_indirect_t *rinn;
 	blkif_x86_32_request_indirect_t *rin32;
 	blkif_x86_64_request_indirect_t *rin64;
+	const char *errstr;
+	struct xbdback_io *xbd_io = obj;
+	grant_ref_t in_gntref = 0;
 
-	req = &xbdi->xbdi_xen_req;
+	req = &xbd_io->xio_xen_req;
 
 	/* some sanity checks */
 	KASSERT(req->operation == BLKIF_OP_READ ||
@@ -1142,17 +1204,20 @@ xbdback_co_io(struct xbdback_instance *x
 			rinn = (blkif_request_indirect_t *)reqn;
 			req->operation = rinn->indirect_op;
 			req->nr_segments = (uint8_t)rinn->nr_segments;
-			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS)
-				goto bad_nr_segments;
-			xbdi->xbdi_in_gntref = rinn->indirect_grefs[0];
+			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS) {
+				errstr = "too many indirect segments";
+				goto bad_segments;
+			}
+			in_gntref = rinn->indirect_grefs[0];
 			/* first_sect and segment grefs fetched later */
 		} else {
 			req->nr_segments = reqn->nr_segments;
-			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
-				goto bad_nr_segments;
+			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+				errstr = "too many segments";
+				goto bad_segments;
+			}
 			for (i = 0; i < req->nr_segments; i++)
-				xbdi->xbdi_seg[i] = reqn->seg[i];
-			xbdi->xbdi_in_gntref = 0;
+				xbd_io->xio_seg[i] = reqn->seg[i];
 		}
 		break;
 	case XBDIP_32:
@@ -1164,17 +1229,20 @@ xbdback_co_io(struct xbdback_instance *x
 			rin32 = (blkif_x86_32_request_indirect_t *)req32;
 			req->operation = rin32->indirect_op;
 			req->nr_segments = (uint8_t)rin32->nr_segments;
-			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS)
-				goto bad_nr_segments;
-			xbdi->xbdi_in_gntref = rin32->indirect_grefs[0];
+			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS) {
+				errstr = "too many indirect segments";
+				goto bad_segments;
+			}
+			in_gntref = rin32->indirect_grefs[0];
 			/* first_sect and segment grefs fetched later */
 		} else {
 			req->nr_segments = req32->nr_segments;
-			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
-				goto bad_nr_segments;
+			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+				errstr = "too many segments";
+				goto bad_segments;
+			}
 			for (i = 0; i < req->nr_segments; i++)
-				xbdi->xbdi_seg[i] = req32->seg[i];
-			xbdi->xbdi_in_gntref = 0;
+				xbd_io->xio_seg[i] = req32->seg[i];
 		}
 		break;
 	case XBDIP_64:
@@ -1185,17 +1253,20 @@ xbdback_co_io(struct xbdback_instance *x
 		if (req64->operation == BLKIF_OP_INDIRECT) {
 			rin64 = (blkif_x86_64_request_indirect_t *)req64;
 			req->nr_segments = (uint8_t)rin64->nr_segments;
-			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS)
-				goto bad_nr_segments;
-			xbdi->xbdi_in_gntref = rin64->indirect_grefs[0];
+			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS) {
+				errstr = "too many indirect segments";
+				goto bad_segments;
+			}
+			in_gntref = rin64->indirect_grefs[0];
 			/* first_sect and segment grefs fetched later */
 		} else {
 			req->nr_segments = req64->nr_segments;
-			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
-				goto bad_nr_segments;
+			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+				errstr = "too many segments";
+				goto bad_segments;
+			}
 			for (i = 0; i < req->nr_segments; i++)
-				xbdi->xbdi_seg[i] = req64->seg[i];
-			xbdi->xbdi_in_gntref = 0;
+				xbd_io->xio_seg[i] = req64->seg[i];
 		}
 		break;
 	}
@@ -1208,16 +1279,42 @@ xbdback_co_io(struct xbdback_instance *x
 	}
 
 	/* Max value checked already earlier */
-	if (req->nr_segments < 1)
-		goto bad_nr_segments;
+	if (req->nr_segments < 1) {
+		errstr = "invalid number of segments";
+		goto bad_segments;
+	}
+	/* If segments are on an indirect page, copy them now */
+	if (in_gntref) {
+		gnttab_copy_t gop;
+		paddr_t ma;
+
+		gop.flags = GNTCOPY_source_gref;
+		gop.len = req->nr_segments
+		    * sizeof(struct blkif_request_segment);
+
+		gop.source.u.ref = in_gntref;
+		gop.source.offset = 0;
+		gop.source.domid = xbdi->xbdi_domid;
+
+		ma = xbd_io->xio_seg_dmamap->dm_segs[0].ds_addr;
+		gop.dest.offset = ma & PAGE_MASK;
+		gop.dest.domid = DOMID_SELF;
+		gop.dest.u.gmfn = ma >> PAGE_SHIFT;
+
+		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, &gop, 1) != 0) {
+			errstr = "GNTTABOP_copy failed";
+			goto bad_segments;
+		}
+	}
+
+	xbdi_get(xbdi);
 	xbdi->xbdi_cont = xbdback_co_io_gotio;
-	return xbdback_io_get(xbdi);
+	return xbd_io;
 
- bad_nr_segments:
+ bad_segments:
 	if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) {
-		printf("%s: invalid number of segments: %d\n",
-		    xbdi->xbdi_name, req->nr_segments);
+		printf("%s: %s\n", xbdi->xbdi_name, errstr);
 	}
 	error = EINVAL;
 	/* FALLTHROUGH */
@@ -1233,69 +1330,54 @@ xbdback_co_io(struct xbdback_instance *x
 static void *
 xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
 {
-	struct xbdback_io *xbd_io;
+	struct xbdback_io *xbd_io = obj;
 	int buf_flags;
 	size_t bcount;
-	blkif_request_t *req;
+	blkif_request_t *req = &xbd_io->xio_xen_req;
+	uint8_t last_sect;
+	int error;
 
 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
-
-	xbdi_get(xbdi);
-	xbdi->xbdi_pendingreqs++;
-
-	req = &xbdi->xbdi_xen_req;
-	xbd_io = obj;
-	memset(xbd_io, 0, sizeof(*xbd_io));
-	buf_init(&xbd_io->xio_buf);
-	xbd_io->xio_xbdi = xbdi;
-	xbd_io->xio_operation = req->operation;
-	xbd_io->xio_id = req->id;
-
-	/* If segments are on an indirect page, copy them now */
-	if (xbdi->xbdi_in_gntref) {
-		gnttab_copy_t gop;
-		paddr_t ma;
-
-		gop.flags = GNTCOPY_source_gref;
-		gop.len = req->nr_segments
-		    * sizeof(struct blkif_request_segment);
-
-		gop.source.u.ref = xbdi->xbdi_in_gntref;
-		gop.source.offset = 0;
-		gop.source.domid = xbdi->xbdi_domid;
-
-		ma = xbdi->xbdi_seg_dmamap->dm_segs[0].ds_addr;
-		gop.dest.offset = ma & PAGE_MASK;
-		gop.dest.domid = DOMID_SELF;
-		gop.dest.u.gmfn = ma >> PAGE_SHIFT;
-
-		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, &gop, 1) != 0) {
-			printf("%s: GNTTABOP_copy failed\n", xbdi->xbdi_name);
-			xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
-			    xbdi->xbdi_xen_req.operation,
-			    BLKIF_RSP_ERROR);
-			xbdi->xbdi_cont = xbdback_co_main_incr;
-			return NULL;
-		}
-	}
+	KASSERT(xbdi->xbdi_refcnt > 0);
 
 	/* Process segments */
 	bcount = 0;
 	for (int i = 0; i < req->nr_segments; i++) {
-		struct blkif_request_segment *seg = &xbdi->xbdi_seg[i];
+		struct blkif_request_segment *seg = &xbd_io->xio_seg[i];
+		if (seg->last_sect > VBD_MAXSECT ||
+		    seg->first_sect > VBD_MAXSECT) {
+			if (ratecheck(&xbdi->xbdi_lasterr_time,
+			    &xbdback_err_intvl)) {
+				printf("%s: invalid segment sectors %d %d\n",
+				    xbdi->xbdi_name,
+				    seg->first_sect, seg->last_sect);
+			}
+			xbdi->xbdi_pendingreqs++; /* xbdback_io_error will -- */
+			xbdback_io_error(xbd_io, EINVAL);
+			/* do not retry */
+			xbdi->xbdi_cont = xbdback_co_main_incr;
+			return xbdi;
+		}
+
+		if (i > 0) {
+			if (last_sect != VBD_MAXSECT ||
+			    seg->first_sect != 0) {
+				xbd_io->xio_need_bounce = 1;
+			}
		}
+		last_sect = seg->last_sect;
 		xbd_io->xio_gref[i] = seg->gref;
 		bcount += (seg->last_sect - seg->first_sect + 1)
			* VBD_BSIZE;
 	}
-	xbd_io->xio_nrma = req->nr_segments;
-	xbd_io->xio_start_offset = xbdi->xbdi_seg[0].first_sect * VBD_BSIZE;
+	xbd_io->xio_start_offset = xbd_io->xio_seg[0].first_sect * VBD_BSIZE;
 
 	KASSERT(bcount <= MAXPHYS);
 	KASSERT(xbd_io->xio_start_offset < PAGE_SIZE);
 	KASSERT(bcount + xbd_io->xio_start_offset <= VBD_VA_SIZE);
 
 	/* Fill-in the buf */
-	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
+	if (req->operation == BLKIF_OP_WRITE) {
 		buf_flags = B_WRITE;
 	} else {
 		buf_flags = B_READ;
@@ -1311,11 +1393,36 @@ xbdback_co_io_gotio(struct xbdback_insta
 	xbd_io->xio_buf.b_dev = xbdi->xbdi_dev;
 	xbd_io->xio_buf.b_blkno = req->sector_number;
 	xbd_io->xio_buf.b_bcount = bcount;
-	xbd_io->xio_buf.b_data = NULL;
+	if (__predict_false(xbd_io->xio_need_bounce)) {
+		if (__predict_false(xbdi->xbdi_bouncebuf_use)) {
+			KASSERT(xbdi->xbdi_pendingreqs > 1);
+			/* retry later */
+			xbdi->xbdi_cont_restart = xbdback_co_io_gotio;
+			xbdi->xbdi_cont_restart_obj = xbd_io;
+			xbdi->xbdi_cont = NULL;
+			return NULL;
+		}
+		xbdi->xbdi_bouncebuf_use++;
+		KASSERT(xbdi->xbdi_bouncebuf_use == 1);
+		xbd_io->xio_buf.b_data = (void *)xbdi->xbdi_bouncebuf;
+	}
+	xbdi->xbdi_pendingreqs++;
+	if ((error = xbdback_map_shm(xbd_io)) != 0) {
+		xbdback_io_error(xbd_io, error);
+		/* do not retry */
+		xbdi->xbdi_cont = xbdback_co_main_incr;
+		return xbdi;
+	}
+	if (__predict_true(xbd_io->xio_need_bounce == 0)) {
+		xbd_io->xio_buf.b_data = (void *)
+		    (xbd_io->xio_vaddr + xbd_io->xio_start_offset);
+	}
+
+	xbd_io->xio_buf.b_private = xbd_io;
 
 	xbdi->xbdi_cont = xbdback_co_do_io;
-	return xbdback_map_shm(xbd_io);
+	return xbd_io;
 }
 
 static void
@@ -1337,8 +1444,11 @@ static void *
 xbdback_co_do_io(struct xbdback_instance *xbdi, void *obj)
 {
 	struct xbdback_io *xbd_io = obj;
+	blkif_request_t *req = &xbd_io->xio_xen_req;
+
+	KASSERT(xbdi->xbdi_refcnt > 0);
 
-	switch (xbd_io->xio_operation) {
+	switch (req->operation) {
 	case BLKIF_OP_FLUSH_DISKCACHE:
 	{
 		int error;
@@ -1358,8 +1468,7 @@ xbdback_co_do_io(struct xbdback_instance
 			error = BLKIF_RSP_ERROR;
 		} else
 			error = BLKIF_RSP_OKAY;
-		xbdback_send_reply(xbdi, xbd_io->xio_id,
-		    xbd_io->xio_operation, error);
+		xbdback_send_reply(xbdi, req->id, req->operation, error);
 		xbdback_io_put(xbdi, xbd_io);
 		xbdi_put(xbdi);
 		xbdi->xbdi_cont = xbdback_co_main_incr;
@@ -1367,11 +1476,28 @@ xbdback_co_do_io(struct xbdback_instance
 	}
 	case BLKIF_OP_READ:
 	case BLKIF_OP_WRITE:
+		if (__predict_false(xbd_io->xio_need_bounce) &&
+		    req->operation == BLKIF_OP_WRITE) {
+			vaddr_t boffset = 0;
+			for (int i = 0; i < req->nr_segments; i++) {
+				struct blkif_request_segment *seg =
+				    &xbd_io->xio_seg[i];
+				vaddr_t segoffset = seg->first_sect * VBD_BSIZE;
+				size_t segbcount =
+				    (seg->last_sect - seg->first_sect + 1) *
+				    VBD_BSIZE;
+				KASSERT(segoffset + segbcount <= PAGE_SIZE);
+				KASSERT(boffset + segbcount < MAXPHYS);
+				segoffset += PAGE_SIZE * i;
+				memcpy(
+				    (void *)(xbdi->xbdi_bouncebuf + boffset),
+				    (void *)(xbd_io->xio_vaddr + segoffset),
+				    segbcount);
+				boffset += segbcount;
+			}
+		}
 		KASSERT(mutex_owned(&xbdi->xbdi_lock));
 		mutex_exit(&xbdi->xbdi_lock);
-		xbd_io->xio_buf.b_data = (void *)
-		    (xbd_io->xio_vaddr + xbd_io->xio_start_offset);
-
 		if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
 			mutex_enter(xbd_io->xio_buf.b_vp->v_interlock);
 			xbd_io->xio_buf.b_vp->v_numoutput++;
@@ -1385,7 +1511,7 @@ xbdback_co_do_io(struct xbdback_instance
 	default:
 		/* Should never happen */
 		panic("xbdback_co_do_io: unsupported operation %d",
-		    xbd_io->xio_operation);
+		    req->operation);
 	}
 }
@@ -1416,6 +1542,7 @@ xbdback_iodone_locked(struct xbdback_ins
     struct buf *bp)
 {
 	int status;
+	blkif_request_t *req = &xbd_io->xio_xen_req;
 
 	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
 	    xbdi->xbdi_domid, (long)xbd_io));
@@ -1423,6 +1550,34 @@ xbdback_iodone_locked(struct xbdback_ins
 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
 	KASSERT(bp->b_error != 0 || xbd_io->xio_xv != NULL);
 
+	if (__predict_false(xbd_io->xio_need_bounce)) {
+		KASSERT(xbd_io->xio_buf.b_data == (void *)xbdi->xbdi_bouncebuf);
+
+		KASSERT(req->operation == BLKIF_OP_WRITE ||
+		    req->operation == BLKIF_OP_READ);
+
+		if (req->operation == BLKIF_OP_READ && bp->b_error == 0) {
+			vaddr_t boffset = 0;
+			for (int i = 0; i < req->nr_segments; i++) {
+				struct blkif_request_segment *seg =
+				    &xbd_io->xio_seg[i];
+				vaddr_t segoffset = seg->first_sect * VBD_BSIZE;
+				size_t segbcount =
+				    (seg->last_sect - seg->first_sect + 1) *
+				    VBD_BSIZE;
+				KASSERT(segoffset + segbcount <= PAGE_SIZE);
+				KASSERT(boffset + segbcount < MAXPHYS);
+				segoffset += PAGE_SIZE * i;
+				memcpy(
+				    (void *)(xbd_io->xio_vaddr + segoffset),
+				    (void *)(xbdi->xbdi_bouncebuf + boffset),
+				    segbcount);
+				boffset += segbcount;
+			}
+		}
+		KASSERT(xbdi->xbdi_bouncebuf_use == 1);
+		xbdi->xbdi_bouncebuf_use--;
+	}
 	if (xbd_io->xio_xv != NULL)
 		xbdback_unmap_shm(xbd_io);
@@ -1433,7 +1588,7 @@ xbdback_iodone_locked(struct xbdback_ins
 	} else
 		status = BLKIF_RSP_OKAY;
 
-	xbdback_send_reply(xbdi, xbd_io->xio_id, xbd_io->xio_operation, status);
+	xbdback_send_reply(xbdi, req->id, req->operation, status);
 
 	xbdi_put(xbdi);
 	KASSERT(xbdi->xbdi_pendingreqs > 0);
@@ -1514,50 +1669,48 @@ xbdback_send_reply(struct xbdback_instan
  * Map multiple entries of an I/O request into backend's VA space.
  * The xbd_io->xio_gref array has to be filled out by the caller.
 */
-static void *
+static int
 xbdback_map_shm(struct xbdback_io *xbd_io)
 {
 	struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
+	blkif_request_t *req = &xbd_io->xio_xen_req;
 	int error;
 
#ifdef XENDEBUG_VBD
 	int i;
 	printf("xbdback_map_shm map grant ");
-	for (i = 0; i < xbd_io->xio_nrma; i++) {
+	for (i = 0; i < req->nr_segments; i++) {
 		printf("%u ", (u_int)xbd_io->xio_gref[i]);
 	}
#endif
 
 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
+	KASSERT(xbd_io->xio_xv == NULL);
 
 	xbd_io->xio_xv = SLIST_FIRST(&xbdi->xbdi_va_free);
 	KASSERT(xbd_io->xio_xv != NULL);
 	SLIST_REMOVE_HEAD(&xbdi->xbdi_va_free, xv_next);
 	xbd_io->xio_vaddr = xbd_io->xio_xv->xv_vaddr;
 
-	error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid,
+	error = xen_shm_map(req->nr_segments, xbdi->xbdi_domid,
 	    xbd_io->xio_gref, xbd_io->xio_vaddr, xbd_io->xio_gh,
-	    (xbd_io->xio_operation == BLKIF_OP_WRITE) ? XSHM_RO : 0);
+	    (req->operation == BLKIF_OP_WRITE) ? XSHM_RO : 0);
 
 	switch(error) {
 	case 0:
#ifdef XENDEBUG_VBD
-		printf("handle ");
-		for (i = 0; i < xbd_io->xio_nrma; i++) {
-			printf("%u ", (u_int)xbd_io->xio_gh[i]);
+		printf("handle");
+		for (i = 0; i < req->nr_segments; i++) {
+			printf(" %u ", (u_int)xbd_io->xio_gh[i]);
 		}
 		printf("\n");
#endif
-		return xbd_io;
+		return 0;
 	default:
 		/* reset xio_xv so error handling won't try to unmap it */
 		SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv,
 		    xv_next);
 		xbd_io->xio_xv = NULL;
-		/* this will also free xbd_io via xbdback_iodone() */
-		xbdback_io_error(xbd_io, error);
-		/* do not retry */
-		xbdi->xbdi_cont = xbdback_co_main_incr;
-		return xbdi;
+		return error;
 	}
 }
@@ -1566,18 +1719,19 @@ static void
 xbdback_unmap_shm(struct xbdback_io *xbd_io)
 {
 	struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
+	blkif_request_t *req = &xbd_io->xio_xen_req;
 
#ifdef XENDEBUG_VBD
 	int i;
 	printf("xbdback_unmap_shm handle ");
-	for (i = 0; i < xbd_io->xio_nrma; i++) {
+	for (i = 0; i < req->nr_segments; i++) {
 		printf("%u ", (u_int)xbd_io->xio_gh[i]);
 	}
 	printf("\n");
#endif
 
 	KASSERT(xbd_io->xio_xv != NULL);
-	xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_nrma,
+	xen_shm_unmap(xbd_io->xio_vaddr, req->nr_segments,
 	    xbd_io->xio_gh);
 	SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next);
 	xbd_io->xio_xv = NULL;
@@ -1589,7 +1743,6 @@ static struct xbdback_io *
 xbdback_io_get(struct xbdback_instance *xbdi)
 {
 	struct xbdback_io *xbd_io = SLIST_FIRST(&xbdi->xbdi_io_free);
-	KASSERT(xbd_io != NULL);
 	SLIST_REMOVE_HEAD(&xbdi->xbdi_io_free, xio_next);
 	return xbd_io;
 }
@@ -1598,6 +1751,7 @@ xbdback_io_get(struct xbdback_instance *
 static void
 xbdback_io_put(struct xbdback_instance *xbdi, struct xbdback_io *xbd_io)
 {
+	KASSERT(xbd_io->xio_xv == NULL);
 	KASSERT(xbd_io != NULL);
 	SLIST_INSERT_HEAD(&xbdi->xbdi_io_free, xbd_io, xio_next);
 }
@@ -1612,6 +1766,8 @@ xbdback_trampoline(struct xbdback_instan
 	xbdback_cont_t cont;
 
 	while(obj != NULL && xbdi->xbdi_cont != NULL) {
+		KASSERT(xbdi->xbdi_cont_restart == NULL);
+		KASSERT(xbdi->xbdi_cont_restart_obj == NULL);
 		cont = xbdi->xbdi_cont;
#ifdef DIAGNOSTIC
 		xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF;
@@ -1623,6 +1779,13 @@ xbdback_trampoline(struct xbdback_instan
 			    "xbdi->xbdi_cont!\n", (long)cont);
 			panic("xbdback_trampoline: bad continuation");
 		}
+		if (xbdi->xbdi_cont_restart != NULL ||
+		    xbdi->xbdi_cont_restart_obj != NULL) {
+			KASSERT(xbdi->xbdi_cont_restart != NULL);
+			KASSERT(xbdi->xbdi_cont_restart_obj != NULL);
+			KASSERT(xbdi->xbdi_cont == NULL);
+			KASSERT(obj == NULL);
+		}
#endif
 	}
 }
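The other mechanism this pull-up reintroduces -- restarting a continuation
at a specific point after the thread sleeps and wakes -- can likewise be
reduced to a small model.  The sketch below is illustrative only, not the
committed code: inst, cont_t, park() and run_once() are invented names
standing in for xbdback_instance, xbdback_cont_t, the
xbdi_cont_restart/xbdi_cont_restart_obj pair, and the
xbdback_thread()/xbdback_trampoline() loop; the real code additionally
holds xbdi_lock and asserts the invariants visible in the diff.

#include <stddef.h>

struct inst;
typedef void *(*cont_t)(struct inst *, void *);

struct inst {
	cont_t cont;		/* next step of the normal chain */
	cont_t cont_restart;	/* if set, resume here after wakeup */
	void *cont_restart_obj;	/* argument to hand back on resume */
};

/* Called by a continuation that must wait (e.g. bounce buffer busy). */
static void *
park(struct inst *xi, cont_t me, void *obj)
{
	xi->cont_restart = me;
	xi->cont_restart_obj = obj;
	xi->cont = NULL;	/* stop the trampoline loop */
	return NULL;
}

/* One scheduling round of the worker thread, after it was woken up. */
static void
run_once(struct inst *xi, void *default_obj)
{
	void *obj = default_obj;

	if (xi->cont_restart != NULL) {
		/* resume exactly where we left off, with the saved object */
		xi->cont = xi->cont_restart;
		obj = xi->cont_restart_obj;
		xi->cont_restart = NULL;
		xi->cont_restart_obj = NULL;
	}
	while (obj != NULL && xi->cont != NULL) {
		cont_t c = xi->cont;	/* may call park() and return NULL */
		obj = c(xi, obj);
	}
}

Stalling the ring this way lets xbdback_co_io_gotio() simply park itself
when the single per-instance bounce buffer is busy; the next iodone()
wakes the thread, the restart check hands the saved xbd_io back to the
same continuation, and processing resumes exactly where it stopped.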