Jason Wang <jasow...@redhat.com> wrote on Tue, Jul 21, 2020 at 1:30 PM:
>
>
> On 2020/7/21 12:33 PM, Li Qiang wrote:
> > Jason Wang <jasow...@redhat.com> wrote on Tue, Jul 21, 2020 at 10:03 AM:
> >>
> >> On 2020/7/21 12:45 AM, Li Qiang wrote:
> >>> Alexander Bulekov reported a UAF bug related to e1000e packet send.
> >>>
> >>> -->https://bugs.launchpad.net/qemu/+bug/1886362
> >>>
> >>> This happens because the guest can trigger an e1000e packet send with
> >>> the data's address set to e1000e's own MMIO address. So when the e1000e
> >>> does the DMA, it writes that MMIO again, triggering re-entrancy, which
> >>> finally causes this UAF.
> >>>
> >>> Paolo suggested using a bottom half whenever MMIO is doing something
> >>> complicated, here:
> >>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
> >>>
> >>> Quoting from there:
> >>> 'The easiest solution is to delay processing of descriptors to a bottom
> >>> half whenever MMIO is doing something complicated. This is also better
> >>> for latency because it will free the vCPU thread more quickly and leave
> >>> the work to the I/O thread.'
> >>>
> >>> This patch fixes this UAF.
> >>>
> >>> Signed-off-by: Li Qiang <liq...@163.com>
> >>> ---
> >>> Changes since v1, per Jason's review here:
> >>> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
> >>> 1. Cancel and schedule the tx bh when the VM is stopped or resumed.
> >>> 2. Add a tx_burst property to e1000e to throttle the bh execution.
> >>> 3. Add a tx_waiting flag to record whether the bh is pending or not.
> >>> Don't take the BQL in the tx_bh handler: the BQL is already held when
> >>> tx_bh is executed.
> >>>
> >>>  hw/net/e1000e.c      |   6 +++
> >>>  hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
> >>>  hw/net/e1000e_core.h |   8 ++++
> >>>  3 files changed, 98 insertions(+), 22 deletions(-)
> >>>
> >>> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
> >>> index fda34518c9..24e35a78bf 100644
> >>> --- a/hw/net/e1000e.c
> >>> +++ b/hw/net/e1000e.c
> >>> @@ -77,10 +77,14 @@ typedef struct E1000EState {
> >>>
> >>>      bool disable_vnet;
> >>>
> >>> +    int32_t tx_burst;
> >>> +
> >>>      E1000ECore core;
> >>>
> >>>  } E1000EState;
> >>>
> >>> +#define TX_BURST 256
> >>> +
> >>>  #define E1000E_MMIO_IDX     0
> >>>  #define E1000E_FLASH_IDX    1
> >>>  #define E1000E_IO_IDX       2
> >>> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
> >>>  {
> >>>      s->core.owner = &s->parent_obj;
> >>>      s->core.owner_nic = s->nic;
> >>> +    s->core.tx_burst = s->tx_burst;
> >>>  }
> >>>
> >>>  static void
> >>> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
> >>>                         e1000e_prop_subsys_ven, uint16_t),
> >>>      DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
> >>>                         e1000e_prop_subsys, uint16_t),
> >>> +    DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
> >>>      DEFINE_PROP_END_OF_LIST(),
> >>>  };
> >>>
> >>> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> >>> index bcd186cac5..0a38a50cca 100644
> >>> --- a/hw/net/e1000e_core.c
> >>> +++ b/hw/net/e1000e_core.c
> >>> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
> >>>      rxr->i = &i[idx];
> >>>  }
> >>>
> >>> -static void
> >>> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> >>> +static int32_t
> >>> +e1000e_start_xmit(struct e1000e_tx *q)
> >>>  {
> >>> +    E1000ECore *core = q->core;
> >>>      dma_addr_t base;
> >>>      struct e1000_tx_desc desc;
> >>> -    bool ide = false;
> >>> -    const E1000E_RingInfo *txi = txr->i;
> >>> -    uint32_t cause = E1000_ICS_TXQE;
> >>> +    const E1000E_RingInfo *txi;
> >>> +    E1000E_TxRing txr;
> >>> +    int32_t num_packets = 0;
> >>>
> >>> -    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> >>> -        trace_e1000e_tx_disabled();
> >>> -        return;
> >>> -    }
> >>> +    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
> >>> +    txi = txr.i;
> >>>
> >>>      while (!e1000e_ring_empty(core, txi)) {
> >>>          base = e1000e_ring_head_descr(core, txi);
> >>> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> >>>          trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
> >>>                                desc.lower.data, desc.upper.data);
> >>>
> >>> -        e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
> >>> -        cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
> >>> +        e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
> >>> +        q->cause |= e1000e_txdesc_writeback(core, base, &desc,
> >>> +                                            &q->ide, txi->idx);
> >>>
> >>>          e1000e_ring_advance(core, txi, 1);
> >>> +        if (++num_packets >= core->tx_burst) {
> >>> +            break;
> >>> +        }
> >>>      }
> >>>
> >>> -    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
> >>> -        e1000e_set_interrupt_cause(core, cause);
> >>> -    }
> >>> +    return num_packets;
> >>>  }
> >>>
> >>>  static bool
> >>> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
> >>>  static void
> >>>  e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
> >>>  {
> >>> -    E1000E_TxRing txr;
> >>>      core->mac[index] = val;
> >>>
> >>>      if (core->mac[TARC0] & E1000_TARC_ENABLE) {
> >>> -        e1000e_tx_ring_init(core, &txr, 0);
> >>> -        e1000e_start_xmit(core, &txr);
> >>> +        if (core->tx[0].tx_waiting) {
> >>> +            return;
> >>> +        }
> >>> +        core->tx[0].tx_waiting = 1;
> >>> +        if (!core->vm_running) {
> >>> +            return;
> >>> +        }
> >>> +        qemu_bh_schedule(core->tx[0].tx_bh);
> >>>      }
> >>>
> >>>      if (core->mac[TARC1] & E1000_TARC_ENABLE) {
> >>> -        e1000e_tx_ring_init(core, &txr, 1);
> >>> -        e1000e_start_xmit(core, &txr);
> >>> +        if (core->tx[1].tx_waiting) {
> >>> +            return;
> >>> +        }
> >>> +        core->tx[1].tx_waiting = 1;
> >>> +        if (!core->vm_running) {
> >>> +            return;
> >>> +        }
> >>> +        qemu_bh_schedule(core->tx[1].tx_bh);
> >>>      }
> >>>  }
> >>>
> >>>  static void
> >>>  e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
> >>>  {
> >>> -    E1000E_TxRing txr;
> >>>      int qidx = e1000e_mq_queue_idx(TDT, index);
> >>>      uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
> >>>
> >>>      core->mac[index] = val & 0xffff;
> >>>
> >>>      if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
> >>> -        e1000e_tx_ring_init(core, &txr, qidx);
> >>> -        e1000e_start_xmit(core, &txr);
> >>> +        qemu_bh_schedule(core->tx[qidx].tx_bh);
> >>>      }
> >>>  }
> >>>
> >>> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
> >>>          trace_e1000e_vm_state_running();
> >>>          e1000e_intrmgr_resume(core);
> >>>          e1000e_autoneg_resume(core);
> >>> +        core->vm_running = 1;
> >>> +
> >>> +        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> +            qemu_bh_schedule(core->tx[i].tx_bh);
> >>
> >>
> >> I guess the reason for the unconditional scheduling of the bh is to make
> >> sure tx works after live migration, since we don't migrate tx_waiting.
> >>
> >> If yes, better add a comment here.
> > Ok, will do in the next revision.
> >
> > And do we need to clear tx_waiting here?
> >
> > I think there is no need, as the tx_bh handler will do this.
>
>
> Yes.
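
For context, the overall shape of the fix is the deferral pattern Paolo
suggested: the MMIO write handler only marks work pending and schedules a
bottom half, and all descriptor DMA happens later in the I/O thread, so a
guest that points a descriptor back at the device's MMIO window can no
longer re-enter the handler. A minimal sketch of that pattern, where MyNIC,
my_nic_process_ring() and my_nic_mmio_write() are hypothetical names and
not code from this patch:

#include "qemu/osdep.h"
#include "qemu/main-loop.h"

typedef struct MyNIC {
    QEMUBH *tx_bh;
    uint32_t tx_waiting;            /* nonzero while a bh run is pending */
} MyNIC;

static void my_nic_tx_bh(void *opaque)
{
    MyNIC *s = opaque;

    s->tx_waiting = 0;
    /* Descriptor processing and DMA go here, in the I/O thread,
     * outside the vCPU's MMIO dispatch path:
     * my_nic_process_ring(s);
     */
}

static void my_nic_mmio_write(MyNIC *s, uint64_t val)
{
    if (s->tx_waiting) {
        return;                     /* a run is already scheduled */
    }
    s->tx_waiting = 1;
    qemu_bh_schedule(s->tx_bh);     /* defer: no DMA from MMIO context */
}

The bh itself would be created once at realize time with
s->tx_bh = qemu_bh_new(my_nic_tx_bh, s); and torn down with
qemu_bh_cancel()/qemu_bh_delete(), as the patch does for each tx queue.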
>
>
>
>
> >>
> >>> +        }
> >>> +
> >>>      } else {
> >>>          trace_e1000e_vm_state_stopped();
> >>> +
> >>> +        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> +            qemu_bh_cancel(core->tx[i].tx_bh);
> >>> +        }
> >>> +
> >>>          e1000e_autoneg_pause(core);
> >>>          e1000e_intrmgr_pause(core);
> >>> +        core->vm_running = 0;
> >>> +    }
> >>> +}
> >>> +
> >>> +
> >>> +static void e1000e_core_tx_bh(void *opaque)
> >>> +{
> >>> +    struct e1000e_tx *q = opaque;
> >>> +    E1000ECore *core = q->core;
> >>> +    int32_t ret;
> >>> +
> >>> +    if (!core->vm_running) {
> >>> +        assert(q->tx_waiting);
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    q->tx_waiting = 0;
> >>> +
> >>> +    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> >>> +        trace_e1000e_tx_disabled();
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    q->cause = E1000_ICS_TXQE;
> >>> +    q->ide = false;
> >>> +
> >>> +    ret = e1000e_start_xmit(q);
> >>> +    if (ret >= core->tx_burst) {
> >>> +        qemu_bh_schedule(q->tx_bh);
> >>> +        q->tx_waiting = 1;
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> >>> +        e1000e_set_interrupt_cause(core, q->cause);
> >>
> >>
> >> So I think this will cause some delay in the interrupt delivery.
> > Exactly: if a tx burst occurs, the interrupt won't be triggered in the
> > first tx_bh handler run.
> > Since we give the vCPU more time, maybe this is acceptable?
>
>
> I think not. e1000e has its own interrupt delay mechanism; let's keep it
> working as much as possible.
>
>
> >> It looks to me like it's better to leave the ICS setting in
> >> e1000e_start_xmit().
> > I think letting e1000e_start_xmit() just send packets is cleaner.
> > Leaving the ICS setting in it will not improve performance as far as I
> > can see.
> > What's your idea for leaving it in e1000e_start_xmit()?
>
>
> I just worry about the delay of the interrupt if you do it in
> e1000e_core_tx_bh(). Just let e1000e_start_xmit() return the number
> of packets transmitted and keep most of its code untouched.
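
One possible reading of that suggestion, sketched from the hunks above:
e1000e_start_xmit() keeps the TCTL_EN check and the interrupt-cause handling
it has today and only additionally returns the number of packets sent. This
is an assumption about what is being proposed, not merged code, and the
descriptor read elided by the hunk is left as a comment:

static int32_t
e1000e_start_xmit(struct e1000e_tx *q)
{
    E1000ECore *core = q->core;
    dma_addr_t base;
    struct e1000_tx_desc desc;
    const E1000E_RingInfo *txi;
    E1000E_TxRing txr;
    bool ide = false;
    uint32_t cause = E1000_ICS_TXQE;
    int32_t num_packets = 0;

    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
        trace_e1000e_tx_disabled();
        return 0;
    }

    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
    txi = txr.i;

    while (!e1000e_ring_empty(core, txi)) {
        base = e1000e_ring_head_descr(core, txi);
        /* ... read the descriptor at 'base' into 'desc', as today ... */

        e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
        cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);

        e1000e_ring_advance(core, txi, 1);
        if (++num_packets >= core->tx_burst) {
            break;      /* budget used up; the caller reschedules the bh */
        }
    }

    /* Raising the cause here keeps the device's own interrupt-delay
     * mechanism working on every partial run. */
    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
        e1000e_set_interrupt_cause(core, cause);
    }

    return num_packets;
}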
Sorry, let me make sure I got your meaning. 'The set ics' means 'set the
interrupt cause', i.e. this code:

+    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
+        e1000e_set_interrupt_cause(core, q->cause);

Do you mean we should leave it in e1000e_start_xmit()? My understanding is
that, if so, we should also add the following code to it:

+    if (ret >= core->tx_burst) {
+        qemu_bh_schedule(q->tx_bh);
+        q->tx_waiting = 1;
+        return;
+    }

Then the tx_bh handler would just call e1000e_start_xmit(). Right?

>
>
>
> >>> +    }
> >>> +}
> >>>
> >>> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core,
> >>>                           e1000e_autoneg_timer, core);
> >>>      e1000e_intrmgr_pci_realize(core);
> >>>
> >>> +    core->vm_running = runstate_is_running();
> >>>      core->vmstate =
> >>>          qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
> >>>
> >>>      for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>>          net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
> >>>                          E1000E_MAX_TX_FRAGS, core->has_vnet);
> >>> +        core->tx[i].core = core;
> >>> +        core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
> >>>      }
> >>>
> >>>      net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
> >>> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
> >>>      for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>>          net_tx_pkt_reset(core->tx[i].tx_pkt);
> >>>          net_tx_pkt_uninit(core->tx[i].tx_pkt);
> >>> +        qemu_bh_cancel(core->tx[i].tx_bh);
> >>> +        qemu_bh_delete(core->tx[i].tx_bh);
> >>> +        core->tx[i].tx_bh = NULL;
> >>>      }
> >>>
> >>>      net_rx_pkt_uninit(core->rx_pkt);
> >>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
> >>> index aee32f7e48..0c16dce3a6 100644
> >>> --- a/hw/net/e1000e_core.h
> >>> +++ b/hw/net/e1000e_core.h
> >>> @@ -77,10 +77,18 @@ struct E1000Core {
> >>>          unsigned char sum_needed;
> >>>          bool cptse;
> >>>          struct NetTxPkt *tx_pkt;
> >>> +        QEMUBH *tx_bh;
> >>> +        uint32_t tx_waiting;
> >>> +        uint32_t cause;
> >>> +        bool ide;
> >>> +        E1000ECore *core;
> >>>      } tx[E1000E_NUM_QUEUES];
> >>>
> >>>      struct NetRxPkt *rx_pkt;
> >>>
> >>> +    int32_t tx_burst;
> >>> +
> >>> +    bool vm_running;
> >>>      bool has_vnet;
> >>>      int max_queue_num;
> >>>
> >>
> >> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
> > I think we do. But why doesn't virtio-net do this? Maybe it is missing
> > this as well?
>
>
> Ok, so I think the current code is probably OK.

So my next revision will do the following:
1. Add a comment for the tx_bh scheduling when the VM resumes.
2. Leave some code in e1000e_start_xmit().
3. Cancel the tx_bh and reset tx_waiting in e1000e_core_reset() (see the
   sketch at the end of this mail).

If I missed or misunderstood anything, please let me know.

Thanks,
Li Qiang

>
> virtio-net checks DRIVER_OK and the code here checks TCTL_EN.
>
> Thanks
>
>
> > Thanks,
> > Li Qiang
>
> >> Thanks
> >>
>
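
A minimal sketch of item 3 above: cancel the bh and clear tx_waiting on
device reset, so a stale bottom half cannot run against freshly reset ring
state. The helper name and its call site in e1000e_core_reset() are
assumptions, not code from this series:

static void
e1000e_core_reset_tx_bh(E1000ECore *core)
{
    for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
        /* Drop any pending run and forget that one was requested. */
        qemu_bh_cancel(core->tx[i].tx_bh);
        core->tx[i].tx_waiting = 0;
    }
}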