Jason Wang <jasow...@redhat.com> wrote on Tue, Jul 21, 2020 at 1:30 PM:
>
>
> On 2020/7/21 12:33 PM, Li Qiang wrote:
> > Jason Wang <jasow...@redhat.com> wrote on Tue, Jul 21, 2020 at 10:03 AM:
> >>
> >> On 2020/7/21 12:45 AM, Li Qiang wrote:
> >>> Alexander Bulekov reported a UAF bug related to e1000e packet send.
> >>>
> >>> -->https://bugs.launchpad.net/qemu/+bug/1886362
> >>>
> >>> This happens because the guest can trigger an e1000e packet send with
> >>> the data's address set to e1000e's own MMIO address. So when the e1000e
> >>> does the DMA, it writes that MMIO again, triggering re-entrancy, which
> >>> finally causes this UAF.
> >>>
> >>> Paolo suggested using a bottom half whenever MMIO is doing something
> >>> complicated, here:
> >>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
> >>>
> >>> Quoting from there:
> >>> 'The easiest solution is to delay processing of descriptors to a bottom
> >>> half whenever MMIO is doing something complicated. This is also better
> >>> for latency because it will free the vCPU thread more quickly and leave
> >>> the work to the I/O thread.'
> >>>
> >>> This patch fixes this UAF.
> >>>
> >>> Signed-off-by: Li Qiang <liq...@163.com>
> >>> ---
> >>> Changes since v1, per Jason's review here:
> >>> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
> >>> 1. Cancel and schedule the tx bh when the VM is stopped or resumed.
> >>> 2. Add a tx_burst property to e1000e to throttle the bh execution.
> >>> 3. Add a tx_waiting flag to record whether the bh is pending or not.
> >>> Don't take the BQL in the tx_bh handler: the BQL is already held when
> >>> tx_bh is executed.
> >>>
> >>>  hw/net/e1000e.c      |   6 +++
> >>>  hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
> >>>  hw/net/e1000e_core.h |   8 ++++
> >>>  3 files changed, 98 insertions(+), 22 deletions(-)
> >>>
> >>> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
> >>> index fda34518c9..24e35a78bf 100644
> >>> --- a/hw/net/e1000e.c
> >>> +++ b/hw/net/e1000e.c
> >>> @@ -77,10 +77,14 @@ typedef struct E1000EState {
> >>>
> >>>      bool disable_vnet;
> >>>
> >>> +    int32_t tx_burst;
> >>> +
> >>>      E1000ECore core;
> >>>
> >>>  } E1000EState;
> >>>
> >>> +#define TX_BURST 256
> >>> +
> >>>  #define E1000E_MMIO_IDX     0
> >>>  #define E1000E_FLASH_IDX    1
> >>>  #define E1000E_IO_IDX       2
> >>> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
> >>>  {
> >>>      s->core.owner = &s->parent_obj;
> >>>      s->core.owner_nic = s->nic;
> >>> +    s->core.tx_burst = s->tx_burst;
> >>>  }
> >>>
> >>>  static void
> >>> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
> >>>                         e1000e_prop_subsys_ven, uint16_t),
> >>>      DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
> >>>                         e1000e_prop_subsys, uint16_t),
> >>> +    DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
> >>>      DEFINE_PROP_END_OF_LIST(),
> >>>  };
> >>>
> >>> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> >>> index bcd186cac5..0a38a50cca 100644
> >>> --- a/hw/net/e1000e_core.c
> >>> +++ b/hw/net/e1000e_core.c
> >>> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
> >>>      rxr->i = &i[idx];
> >>>  }
> >>>
> >>> -static void
> >>> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> >>> +static int32_t
> >>> +e1000e_start_xmit(struct e1000e_tx *q)
> >>>  {
> >>> +    E1000ECore *core = q->core;
> >>>      dma_addr_t base;
> >>>      struct e1000_tx_desc desc;
> >>> -    bool ide = false;
> >>> -    const E1000E_RingInfo *txi = txr->i;
> >>> -    uint32_t cause = E1000_ICS_TXQE;
> >>> +    const E1000E_RingInfo *txi;
> >>> +    E1000E_TxRing txr;
> >>> +    int32_t num_packets = 0;
> >>>
> >>> -    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> >>> -        trace_e1000e_tx_disabled();
> >>> -        return;
> >>> -    }
> >>> +    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
> >>> +    txi = txr.i;
> >>>
> >>>      while (!e1000e_ring_empty(core, txi)) {
> >>>          base = e1000e_ring_head_descr(core, txi);
> >>> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> >>>          trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
> >>>                                desc.lower.data, desc.upper.data);
> >>>
> >>> -        e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
> >>> -        cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
> >>> +        e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
> >>> +        q->cause |= e1000e_txdesc_writeback(core, base, &desc,
> >>> +                                            &q->ide, txi->idx);
> >>>
> >>>          e1000e_ring_advance(core, txi, 1);
> >>> +        if (++num_packets >= core->tx_burst) {
> >>> +            break;
> >>> +        }
> >>>      }
> >>>
> >>> -    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
> >>> -        e1000e_set_interrupt_cause(core, cause);
> >>> -    }
> >>> +    return num_packets;
> >>>  }
> >>>
> >>>  static bool
> >>> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
> >>>  static void
> >>>  e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
> >>>  {
> >>> -    E1000E_TxRing txr;
> >>>      core->mac[index] = val;
> >>>
> >>>      if (core->mac[TARC0] & E1000_TARC_ENABLE) {
> >>> -        e1000e_tx_ring_init(core, &txr, 0);
> >>> -        e1000e_start_xmit(core, &txr);
> >>> +        if (core->tx[0].tx_waiting) {
> >>> +            return;
> >>> +        }
> >>> +        core->tx[0].tx_waiting = 1;
> >>> +        if (!core->vm_running) {
> >>> +            return;
> >>> +        }
> >>> +        qemu_bh_schedule(core->tx[0].tx_bh);
> >>>      }
> >>>
> >>>      if (core->mac[TARC1] & E1000_TARC_ENABLE) {
> >>> -        e1000e_tx_ring_init(core, &txr, 1);
> >>> -        e1000e_start_xmit(core, &txr);
> >>> +        if (core->tx[1].tx_waiting) {
> >>> +            return;
> >>> +        }
> >>> +        core->tx[1].tx_waiting = 1;
> >>> +        if (!core->vm_running) {
> >>> +            return;
> >>> +        }
> >>> +        qemu_bh_schedule(core->tx[1].tx_bh);
> >>>      }
> >>>  }
> >>>
> >>>  static void
> >>>  e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
> >>>  {
> >>> -    E1000E_TxRing txr;
> >>>      int qidx = e1000e_mq_queue_idx(TDT, index);
> >>>      uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
> >>>
> >>>      core->mac[index] = val & 0xffff;
> >>>
> >>>      if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
> >>> -        e1000e_tx_ring_init(core, &txr, qidx);
> >>> -        e1000e_start_xmit(core, &txr);
> >>> +        qemu_bh_schedule(core->tx[qidx].tx_bh);
> >>>      }
> >>>  }
> >>>
> >>> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
> >>>          trace_e1000e_vm_state_running();
> >>>          e1000e_intrmgr_resume(core);
> >>>          e1000e_autoneg_resume(core);
> >>> +        core->vm_running = 1;
> >>> +
> >>> +        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> +            qemu_bh_schedule(core->tx[i].tx_bh);
> >>
> >>
> >> I guess the reason for the unconditional scheduling of the bh is to make
> >> sure tx works after live migration, since we don't migrate tx_waiting.
> >>
> >> If yes, better add a comment here.
> > Ok, will do in the next revision.
> >
> > And do we need to clear tx_waiting here?
> >
> > I think there is no need, as the tx_bh handler will do this.
>
>
> Yes.
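
For context, the overall shape of the fix is the deferral pattern Paolo
suggested: the MMIO write handler only marks work pending and schedules a
bottom half, and all descriptor DMA happens later in the I/O thread, so a
guest that points a descriptor back at the device's MMIO window can no
longer re-enter the handler. A minimal sketch of that pattern, where MyNIC,
my_nic_process_ring() and my_nic_mmio_write() are hypothetical names and
not code from this patch:

#include "qemu/osdep.h"
#include "qemu/main-loop.h"

typedef struct MyNIC {
    QEMUBH *tx_bh;
    uint32_t tx_waiting;            /* nonzero while a bh run is pending */
} MyNIC;

static void my_nic_tx_bh(void *opaque)
{
    MyNIC *s = opaque;

    s->tx_waiting = 0;
    /* Descriptor processing and DMA go here, in the I/O thread,
     * outside the vCPU's MMIO dispatch path:
     * my_nic_process_ring(s);
     */
}

static void my_nic_mmio_write(MyNIC *s, uint64_t val)
{
    if (s->tx_waiting) {
        return;                     /* a run is already scheduled */
    }
    s->tx_waiting = 1;
    qemu_bh_schedule(s->tx_bh);     /* defer: no DMA from MMIO context */
}

The bh itself would be created once at realize time with
s->tx_bh = qemu_bh_new(my_nic_tx_bh, s); and torn down with
qemu_bh_cancel()/qemu_bh_delete(), as the patch does for each tx queue.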
>
>
>
>
> >>
> >>> +        }
> >>> +
> >>>      } else {
> >>>          trace_e1000e_vm_state_stopped();
> >>> +
> >>> +        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> +            qemu_bh_cancel(core->tx[i].tx_bh);
> >>> +        }
> >>> +
> >>>          e1000e_autoneg_pause(core);
> >>>          e1000e_intrmgr_pause(core);
> >>> +        core->vm_running = 0;
> >>> +    }
> >>> +}
> >>> +
> >>> +
> >>> +static void e1000e_core_tx_bh(void *opaque)
> >>> +{
> >>> +    struct e1000e_tx *q = opaque;
> >>> +    E1000ECore *core = q->core;
> >>> +    int32_t ret;
> >>> +
> >>> +    if (!core->vm_running) {
> >>> +        assert(q->tx_waiting);
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    q->tx_waiting = 0;
> >>> +
> >>> +    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> >>> +        trace_e1000e_tx_disabled();
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    q->cause = E1000_ICS_TXQE;
> >>> +    q->ide = false;
> >>> +
> >>> +    ret = e1000e_start_xmit(q);
> >>> +    if (ret >= core->tx_burst) {
> >>> +        qemu_bh_schedule(q->tx_bh);
> >>> +        q->tx_waiting = 1;
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> >>> +        e1000e_set_interrupt_cause(core, q->cause);
> >>
> >>
> >> So I think this will cause some delay in the interrupt delivery.
> > Exactly: if a tx burst occurs, the interrupt won't be triggered in the
> > first tx_bh handler run.
> > Since we give the vCPU more time, maybe this is acceptable?
>
>
> I think not. e1000e has its own interrupt delay mechanism; let's keep it
> working as much as possible.
>
>
> >> It looks to me like it's better to leave the ICS setting in
> >> e1000e_start_xmit().
> > I think letting e1000e_start_xmit() just send packets is cleaner.
> > Leaving the ICS setting in it will not improve performance as far as I
> > can see.
> > What's your idea for leaving it in e1000e_start_xmit()?
>
>
> I just worry about the delay of the interrupt if you do it in
> e1000e_core_tx_bh(). Just let e1000e_start_xmit() return the number
> of packets transmitted and keep most of its code untouched.
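
One possible reading of that suggestion, sketched from the hunks above:
e1000e_start_xmit() keeps the TCTL_EN check and the interrupt-cause handling
it has today and only additionally returns the number of packets sent. This
is an assumption about what is being proposed, not merged code, and the
descriptor read elided by the hunk is left as a comment:

static int32_t
e1000e_start_xmit(struct e1000e_tx *q)
{
    E1000ECore *core = q->core;
    dma_addr_t base;
    struct e1000_tx_desc desc;
    const E1000E_RingInfo *txi;
    E1000E_TxRing txr;
    bool ide = false;
    uint32_t cause = E1000_ICS_TXQE;
    int32_t num_packets = 0;

    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
        trace_e1000e_tx_disabled();
        return 0;
    }

    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
    txi = txr.i;

    while (!e1000e_ring_empty(core, txi)) {
        base = e1000e_ring_head_descr(core, txi);
        /* ... read the descriptor at 'base' into 'desc', as today ... */

        e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
        cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);

        e1000e_ring_advance(core, txi, 1);
        if (++num_packets >= core->tx_burst) {
            break;      /* budget used up; the caller reschedules the bh */
        }
    }

    /* Raising the cause here keeps the device's own interrupt-delay
     * mechanism working on every partial run. */
    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
        e1000e_set_interrupt_cause(core, cause);
    }

    return num_packets;
}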
Sorry, let me make sure I got your meaning. 'The set ics' means 'set the
interrupt cause', i.e. this code:

+    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
+        e1000e_set_interrupt_cause(core, q->cause);

Do you mean we should leave it in e1000e_start_xmit()? My understanding is
that, if so, we should also add the following code to it:

+    if (ret >= core->tx_burst) {
+        qemu_bh_schedule(q->tx_bh);
+        q->tx_waiting = 1;
+        return;
+    }

Then the tx_bh handler would just call e1000e_start_xmit(). Right?

>
>
>
> >>> +    }
> >>> +}
> >>>
> >>> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core,
> >>>                           e1000e_autoneg_timer, core);
> >>>      e1000e_intrmgr_pci_realize(core);
> >>>
> >>> +    core->vm_running = runstate_is_running();
> >>>      core->vmstate =
> >>>          qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
> >>>
> >>>      for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>>          net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
> >>>                          E1000E_MAX_TX_FRAGS, core->has_vnet);
> >>> +        core->tx[i].core = core;
> >>> +        core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
> >>>      }
> >>>
> >>>      net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
> >>> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
> >>>      for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>>          net_tx_pkt_reset(core->tx[i].tx_pkt);
> >>>          net_tx_pkt_uninit(core->tx[i].tx_pkt);
> >>> +        qemu_bh_cancel(core->tx[i].tx_bh);
> >>> +        qemu_bh_delete(core->tx[i].tx_bh);
> >>> +        core->tx[i].tx_bh = NULL;
> >>>      }
> >>>
> >>>      net_rx_pkt_uninit(core->rx_pkt);
> >>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
> >>> index aee32f7e48..0c16dce3a6 100644
> >>> --- a/hw/net/e1000e_core.h
> >>> +++ b/hw/net/e1000e_core.h
> >>> @@ -77,10 +77,18 @@ struct E1000Core {
> >>>          unsigned char sum_needed;
> >>>          bool cptse;
> >>>          struct NetTxPkt *tx_pkt;
> >>> +        QEMUBH *tx_bh;
> >>> +        uint32_t tx_waiting;
> >>> +        uint32_t cause;
> >>> +        bool ide;
> >>> +        E1000ECore *core;
> >>>      } tx[E1000E_NUM_QUEUES];
> >>>
> >>>      struct NetRxPkt *rx_pkt;
> >>>
> >>> +    int32_t tx_burst;
> >>> +
> >>> +    bool vm_running;
> >>>      bool has_vnet;
> >>>      int max_queue_num;
> >>>
> >>
> >> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
> > I think we do. But why doesn't virtio-net do this? Maybe it is missing
> > this as well?
>
>
> Ok, so I think the current code is probably OK.

So my next revision will do the following:
1. Add a comment for the tx_bh scheduling when the VM resumes.
2. Leave some code in e1000e_start_xmit().
3. Cancel the tx_bh and reset tx_waiting in e1000e_core_reset() (see the
   sketch at the end of this mail).

If I missed or misunderstood anything, please let me know.

Thanks,
Li Qiang

>
> virtio-net checks DRIVER_OK and the code here checks TCTL_EN.
>
> Thanks
>
>
> > Thanks,
> > Li Qiang
>
> >> Thanks
> >>
>
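
A minimal sketch of item 3 above: cancel the bh and clear tx_waiting on
device reset, so a stale bottom half cannot run against freshly reset ring
state. The helper name and its call site in e1000e_core_reset() are
assumptions, not code from this series:

static void
e1000e_core_reset_tx_bh(E1000ECore *core)
{
    for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
        /* Drop any pending run and forget that one was requested. */
        qemu_bh_cancel(core->tx[i].tx_bh);
        core->tx[i].tx_waiting = 0;
    }
}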