On 01/11/2015 11:57 AM, sfel...@gmail.com wrote:
> From: Scott Feldman <sfel...@gmail.com>
>
> Rocker is a simulated ethernet switch device. The device supports up to 62
> front-panel ports and supports L2 switching and L3 routing functions, as well
> as L2/L3/L4 ACLs. The device presents a single PCI device for each switch,
> with a memory-mapped register space for device driver access.
>
> Rocker device is invoked with -device, for example a 4-port switch:
>
>   -device rocker,name=sw1,len-ports=4,ports[0]=dev0,ports[1]=dev1, \
>           ports[2]=dev2,ports[3]=dev3
>
> Each port is a netdev and can be paired with using -netdev id=<port name>.
>
> Signed-off-by: Scott Feldman <sfel...@gmail.com>
> Signed-off-by: Jiri Pirko <j...@resnulli.us>
> ---
Looks like the device does not support state saving. How about tagging it
as unmigratable first? (A minimal sketch is below, after the quoted hunk.)

>  default-configs/pci.mak       |    1 +
>  hw/net/Makefile.objs          |    3 +
>  hw/net/rocker/rocker.c        | 1395 +++++++++++++++++++++++++
>  hw/net/rocker/rocker.h        |   76 ++
>  hw/net/rocker/rocker_desc.c   |  379 +++++++
>  hw/net/rocker/rocker_desc.h   |   57 +
>  hw/net/rocker/rocker_fp.c     |  232 +++++
>  hw/net/rocker/rocker_fp.h     |   53 +
>  hw/net/rocker/rocker_hw.h     |  475 +++++++++
>  hw/net/rocker/rocker_of_dpa.c | 2314 +++++++++++++++++++++++++++++++++++++++++
>  hw/net/rocker/rocker_of_dpa.h |   25 +
>  hw/net/rocker/rocker_tlv.h    |  244 +++++
>  hw/net/rocker/rocker_world.c  |  108 ++
>  hw/net/rocker/rocker_world.h  |   63 ++
>  14 files changed, 5425 insertions(+)
>  create mode 100644 hw/net/rocker/rocker.c
>  create mode 100644 hw/net/rocker/rocker.h
>  create mode 100644 hw/net/rocker/rocker_desc.c
>  create mode 100644 hw/net/rocker/rocker_desc.h
>  create mode 100644 hw/net/rocker/rocker_fp.c
>  create mode 100644 hw/net/rocker/rocker_fp.h
>  create mode 100644 hw/net/rocker/rocker_hw.h
>  create mode 100644 hw/net/rocker/rocker_of_dpa.c
>  create mode 100644 hw/net/rocker/rocker_of_dpa.h
>  create mode 100644 hw/net/rocker/rocker_tlv.h
>  create mode 100644 hw/net/rocker/rocker_world.c
>  create mode 100644 hw/net/rocker/rocker_world.h

[...]

> +
> +typedef struct rocker {
> +    /* private */
> +    PCIDevice parent_obj;
> +    /* public */
> +
> +    MemoryRegion mmio;
> +    MemoryRegion msix_bar;
> +
> +    /* switch configuration */
> +    char *name;                   /* switch name */
> +    uint32_t fp_ports;            /* front-panel port count */
> +    NICPeers *fp_ports_peers;
> +    MACAddr fp_start_macaddr;     /* front-panel port 0 mac addr */
> +    uint64_t switch_id;           /* switch id */

Looks like the functions of name and switch_id are duplicated?

> +
> +    /* front-panel ports */
> +    FpPort *fp_port[ROCKER_FP_PORTS_MAX];
> +
> +    /* register backings */
> +    uint32_t test_reg;
> +    uint64_t test_reg64;
> +    dma_addr_t test_dma_addr;
> +    uint32_t test_dma_size;
> +
> +    /* desc rings */
> +    DescRing **rings;
> +
> +    /* switch worlds */
> +    World *worlds[ROCKER_WORLD_TYPE_MAX];
> +    World *world_dflt;
> +
> +    QLIST_ENTRY(rocker) next;
> +} Rocker;
> +

[...]
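Going back to the migration point: blocking migration until real state
saving exists is a one-liner. A minimal sketch, where rocker_vmsd and the
class_init hunk are just illustrative names to be wired into whatever
class_init the patch already has:

/* sketch only: refuse migration until ring/descriptor state saving is done */
static const VMStateDescription rocker_vmsd = {
    .name = "rocker",
    .unmigratable = 1,
};

static void rocker_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);

    /* ... existing class setup ... */
    dc->vmsd = &rocker_vmsd;
}

That way a migration attempt fails cleanly instead of silently producing a
switch with empty state on the destination.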
> +
> +static int tx_consume(Rocker *r, DescInfo *info)
> +{
> +    PCIDevice *dev = PCI_DEVICE(r);
> +    char *buf = desc_get_buf(info, true);
> +    RockerTlv *tlv_frag;
> +    RockerTlv *tlvs[ROCKER_TLV_TX_MAX + 1];
> +    struct iovec iov[ROCKER_TX_FRAGS_MAX] = { { 0, }, };
> +    uint32_t pport;
> +    uint32_t port;
> +    uint16_t tx_offload = ROCKER_TX_OFFLOAD_NONE;
> +    uint16_t tx_l3_csum_off = 0;
> +    uint16_t tx_tso_mss = 0;
> +    uint16_t tx_tso_hdr_len = 0;
> +    int iovcnt = 0;
> +    int err = 0;
> +    int rem;
> +    int i;
> +
> +    if (!buf) {
> +        return -ENXIO;
> +    }
> +
> +    rocker_tlv_parse(tlvs, ROCKER_TLV_TX_MAX, buf, desc_tlv_size(info));
> +
> +    if (!tlvs[ROCKER_TLV_TX_FRAGS]) {
> +        return -EINVAL;
> +    }
> +
> +    pport = rocker_get_pport_by_tx_ring(r, desc_get_ring(info));
> +    if (!fp_port_from_pport(pport, &port)) {
> +        return -EINVAL;
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_OFFLOAD]) {
> +        tx_offload = rocker_tlv_get_u8(tlvs[ROCKER_TLV_TX_OFFLOAD]);
> +    }
> +
> +    switch (tx_offload) {
> +    case ROCKER_TX_OFFLOAD_L3_CSUM:
> +        if (!tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) {
> +            return -EINVAL;
> +        }
> +    case ROCKER_TX_OFFLOAD_TSO:
> +        if (!tlvs[ROCKER_TLV_TX_TSO_MSS] ||
> +            !tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]) {
> +            return -EINVAL;
> +        }
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) {
> +        tx_l3_csum_off = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]);
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_TSO_MSS]) {
> +        tx_tso_mss = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_TSO_MSS]);
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]) {
> +        tx_tso_hdr_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]);
> +    }
> +
> +    rocker_tlv_for_each_nested(tlv_frag, tlvs[ROCKER_TLV_TX_FRAGS], rem) {
> +        hwaddr frag_addr;
> +        uint16_t frag_len;
> +
> +        if (rocker_tlv_type(tlv_frag) != ROCKER_TLV_TX_FRAG) {
> +            return -EINVAL;
> +        }
> +
> +        rocker_tlv_parse_nested(tlvs, ROCKER_TLV_TX_FRAG_ATTR_MAX, tlv_frag);
> +
> +        if (!tlvs[ROCKER_TLV_TX_FRAG_ATTR_ADDR] ||
> +            !tlvs[ROCKER_TLV_TX_FRAG_ATTR_LEN]) {
> +            return -EINVAL;
> +        }
> +
> +        frag_addr = rocker_tlv_get_le64(tlvs[ROCKER_TLV_TX_FRAG_ATTR_ADDR]);
> +        frag_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_FRAG_ATTR_LEN]);
> +
> +        iov[iovcnt].iov_len = frag_len;
> +        iov[iovcnt].iov_base = g_malloc(frag_len);
> +        if (!iov[iovcnt].iov_base) {
> +            err = -ENOMEM;
> +            goto err_no_mem;
> +        }
> +
> +        if (pci_dma_read(dev, frag_addr, iov[iovcnt].iov_base,
> +                         iov[iovcnt].iov_len)) {
> +            err = -ENXIO;
> +            goto err_bad_io;
> +        }
> +
> +        if (++iovcnt > ROCKER_TX_FRAGS_MAX) {
> +            goto err_too_many_frags;
> +        }
> +    }
> +
> +    if (iovcnt) {
> +        /* XXX perform Tx offloads */
> +        /* XXX silence compiler for now */

Need to add TSO handling here (e1000 is a good reference) or disable TSO in
the driver.

> +        tx_l3_csum_off += tx_tso_mss = tx_tso_hdr_len = 0;
> +    }
> +
> +    err = fp_port_eg(r->fp_port[port], iov, iovcnt);
> +
> +err_no_mem:
> +err_bad_io:
> +err_too_many_frags:
> +    for (i = 0; i < iovcnt; i++) {
> +        if (iov[i].iov_base) {
> +            g_free(iov[i].iov_base);
> +        }
> +    }
> +
> +    return err;
> +}
> +

[...]
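A stop-gap for the offload hook above could look roughly like the below.
Sketch only, untested, IPv4 only; it fails TSO descriptors until real
segmentation is added, assumes the Ethernet and IP headers sit entirely in
iov[0], and uses struct eth_header/struct ip_header from net/eth.h plus
net_raw_checksum() from net/checksum.h:

    if (iovcnt) {
        switch (tx_offload) {
        case ROCKER_TX_OFFLOAD_TSO:
            /* not implemented yet: fail the descriptor so the driver
             * notices, or simply never advertise TSO to the driver */
            err = -ENOTSUP;
            goto err_too_many_frags;        /* reuse the common cleanup */
        case ROCKER_TX_OFFLOAD_L3_CSUM:
            /* assumes the IPv4 header is linear in the first fragment;
             * tx_l3_csum_off could be used to sanity-check the offset */
            if (iov[0].iov_len >= sizeof(struct eth_header) +
                                  sizeof(struct ip_header)) {
                uint8_t *l3 = (uint8_t *)iov[0].iov_base +
                              sizeof(struct eth_header);
                struct ip_header *ip = (struct ip_header *)l3;
                int hlen = (ip->ip_ver_len & 0x0f) * 4;   /* IHL in bytes */

                ip->ip_sum = 0;
                ip->ip_sum = cpu_to_be16(net_raw_checksum(l3, hlen));
            }
            break;
        }
    }

Either way the device and the driver need to agree: if the device is not
going to segment, it must not advertise TSO to the driver in the first
place.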
> +
> +int rocker_event_link_changed(Rocker *r, uint32_t pport, bool link_up)
> +{
> +    DescRing *ring = r->rings[ROCKER_RING_EVENT];
> +    DescInfo *info = desc_ring_fetch_desc(ring);
> +    RockerTlv *nest;
> +    char *buf;
> +    size_t tlv_size;
> +    int pos;
> +    int err;
> +
> +    if (!info) {
> +        return -ENOBUFS;
> +    }
> +
> +    tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) +  /* event type */
> +               rocker_tlv_total_size(0) +                 /* nest */
> +               rocker_tlv_total_size(sizeof(uint32_t)) +  /* pport */
> +               rocker_tlv_total_size(sizeof(uint8_t));    /* link up */
> +
> +    if (tlv_size > desc_buf_size(info)) {
> +        err = -EMSGSIZE;
> +        goto err_too_big;
> +    }
> +
> +    buf = desc_get_buf(info, false);
> +    if (!buf) {
> +        err = -ENOMEM;
> +        goto err_no_mem;
> +    }
> +
> +    pos = 0;
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_TYPE,
> +                        ROCKER_TLV_EVENT_TYPE_LINK_CHANGED);
> +    nest = rocker_tlv_nest_start(buf, &pos, ROCKER_TLV_EVENT_INFO);
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_LINK_CHANGED_PPORT, pport);
> +    rocker_tlv_put_u8(buf, &pos, ROCKER_TLV_EVENT_LINK_CHANGED_LINKUP,
> +                      link_up ? 1 : 0);

Looks like those types are not documented.

> +    rocker_tlv_nest_end(buf, &pos, nest);
> +
> +    err = desc_set_buf(info, tlv_size);
> +
> +err_too_big:
> +err_no_mem:
> +    if (desc_ring_post_desc(ring, err)) {
> +        rocker_msix_irq(r, ROCKER_MSIX_VEC_EVENT);
> +    }
> +
> +    return err;
> +}
> +
> +int rocker_event_mac_vlan_seen(Rocker *r, uint32_t pport, uint8_t *addr,
> +                               uint16_t vlan_id)
> +{
> +    DescRing *ring = r->rings[ROCKER_RING_EVENT];
> +    DescInfo *info;
> +    FpPort *fp_port;
> +    uint32_t port;
> +    RockerTlv *nest;
> +    char *buf;
> +    size_t tlv_size;
> +    int pos;
> +    int err;
> +
> +    if (!fp_port_from_pport(pport, &port)) {
> +        return -EINVAL;
> +    }
> +    fp_port = r->fp_port[port];
> +    if (!fp_port_get_learning(fp_port)) {
> +        return 0;
> +    }
> +
> +    info = desc_ring_fetch_desc(ring);
> +    if (!info) {
> +        return -ENOBUFS;
> +    }
> +
> +    tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) +  /* event type */
> +               rocker_tlv_total_size(0) +                 /* nest */
> +               rocker_tlv_total_size(sizeof(uint32_t)) +  /* pport */
> +               rocker_tlv_total_size(ETH_ALEN) +          /* mac addr */
> +               rocker_tlv_total_size(sizeof(uint16_t));   /* vlan_id */
> +
> +    if (tlv_size > desc_buf_size(info)) {
> +        err = -EMSGSIZE;
> +        goto err_too_big;
> +    }
> +
> +    buf = desc_get_buf(info, false);
> +    if (!buf) {
> +        err = -ENOMEM;
> +        goto err_no_mem;
> +    }
> +
> +    pos = 0;
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_TYPE,
> +                        ROCKER_TLV_EVENT_TYPE_MAC_VLAN_SEEN);
> +    nest = rocker_tlv_nest_start(buf, &pos, ROCKER_TLV_EVENT_INFO);
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_PPORT, pport);
> +    rocker_tlv_put(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_MAC, ETH_ALEN, addr);
> +    rocker_tlv_put_u16(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_VLAN_ID, vlan_id);

Undocumented types.
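Even one-line comments next to the attribute definitions (in rocker_hw.h
and/or the spec text) would do. Purely illustrative, keep whatever numeric
values the header already assigns:

/* illustrative only -- comments are the point, not the values */
enum {
    ROCKER_TLV_EVENT_MAC_VLAN_UNSPEC,
    ROCKER_TLV_EVENT_MAC_VLAN_PPORT,    /* u32: ingress physical port */
    ROCKER_TLV_EVENT_MAC_VLAN_MAC,      /* binary: 6-byte source MAC */
    ROCKER_TLV_EVENT_MAC_VLAN_VLAN_ID,  /* u16: VLAN the MAC was seen on */
};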
> +    rocker_tlv_nest_end(buf, &pos, nest);
> +
> +    err = desc_set_buf(info, tlv_size);
> +
> +err_too_big:
> +err_no_mem:
> +    if (desc_ring_post_desc(ring, err)) {
> +        rocker_msix_irq(r, ROCKER_MSIX_VEC_EVENT);
> +    }
> +
> +    return err;
> +}
> +
> +static DescRing *rocker_get_rx_ring_by_pport(Rocker *r,
> +                                             uint32_t pport)
> +{
> +    return r->rings[(pport - 1) * 2 + 3];
> +}
> +
> +int rx_produce(World *world, uint32_t pport,
> +               const struct iovec *iov, int iovcnt)
> +{
> +    Rocker *r = world_rocker(world);
> +    PCIDevice *dev = (PCIDevice *)r;
> +    DescRing *ring = rocker_get_rx_ring_by_pport(r, pport);
> +    DescInfo *info = desc_ring_fetch_desc(ring);
> +    char *data;
> +    size_t data_size = iov_size(iov, iovcnt);
> +    char *buf;
> +    uint16_t rx_flags = 0;
> +    uint16_t rx_csum = 0;
> +    size_t tlv_size;
> +    RockerTlv *tlvs[ROCKER_TLV_RX_MAX + 1];
> +    hwaddr frag_addr;
> +    uint16_t frag_max_len;
> +    int pos;
> +    int err;
> +
> +    if (!info) {
> +        return -ENOBUFS;
> +    }
> +
> +    buf = desc_get_buf(info, false);
> +    if (!buf) {
> +        err = -ENXIO;
> +        goto out;
> +    }
> +    rocker_tlv_parse(tlvs, ROCKER_TLV_RX_MAX, buf, desc_tlv_size(info));
> +
> +    if (!tlvs[ROCKER_TLV_RX_FRAG_ADDR] ||
> +        !tlvs[ROCKER_TLV_RX_FRAG_MAX_LEN]) {

This conflicts with the spec, which only describes RX_PACKET.

> +        err = -EINVAL;
> +        goto out;
> +    }
> +
> +    frag_addr = rocker_tlv_get_le64(tlvs[ROCKER_TLV_RX_FRAG_ADDR]);
> +    frag_max_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_RX_FRAG_MAX_LEN]);
> +
> +    if (data_size > frag_max_len) {
> +        err = -EMSGSIZE;
> +        goto out;
> +    }
> +
> +    /* XXX calc rx flags/csum */
> +
> +    tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) +  /* flags */
> +               rocker_tlv_total_size(sizeof(uint16_t)) +  /* scum */
> +               rocker_tlv_total_size(sizeof(uint64_t)) +  /* frag addr */
> +               rocker_tlv_total_size(sizeof(uint16_t)) +  /* frag max len */
> +               rocker_tlv_total_size(sizeof(uint16_t));   /* frag len */
> +
> +    if (tlv_size > desc_buf_size(info)) {
> +        err = -EMSGSIZE;
> +        goto out;
> +    }
> +
> +    /* TODO:
> +     * iov dma write can be optimized in similar way e1000 does it in
> +     * e1000_receive_iov. But maybe if would make sense to introduce
> +     * generic helper iov_dma_write.
> +     */
> +
> +    data = g_malloc(data_size);
> +    if (!data) {
> +        err = -ENOMEM;
> +        goto out;
> +    }
> +    iov_to_buf(iov, iovcnt, 0, data, data_size);
> +    pci_dma_write(dev, frag_addr, data, data_size);
> +    g_free(data);
> +
> +    pos = 0;
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FLAGS, rx_flags);
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_CSUM, rx_csum);

Note: some backends (e.g. tap) can do offloading; consider adding support
for that in the future.

> +    rocker_tlv_put_le64(buf, &pos, ROCKER_TLV_RX_FRAG_ADDR, frag_addr);
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FRAG_MAX_LEN, frag_max_len);

The above 2 types are not documented either. Please make sure all types are
documented.

> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FRAG_LEN, data_size);
> +
> +    err = desc_set_buf(info, tlv_size);
> +
> +out:
> +    if (desc_ring_post_desc(ring, err)) {
> +        rocker_msix_irq(r, ROCKER_MSIX_VEC_RX(pport - 1));
> +    }
> +
> +    return err;
> +}
> +

[...]

> +
> +bool fp_port_from_pport(uint32_t pport, uint32_t *port)
> +{
> +    if (pport < 1 || pport > ROCKER_FP_PORTS_MAX) {
> +        return false;
> +    }
> +    *port = pport - 1;
> +    return true;
> +}
> +
> +int fp_port_eg(FpPort *port, const struct iovec *iov, int iovcnt)
> +{
> +    NetClientState *nc = qemu_get_queue(port->nic);

Using only 1 queue is OK for a multiqueue backend. In the future, consider
using all of them.
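Just to illustrate what that could eventually look like (sketch, untested;
it assumes the port's NIC is realized with a multiqueue peer so that
nic->conf->peers.queues is meaningful, and tx_q is a hypothetical
round-robin cursor that FpPort would need to grow):

int fp_port_eg(FpPort *port, const struct iovec *iov, int iovcnt)
{
    /* number of queues the backend was configured with (>= 1) */
    int nq = MAX(port->nic->conf->peers.queues, 1);
    /* tx_q: hypothetical per-port cursor for round-robin queue selection */
    NetClientState *nc = qemu_get_subqueue(port->nic, port->tx_q++ % nq);

    if (port->enabled) {
        qemu_sendv_packet(nc, iov, iovcnt);
    }

    return 0;
}

No rush though: a single queue works, this would only matter for throughput.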
> +
> +    if (port->enabled) {
> +        qemu_sendv_packet(nc, iov, iovcnt);
> +    }
> +
> +    return 0;
> +}
> +

[...]

Thanks