On Thu, 30 Jul 2020 18:53:58 -0400 David Thompson wrote: > The logic in "mlxbf_gige_mdio.c" is the driver controlling > the Mellanox BlueField hardware that interacts with a PHY > device via MDIO/MDC pins. This driver does the following: > - At driver probe time, it configures several BlueField MDIO > parameters such as sample rate, full drive, voltage and MDC > based on values read from ACPI table. > - It defines functions to read and write MDIO registers and > registers the MDIO bus. > - It defines the phy interrupt handler reporting a > link up/down status change > - This driver's probe is invoked from the main driver logic > while the phy interrupt handler is registered in ndo_open.
These parts will definitely need Andrew or other PHY maintainers to look at. Good luck with the ACPI stuff :) > Reported-by: kernel test robot <l...@intel.com> Remove this, it only applies if the entire patch is prompted by the reporter. I doubt the build bot made you write this driver. > +config MLXBF_GIGE > + tristate "Mellanox Technologies BlueField Gigabit Ethernet support" > + depends on (ARM64 || COMPILE_TEST) && ACPI && INET Why do you depend on INET? :S > + for (i = 0; i < priv->rx_q_entries; i++) { > + /* Allocate a receive buffer for this RX WQE. The DMA > + * form (dma_addr_t) of the receive buffer address is > + * stored in the RX WQE array (via 'rx_wqe_ptr') where > + * it is accessible by the GigE device. The VA form of > + * the receive buffer is stored in 'rx_buf[]' array in > + * the driver private storage for housekeeping. > + */ > + priv->rx_buf[i] = dma_alloc_coherent(priv->dev, > + MLXBF_GIGE_DEFAULT_BUF_SZ, > + &rx_buf_dma, > + GFP_KERNEL); Do the buffers have to be in coherent memory? That's kinda strange. > + if (!priv->rx_buf[i]) > + goto free_wqe_and_buf; > + > + *rx_wqe_ptr++ = rx_buf_dma; > + } > + /* Enable RX DMA to write new packets to memory */ > + writeq(MLXBF_GIGE_RX_DMA_EN, priv->base + MLXBF_GIGE_RX_DMA); Looks a little strange that you enable DMA and then do further config like CRC stripping. > + /* Enable removal of CRC during RX */ > + data = readq(priv->base + MLXBF_GIGE_RX); > + data |= MLXBF_GIGE_RX_STRIP_CRC_EN; > + writeq(data, priv->base + MLXBF_GIGE_RX); > + > + /* Enable RX MAC filter pass and discard counters */ > + writeq(MLXBF_GIGE_RX_MAC_FILTER_COUNT_DISC_EN, > + priv->base + MLXBF_GIGE_RX_MAC_FILTER_COUNT_DISC); > + writeq(MLXBF_GIGE_RX_MAC_FILTER_COUNT_PASS_EN, > + priv->base + MLXBF_GIGE_RX_MAC_FILTER_COUNT_PASS); > + > + /* Clear MLXBF_GIGE_INT_MASK 'receive pkt' bit to > + * indicate readiness to receive pkts > + */ Readiness to receive interrupts - I presume. 
> + data = readq(priv->base + MLXBF_GIGE_INT_MASK); > + data &= ~MLXBF_GIGE_INT_MASK_RX_RECEIVE_PACKET; > + writeq(data, priv->base + MLXBF_GIGE_INT_MASK); > + > + writeq(ilog2(priv->rx_q_entries), > + priv->base + MLXBF_GIGE_RX_WQE_SIZE_LOG2); This is probably what enables the reception of packets? > + return 0; > + > +free_wqe_and_buf: > + rx_wqe_ptr = priv->rx_wqe_base; > + for (j = 0; j < i; j++) { > + dma_free_coherent(priv->dev, MLXBF_GIGE_DEFAULT_BUF_SZ, > + priv->rx_buf[j], *rx_wqe_ptr); > + rx_wqe_ptr++; > + } > + dma_free_coherent(priv->dev, wq_size, > + priv->rx_wqe_base, priv->rx_wqe_base_dma); > + return -ENOMEM; > +} > + > +/* Transmit Initialization > + * 1) Allocates TX WQE array using coherent DMA mapping > + * 2) Allocates TX completion counter using coherent DMA mapping > + */ > +static int mlxbf_gige_tx_init(struct mlxbf_gige *priv) > +{ > + size_t size; > + > + size = MLXBF_GIGE_TX_WQE_SZ * priv->tx_q_entries; > + priv->tx_wqe_base = dma_alloc_coherent(priv->dev, size, > + &priv->tx_wqe_base_dma, > + GFP_KERNEL); > + if (!priv->tx_wqe_base) > + return -ENOMEM; > + > + priv->tx_wqe_next = priv->tx_wqe_base; > + > + /* Write TX WQE base address into MMIO reg */ > + writeq(priv->tx_wqe_base_dma, priv->base + MLXBF_GIGE_TX_WQ_BASE); > + > + /* Allocate address for TX completion count */ > + priv->tx_cc = dma_alloc_coherent(priv->dev, MLXBF_GIGE_TX_CC_SZ, > + &priv->tx_cc_dma, GFP_KERNEL); > + nit: don't put empty lines between a call and error checking > + if (!priv->tx_cc) { > + dma_free_coherent(priv->dev, size, > + priv->tx_wqe_base, priv->tx_wqe_base_dma); > + return -ENOMEM; > + } > + > + /* Write TX CC base address into MMIO reg */ > + writeq(priv->tx_cc_dma, priv->base + MLXBF_GIGE_TX_CI_UPDATE_ADDRESS); > + > + writeq(ilog2(priv->tx_q_entries), > + priv->base + MLXBF_GIGE_TX_WQ_SIZE_LOG2); > + > + priv->prev_tx_ci = 0; > + priv->tx_pi = 0; > + > + return 0; > +} > +/* Start of struct ethtool_ops functions */ > +static int 
mlxbf_gige_get_regs_len(struct net_device *netdev) > +{ > + /* Return size of MMIO register space (in bytes). > + * > + * NOTE: MLXBF_GIGE_MAC_CFG is the last defined register offset, > + * so use that plus size of single register to derive total size > + */ > + return MLXBF_GIGE_MAC_CFG + 8; Please make a define for this, instead of the comment. > +} > + > +static void mlxbf_gige_get_regs(struct net_device *netdev, > + struct ethtool_regs *regs, void *p) > +{ > + struct mlxbf_gige *priv = netdev_priv(netdev); > + __be64 *buff = p; > + int reg; > + > + regs->version = MLXBF_GIGE_REGS_VERSION; > + > + /* Read entire MMIO register space and store results > + * into the provided buffer. Each 64-bit word is converted > + * to big-endian to make the output more readable. > + * > + * NOTE: by design, a read to an offset without an existing > + * register will be acknowledged and return zero. > + */ > + for (reg = 0; reg <= MLXBF_GIGE_MAC_CFG; reg += 8) Also with a define this will look less weird. > + *buff++ = cpu_to_be64(readq(priv->base + reg)); Can you use memcpy_fromio()? > +} > + > +static void mlxbf_gige_get_ringparam(struct net_device *netdev, > + struct ethtool_ringparam *ering) > +{ > + struct mlxbf_gige *priv = netdev_priv(netdev); > + > + memset(ering, 0, sizeof(*ering)); No need, core clears this. 
> + ering->rx_max_pending = MLXBF_GIGE_MAX_RXQ_SZ; > + ering->tx_max_pending = MLXBF_GIGE_MAX_TXQ_SZ; > + ering->rx_pending = priv->rx_q_entries; > + ering->tx_pending = priv->tx_q_entries; > +} > + > +static int mlxbf_gige_set_ringparam(struct net_device *netdev, > + struct ethtool_ringparam *ering) > +{ > + const struct net_device_ops *ops = netdev->netdev_ops; > + struct mlxbf_gige *priv = netdev_priv(netdev); > + int new_rx_q_entries, new_tx_q_entries; > + > + /* Device does not have separate queues for small/large frames */ > + if (ering->rx_mini_pending || ering->rx_jumbo_pending) > + return -EINVAL; > + > + /* Round up to supported values */ > + new_rx_q_entries = roundup_pow_of_two(ering->rx_pending); > + new_tx_q_entries = roundup_pow_of_two(ering->tx_pending); > + > + /* Range check the new values */ > + if (new_tx_q_entries < MLXBF_GIGE_MIN_TXQ_SZ || > + new_tx_q_entries > MLXBF_GIGE_MAX_TXQ_SZ || > + new_rx_q_entries < MLXBF_GIGE_MIN_RXQ_SZ || > + new_rx_q_entries > MLXBF_GIGE_MAX_RXQ_SZ) No need to check against max values, core does that. > + return -EINVAL; > + > + /* If queue sizes did not change, exit now */ > + if (new_rx_q_entries == priv->rx_q_entries && > + new_tx_q_entries == priv->tx_q_entries) > + return 0; > + > + if (netif_running(netdev)) > + ops->ndo_stop(netdev); > + > + priv->rx_q_entries = new_rx_q_entries; > + priv->tx_q_entries = new_tx_q_entries; > + > + if (netif_running(netdev)) > + ops->ndo_open(netdev); Please write the driver in a way where fully stopping and restarting the interface — with its attendant risk of memory allocation failures — is not necessary to reconfigure the rings. > + return 0; > +} > + > +static void mlxbf_gige_get_drvinfo(struct net_device *netdev, > + struct ethtool_drvinfo *info) > +{ > + strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); > + strlcpy(info->bus_info, dev_name(&netdev->dev), sizeof(info->bus_info)); > +} If this is all you report - you don't need to implement drvinfo. Core fills those in. 
> +static void mlxbf_gige_get_ethtool_stats(struct net_device *netdev, > + struct ethtool_stats *estats, > + u64 *data) > +{ > + struct mlxbf_gige *priv = netdev_priv(netdev); > + unsigned long flags; > + > + spin_lock_irqsave(&priv->lock, flags); Why do you take a lock around stats? > + /* Fill data array with interface statistics > + * > + * NOTE: the data writes must be in > + * sync with the strings shown in > + * the mlxbf_gige_ethtool_stats_keys[] array > + * > + * NOTE2: certain statistics below are zeroed upon > + * port disable, so the calculation below > + * must include the "cached" value of the stat > + * plus the value read directly from hardware. > + * Cached statistics are currently: > + * rx_din_dropped_pkts > + * rx_filter_passed_pkts > + * rx_filter_discard_pkts > + */ > + *data++ = netdev->stats.rx_bytes; > + *data++ = netdev->stats.rx_packets; > + *data++ = netdev->stats.tx_bytes; > + *data++ = netdev->stats.tx_packets; Please don't duplicate standard stats in ethtool. 
> + *data++ = priv->stats.hw_access_errors; > + *data++ = priv->stats.tx_invalid_checksums; > + *data++ = priv->stats.tx_small_frames; > + *data++ = priv->stats.tx_index_errors; > + *data++ = priv->stats.sw_config_errors; > + *data++ = priv->stats.sw_access_errors; > + *data++ = priv->stats.rx_truncate_errors; > + *data++ = priv->stats.rx_mac_errors; > + *data++ = (priv->stats.rx_din_dropped_pkts + > + readq(priv->base + MLXBF_GIGE_RX_DIN_DROP_COUNTER)); > + *data++ = priv->stats.tx_fifo_full; > + *data++ = (priv->stats.rx_filter_passed_pkts + > + readq(priv->base + MLXBF_GIGE_RX_PASS_COUNTER_ALL)); > + *data++ = (priv->stats.rx_filter_discard_pkts + > + readq(priv->base + MLXBF_GIGE_RX_DISC_COUNTER_ALL)); > + > + spin_unlock_irqrestore(&priv->lock, flags); > +} > +static int mlxbf_gige_get_link_ksettings(struct net_device *netdev, > + struct ethtool_link_ksettings > *link_ksettings) > +{ > + struct phy_device *phydev = netdev->phydev; > + u32 supported, advertising; > + u32 lp_advertising = 0; > + int status; > + > + supported = SUPPORTED_TP | SUPPORTED_1000baseT_Full | > + SUPPORTED_Autoneg | SUPPORTED_Pause; > + > + advertising = ADVERTISED_1000baseT_Full | ADVERTISED_Autoneg | > + ADVERTISED_Pause; > + > + status = phy_read(phydev, MII_LPA); > + if (status >= 0) > + lp_advertising = mii_lpa_to_ethtool_lpa_t(status & 0xffff); > + > + status = phy_read(phydev, MII_STAT1000); > + if (status >= 0) > + lp_advertising |= mii_stat1000_to_ethtool_lpa_t(status & > 0xffff); > + > + > ethtool_convert_legacy_u32_to_link_mode(link_ksettings->link_modes.supported, > + supported); > + > ethtool_convert_legacy_u32_to_link_mode(link_ksettings->link_modes.advertising, > + advertising); > + > ethtool_convert_legacy_u32_to_link_mode(link_ksettings->link_modes.lp_advertising, > + lp_advertising); > + > + link_ksettings->base.autoneg = AUTONEG_ENABLE; > + link_ksettings->base.speed = SPEED_1000; > + link_ksettings->base.duplex = DUPLEX_FULL; Hm. 
You're reporting this even if the link is down? > + link_ksettings->base.port = PORT_TP; > + link_ksettings->base.phy_address = MLXBF_GIGE_MDIO_DEFAULT_PHY_ADDR; > + link_ksettings->base.transceiver = XCVR_INTERNAL; > + link_ksettings->base.mdio_support = ETH_MDIO_SUPPORTS_C22; > + link_ksettings->base.eth_tp_mdix = ETH_TP_MDI_INVALID; > + link_ksettings->base.eth_tp_mdix_ctrl = ETH_TP_MDI_INVALID; > +static irqreturn_t mlxbf_gige_rx_intr(int irq, void *dev_id) > +{ > + struct mlxbf_gige *priv; > + > + priv = dev_id; > + > + priv->rx_intr_count++; > + > + /* Driver has been interrupted because a new packet is available, > + * but do not process packets at this time. Instead, disable any > + * further "packet rx" interrupts and tell the networking subsystem > + * to poll the driver to pick up all available packets. I must say these comments really are excessive. > + * NOTE: GigE silicon automatically disables "packet rx" interrupt by > + * setting MLXBF_GIGE_INT_MASK bit0 upon triggering the interrupt > + * to the ARM cores. Software needs to re-enable "packet rx" > + * interrupts by clearing MLXBF_GIGE_INT_MASK bit0. > + */ > + > + /* Tell networking subsystem to poll GigE driver */ > + napi_schedule(&priv->napi); _irqoff > + > + return IRQ_HANDLED; > +} > +static u16 mlxbf_gige_tx_buffs_avail(struct mlxbf_gige *priv) > +{ > + unsigned long flags; > + u16 avail; > + > + spin_lock_irqsave(&priv->lock, flags); > + > + if (priv->prev_tx_ci == priv->tx_pi) > + avail = priv->tx_q_entries - 1; > + else > + avail = ((priv->tx_q_entries + priv->prev_tx_ci - priv->tx_pi) > + % priv->tx_q_entries) - 1; Because this is unsigned logic, and q_entries is a power of 2 you probably don't need the if, and the modulo. 
> + spin_unlock_irqrestore(&priv->lock, flags); > + > + return avail; > +} > + > +static bool mlxbf_gige_handle_tx_complete(struct mlxbf_gige *priv) > +{ > + struct net_device_stats *stats; > + u16 tx_wqe_index; > + u64 *tx_wqe_addr; > + u64 tx_status; > + u16 tx_ci; > + > + tx_status = readq(priv->base + MLXBF_GIGE_TX_STATUS); > + if (tx_status & MLXBF_GIGE_TX_STATUS_DATA_FIFO_FULL) > + priv->stats.tx_fifo_full++; > + tx_ci = readq(priv->base + MLXBF_GIGE_TX_CONSUMER_INDEX); > + stats = &priv->netdev->stats; > + > + /* Transmit completion logic needs to loop until the completion > + * index (in SW) equals TX consumer index (from HW). These > + * parameters are unsigned 16-bit values and the wrap case needs > + * to be supported, that is TX consumer index wrapped from 0xFFFF > + * to 0 while TX completion index is still < 0xFFFF. > + */ > + for (; priv->prev_tx_ci != tx_ci; priv->prev_tx_ci++) { > + tx_wqe_index = priv->prev_tx_ci % priv->tx_q_entries; > + /* Each TX WQE is 16 bytes. The 8 MSB store the 2KB TX > + * buffer address and the 8 LSB contain information > + * about the TX WQE. > + */ > + tx_wqe_addr = priv->tx_wqe_base + > + (tx_wqe_index * MLXBF_GIGE_TX_WQE_SZ_QWORDS); > + > + stats->tx_packets++; > + stats->tx_bytes += MLXBF_GIGE_TX_WQE_PKT_LEN(tx_wqe_addr); > + dma_free_coherent(priv->dev, MLXBF_GIGE_DEFAULT_BUF_SZ, > + priv->tx_buf[tx_wqe_index], *tx_wqe_addr); > + priv->tx_buf[tx_wqe_index] = NULL; > + } > + > + /* Since the TX ring was likely just drained, check if TX queue > + * had previously been stopped and now that there are TX buffers > + * available the TX queue can be awakened. > + */ > + if (netif_queue_stopped(priv->netdev) && > + mlxbf_gige_tx_buffs_avail(priv)) { > + netif_wake_queue(priv->netdev); > + } No need for parens. 
> + > + return true; > +} > + > +static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts) > +{ > + struct net_device *netdev = priv->netdev; > + u16 rx_pi_rem, rx_ci_rem; > + struct sk_buff *skb; > + u64 *rx_cqe_addr; > + u64 datalen; > + u64 rx_cqe; > + u16 rx_ci; > + u16 rx_pi; > + u8 *pktp; > + > + /* Index into RX buffer array is rx_pi w/wrap based on RX_CQE_SIZE */ > + rx_pi = readq(priv->base + MLXBF_GIGE_RX_WQE_PI); > + rx_pi_rem = rx_pi % priv->rx_q_entries; > + pktp = priv->rx_buf[rx_pi_rem]; > + rx_cqe_addr = priv->rx_cqe_base + rx_pi_rem; > + rx_cqe = *rx_cqe_addr; > + datalen = rx_cqe & MLXBF_GIGE_RX_CQE_PKT_LEN_MASK; > + > + if ((rx_cqe & MLXBF_GIGE_RX_CQE_PKT_STATUS_MASK) == 0) { > + /* Packet is OK, increment stats */ > + netdev->stats.rx_packets++; > + netdev->stats.rx_bytes += datalen; > + > + skb = dev_alloc_skb(datalen); > + if (!skb) { > + netdev->stats.rx_dropped++; > + return false; > + } > + > + memcpy(skb_put(skb, datalen), pktp, datalen); > + > + skb->dev = netdev; > + skb->protocol = eth_type_trans(skb, netdev); > + skb->ip_summed = CHECKSUM_NONE; /* device did not checksum > packet */ > + > + netif_receive_skb(skb); > + } else if (rx_cqe & MLXBF_GIGE_RX_CQE_PKT_STATUS_MAC_ERR) { > + priv->stats.rx_mac_errors++; > + } else if (rx_cqe & MLXBF_GIGE_RX_CQE_PKT_STATUS_TRUNCATED) { > + priv->stats.rx_truncate_errors++; > + } > + > + /* Let hardware know we've replenished one buffer */ > + writeq(rx_pi + 1, priv->base + MLXBF_GIGE_RX_WQE_PI); > + > + (*rx_pkts)++; > + rx_pi = readq(priv->base + MLXBF_GIGE_RX_WQE_PI); > + rx_pi_rem = rx_pi % priv->rx_q_entries; > + rx_ci = readq(priv->base + MLXBF_GIGE_RX_CQE_PACKET_CI); > + rx_ci_rem = rx_ci % priv->rx_q_entries; Looks really wasteful to keep reading IO registers on every packet, while you may already know there is more from the previous iteration.. 
> + return rx_pi_rem != rx_ci_rem; > +} > + > +/* Driver poll() function called by NAPI infrastructure */ > +static int mlxbf_gige_poll(struct napi_struct *napi, int budget) > +{ > + struct mlxbf_gige *priv; > + bool remaining_pkts; > + int work_done = 0; > + u64 data; > + > + priv = container_of(napi, struct mlxbf_gige, napi); > + > + mlxbf_gige_handle_tx_complete(priv); > + > + do { > + remaining_pkts = mlxbf_gige_rx_packet(priv, &work_done); > + } while (remaining_pkts && work_done < budget); > + > + /* If amount of work done < budget, turn off NAPI polling > + * via napi_complete_done(napi, work_done) and then > + * re-enable interrupts. > + */ > + if (work_done < budget && napi_complete_done(napi, work_done)) { > + /* Clear MLXBF_GIGE_INT_MASK 'receive pkt' bit to > + * indicate receive readiness > + */ > + data = readq(priv->base + MLXBF_GIGE_INT_MASK); > + data &= ~MLXBF_GIGE_INT_MASK_RX_RECEIVE_PACKET; > + writeq(data, priv->base + MLXBF_GIGE_INT_MASK); > + } > + > + return work_done; > +} > + > +static int mlxbf_gige_request_irqs(struct mlxbf_gige *priv) > +{ > + int err; > + > + err = devm_request_irq(priv->dev, priv->error_irq, > + mlxbf_gige_error_intr, 0, "mlxbf_gige_error", > + priv); > + if (err) { > + dev_err(priv->dev, "Request error_irq failure\n"); > + return err; > + } > + > + err = devm_request_irq(priv->dev, priv->rx_irq, > + mlxbf_gige_rx_intr, 0, "mlxbf_gige_rx", > + priv); > + if (err) { > + dev_err(priv->dev, "Request rx_irq failure\n"); > + return err; > + } > + > + err = devm_request_irq(priv->dev, priv->llu_plu_irq, > + mlxbf_gige_llu_plu_intr, 0, "mlxbf_gige_llu_plu", > + priv); > + if (err) { > + dev_err(priv->dev, "Request llu_plu_irq failure\n"); > + return err; > + } > + > + return 0; > +} > + > +static void mlxbf_gige_free_irqs(struct mlxbf_gige *priv) > +{ > + devm_free_irq(priv->dev, priv->error_irq, priv); > + devm_free_irq(priv->dev, priv->rx_irq, priv); > + devm_free_irq(priv->dev, priv->llu_plu_irq, priv); What's the point 
of devm if you free them manually? Just use normal routines, please. > +static void mlxbf_gige_clean_port(struct mlxbf_gige *priv) > +{ > + u64 control, status; > + int cnt; > + > + /* Set the CLEAN_PORT_EN bit to trigger SW reset */ > + control = readq(priv->base + MLXBF_GIGE_CONTROL); > + control |= MLXBF_GIGE_CONTROL_CLEAN_PORT_EN; > + writeq(control, priv->base + MLXBF_GIGE_CONTROL); > + > + /* Loop waiting for status ready bit to assert */ > + cnt = 1000; > + do { > + status = readq(priv->base + MLXBF_GIGE_STATUS); > + if (status & MLXBF_GIGE_STATUS_READY) > + break; > + usleep_range(50, 100); > + } while (--cnt > 0); Check out linux/iopoll.h. > + /* Clear the CLEAN_PORT_EN bit at end of this loop */ > + control = readq(priv->base + MLXBF_GIGE_CONTROL); > + control &= ~MLXBF_GIGE_CONTROL_CLEAN_PORT_EN; > + writeq(control, priv->base + MLXBF_GIGE_CONTROL); > +} > + > +static int mlxbf_gige_open(struct net_device *netdev) > +{ > + struct mlxbf_gige *priv = netdev_priv(netdev); > + struct phy_device *phydev = netdev->phydev; > + u64 int_en; > + int err; > + > + mlxbf_gige_cache_stats(priv); > + mlxbf_gige_clean_port(priv); > + mlxbf_gige_rx_init(priv); > + mlxbf_gige_tx_init(priv); These can fail. > + netif_napi_add(netdev, &priv->napi, mlxbf_gige_poll, NAPI_POLL_WEIGHT); > + napi_enable(&priv->napi); > + netif_start_queue(netdev); > + > + err = mlxbf_gige_request_irqs(priv); Request IRQs before you start enabling NAPI and doing stuff that can't fail. 
> + if (err) > + return err; > + > + phy_start(phydev); > + > + /* Set bits in INT_EN that we care about */ > + int_en = MLXBF_GIGE_INT_EN_HW_ACCESS_ERROR | > + MLXBF_GIGE_INT_EN_TX_CHECKSUM_INPUTS | > + MLXBF_GIGE_INT_EN_TX_SMALL_FRAME_SIZE | > + MLXBF_GIGE_INT_EN_TX_PI_CI_EXCEED_WQ_SIZE | > + MLXBF_GIGE_INT_EN_SW_CONFIG_ERROR | > + MLXBF_GIGE_INT_EN_SW_ACCESS_ERROR | > + MLXBF_GIGE_INT_EN_RX_RECEIVE_PACKET; > + writeq(int_en, priv->base + MLXBF_GIGE_INT_EN); > + > + return 0; > +} > +static netdev_tx_t mlxbf_gige_start_xmit(struct sk_buff *skb, > + struct net_device *netdev) > +{ > + struct mlxbf_gige *priv = netdev_priv(netdev); > + dma_addr_t tx_buf_dma; > + u8 *tx_buf = NULL; > + u64 *tx_wqe_addr; > + u64 word2; > + > + /* Check that there is room left in TX ring */ > + if (!mlxbf_gige_tx_buffs_avail(priv)) { > + /* TX ring is full, inform stack but do not free SKB */ > + netif_stop_queue(netdev); > + netdev->stats.tx_dropped++; > + return NETDEV_TX_BUSY; Stop the queue when the last index gets used, not when a frame that overflows the ring arrives. re-queuing skbs is expensive. > + } > + > + /* Allocate ptr for buffer */ > + if (skb->len < MLXBF_GIGE_DEFAULT_BUF_SZ) > + tx_buf = dma_alloc_coherent(priv->dev, > MLXBF_GIGE_DEFAULT_BUF_SZ, > + &tx_buf_dma, GFP_KERNEL); > + > + if (!tx_buf) { > + /* Free incoming skb, could not alloc TX buffer */ > + dev_kfree_skb(skb); > + netdev->stats.tx_dropped++; > + return NET_XMIT_DROP; > + } > + > + priv->tx_buf[priv->tx_pi % priv->tx_q_entries] = tx_buf; > + > + /* Copy data from skb to allocated TX buffer > + * > + * NOTE: GigE silicon will automatically pad up to > + * minimum packet length if needed. > + */ > + skb_copy_bits(skb, 0, tx_buf, skb->len); Why do you copy data for both TX and RX?! Can't you map the skb data directly? 
> + /* Get address of TX WQE */ > + tx_wqe_addr = priv->tx_wqe_next; > + > + mlxbf_gige_update_tx_wqe_next(priv); > + > + /* Put PA of buffer address into first 64-bit word of TX WQE */ > + *tx_wqe_addr = tx_buf_dma; > + > + /* Set TX WQE pkt_len appropriately */ > + word2 = skb->len & MLXBF_GIGE_TX_WQE_PKT_LEN_MASK; > + > + /* Write entire 2nd word of TX WQE */ > + *(tx_wqe_addr + 1) = word2; > + > + priv->tx_pi++; > + > + /* Create memory barrier before write to TX PI */ > + wmb(); please implement xmit_more > + writeq(priv->tx_pi, priv->base + MLXBF_GIGE_TX_PRODUCER_INDEX); > + > + /* Free incoming skb, contents already copied to HW */ > + dev_kfree_skb(skb); No, you can't do that. The socket that produced this data will think it's already on the wire and bombard you with more data. > + return NETDEV_TX_OK; > +} > +static const struct net_device_ops mlxbf_gige_netdev_ops = { > + .ndo_open = mlxbf_gige_open, > + .ndo_stop = mlxbf_gige_stop, > + .ndo_start_xmit = mlxbf_gige_start_xmit, > + .ndo_set_mac_address = eth_mac_addr, > + .ndo_validate_addr = eth_validate_addr, > + .ndo_do_ioctl = mlxbf_gige_do_ioctl, > + .ndo_set_rx_mode = mlxbf_gige_set_rx_mode, You must report standard stats. > +}; > + > +static u64 mlxbf_gige_mac_to_u64(u8 *addr) > +{ > + u64 mac = 0; > + int i; > + > + for (i = 0; i < ETH_ALEN; i++) { > + mac <<= 8; > + mac |= addr[i]; > + } > + return mac; > +} ether_addr_to_u64() > +static void mlxbf_gige_u64_to_mac(u8 *addr, u64 mac) > +{ > + int i; > + > + for (i = ETH_ALEN; i > 0; i--) { > + addr[i - 1] = mac & 0xFF; > + mac >>= 8; > + } > +} u64_to_ether_addr()