> -----Original Message----- > From: Yigit, Ferruh > Sent: Tuesday, June 26, 2018 5:15 PM > To: Zhang, Qi Z <qi.z.zh...@intel.com>; Stephen Hemminger > <step...@networkplumber.org> > Cc: Xing, Beilei <beilei.x...@intel.com>; Wu, Jingjing > <jingjing...@intel.com>; > Yu, De <de...@intel.com>; dev@dpdk.org > Subject: Re: [dpdk-dev] [PATCH v2] net/i40e: remove VF interrupt handler > > On 6/24/2018 11:56 AM, Zhang, Qi Z wrote: > > Hi Stephen: > > > >> -----Original Message----- > >> From: Stephen Hemminger [mailto:step...@networkplumber.org] > >> Sent: Friday, June 22, 2018 11:44 PM > >> To: Zhang, Qi Z <qi.z.zh...@intel.com> > >> Cc: Xing, Beilei <beilei.x...@intel.com>; Wu, Jingjing > >> <jingjing...@intel.com>; Yu, De <de...@intel.com>; dev@dpdk.org > >> Subject: Re: [dpdk-dev] [PATCH v2] net/i40e: remove VF interrupt > >> handler > >> > >> On Fri, 22 Jun 2018 08:44:14 +0800 > >> Qi Zhang <qi.z.zh...@intel.com> wrote: > >> > >>> For i40evf, internal rx interrupt and adminq interrupt share the > >>> same source, that cause a lot cpu cycles be wasted on interrupt > >>> handler on rx path. This is complained by customers which require > >>> low latency (when set I40E_ITR_INTERVAL to small value), but have to > >>> be sufferred by tremendous interrupts handling that eat significant CPU > resources. > >>> > >>> The patch disable pci interrupt and remove the interrupt handler, > >>> replace it with a low frequency (50ms) interrupt polling daemon > >>> which is implemented by registering a alarm callback periodly, this > >>> save CPU time significently: On a typical x86 server with 2.1GHz > >>> CPU, with low latency configure (32us) we saw CPU usage from top > >>> commmand reduced from 20% to 0% on management core in testpmd). > >>> > >>> Also with the new method we can remove compile option: > >>> I40E_ITR_INTERVAL which is used to balance between low latency and > >>> low > >> CPU usage previously. > >>> Now we don't need it since we can reach both at same time. 
> >>> > >>> Suggested-by: Jingjing Wu <jingjing...@intel.com> > >>> Signed-off-by: Qi Zhang <qi.z.zh...@intel.com> > >>> --- > >>> > >>> v2: > >>> - update doc > >>> > >>> config/common_base | 2 -- > >>> doc/guides/nics/i40e.rst | 5 ----- > >>> drivers/net/i40e/i40e_ethdev.c | 3 +-- > >>> drivers/net/i40e/i40e_ethdev.h | 22 +++++++++++----------- > >>> drivers/net/i40e/i40e_ethdev_vf.c | 36 > >>> ++++++++++++++---------------------- > >>> 5 files changed, 26 insertions(+), 42 deletions(-) > >>> > >>> diff --git a/config/common_base b/config/common_base index > >>> 6b0d1cbbb..9e21c6865 100644 > >>> --- a/config/common_base > >>> +++ b/config/common_base > >>> @@ -264,8 +264,6 @@ CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y > >>> CONFIG_RTE_LIBRTE_I40E_16BYTE_RX_DESC=n > >>> CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_PF=64 > >>> CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM=4 > >>> -# interval up to 8160 us, aligned to 2 (or default value) > >>> -CONFIG_RTE_LIBRTE_I40E_ITR_INTERVAL=-1 > >>> > >>> # > >>> # Compile burst-oriented FM10K PMD > >>> diff --git a/doc/guides/nics/i40e.rst b/doc/guides/nics/i40e.rst > >>> index > >>> 18549bf5a..3fc4ceac7 100644 > >>> --- a/doc/guides/nics/i40e.rst > >>> +++ b/doc/guides/nics/i40e.rst > >>> @@ -96,11 +96,6 @@ Please note that enabling debugging options may > >> affect system performance. > >>> > >>> Number of queues reserved for each VMDQ Pool. > >>> > >>> -- ``CONFIG_RTE_LIBRTE_I40E_ITR_INTERVAL`` (default ``-1``) > >>> - > >>> - Interrupt Throttling interval. 
> >>> - > >>> - > >>> Runtime Config Options > >>> ~~~~~~~~~~~~~~~~~~~~~~ > >>> > >>> diff --git a/drivers/net/i40e/i40e_ethdev.c > >>> b/drivers/net/i40e/i40e_ethdev.c index 13c5d3296..c8f9566e0 100644 > >>> --- a/drivers/net/i40e/i40e_ethdev.c > >>> +++ b/drivers/net/i40e/i40e_ethdev.c > >>> @@ -1829,8 +1829,7 @@ __vsi_queues_bind_intr(struct i40e_vsi *vsi, > >> uint16_t msix_vect, > >>> /* Write first RX queue to Link list register as the head element */ > >>> if (vsi->type != I40E_VSI_SRIOV) { > >>> uint16_t interval = > >>> - i40e_calc_itr_interval(RTE_LIBRTE_I40E_ITR_INTERVAL, 1, > >>> - pf->support_multi_driver); > >>> + i40e_calc_itr_interval(1, pf->support_multi_driver); > >>> > >>> if (msix_vect == I40E_MISC_VEC_ID) { > >>> I40E_WRITE_REG(hw, I40E_PFINT_LNKLST0, diff --git > >>> a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h > >>> index 11c4c76bd..599993dac 100644 > >>> --- a/drivers/net/i40e/i40e_ethdev.h > >>> +++ b/drivers/net/i40e/i40e_ethdev.h > >>> @@ -178,7 +178,7 @@ enum i40e_flxpld_layer_idx { > >>> #define I40E_ITR_INDEX_NONE 3 > >>> #define I40E_QUEUE_ITR_INTERVAL_DEFAULT 32 /* 32 us */ > >>> #define I40E_QUEUE_ITR_INTERVAL_MAX 8160 /* 8160 us */ > >>> -#define I40E_VF_QUEUE_ITR_INTERVAL_DEFAULT 8160 /* 8160 us */ > >>> +#define I40E_VF_QUEUE_ITR_INTERVAL_DEFAULT 32 /* 32 us */ > >>> /* Special FW support this floating VEB feature */ #define > >>> FLOATING_VEB_SUPPORTED_FW_MAJ 5 #define > >> FLOATING_VEB_SUPPORTED_FW_MIN > >>> 0 @@ -1328,17 +1328,17 @@ i40e_align_floor(int n) } > >>> > >>> static inline uint16_t > >>> -i40e_calc_itr_interval(int16_t interval, bool is_pf, bool > >>> is_multi_drv) > >>> +i40e_calc_itr_interval(bool is_pf, bool is_multi_drv) > >>> { > >>> - if (interval < 0 || interval > I40E_QUEUE_ITR_INTERVAL_MAX) { > >>> - if (is_multi_drv) { > >>> - interval = I40E_QUEUE_ITR_INTERVAL_MAX; > >>> - } else { > >>> - if (is_pf) > >>> - interval = I40E_QUEUE_ITR_INTERVAL_DEFAULT; > >>> - else > >>> - interval = 
I40E_VF_QUEUE_ITR_INTERVAL_DEFAULT; > >>> - } > >>> + uint16_t interval = 0; > >>> + > >>> + if (is_multi_drv) { > >>> + interval = I40E_QUEUE_ITR_INTERVAL_MAX; > >>> + } else { > >>> + if (is_pf) > >>> + interval = I40E_QUEUE_ITR_INTERVAL_DEFAULT; > >>> + else > >>> + interval = I40E_VF_QUEUE_ITR_INTERVAL_DEFAULT; > >>> } > >>> > >>> /* Convert to hardware count, as writing each 1 represents 2 us */ > >>> diff --git a/drivers/net/i40e/i40e_ethdev_vf.c > >>> b/drivers/net/i40e/i40e_ethdev_vf.c > >>> index 804e44530..ad5c069e8 100644 > >>> --- a/drivers/net/i40e/i40e_ethdev_vf.c > >>> +++ b/drivers/net/i40e/i40e_ethdev_vf.c > >>> @@ -44,6 +44,8 @@ > >>> #define I40EVF_BUSY_WAIT_COUNT 50 > >>> #define MAX_RESET_WAIT_CNT 20 > >>> > >>> +#define I40EVF_ALARM_INTERVAL 50000 /* us */ > >>> + > >>> struct i40evf_arq_msg_info { > >>> enum virtchnl_ops ops; > >>> enum i40e_status_code result; > >>> @@ -1133,7 +1135,7 @@ i40evf_init_vf(struct rte_eth_dev *dev) > >>> struct i40e_hw *hw = > >> I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); > >>> struct i40e_vf *vf = > >> I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); > >>> uint16_t interval = > >>> - i40e_calc_itr_interval(RTE_LIBRTE_I40E_ITR_INTERVAL, 0, 0); > >>> + i40e_calc_itr_interval(0, 0); > >>> > >>> vf->adapter = > I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private); > >>> vf->dev_data = dev->data; > >>> @@ -1370,7 +1372,7 @@ i40evf_handle_aq_msg(struct rte_eth_dev *dev) > >>> * void > >>> */ > >>> static void > >>> -i40evf_dev_interrupt_handler(void *param) > >>> +i40evf_dev_alarm_handler(void *param) > >>> { > >>> struct rte_eth_dev *dev = (struct rte_eth_dev *)param; > >>> struct i40e_hw *hw = > >> I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); > >>> @@ -1399,6 +1401,8 @@ i40evf_dev_interrupt_handler(void *param) > >>> > >>> done: > >>> i40evf_enable_irq0(hw); > >>> + rte_eal_alarm_set(I40EVF_ALARM_INTERVAL, > >>> + i40evf_dev_alarm_handler, dev); > >>> } > >>> > >>> static int > >>> @@ -1442,12 +1446,8 @@ 
i40evf_dev_init(struct rte_eth_dev *eth_dev) > >>> return -1; > >>> } > >>> > >>> - /* register callback func to eal lib */ > >>> - rte_intr_callback_register(&pci_dev->intr_handle, > >>> - i40evf_dev_interrupt_handler, (void *)eth_dev); > >>> - > >>> - /* enable uio intr after callback register */ > >>> - rte_intr_enable(&pci_dev->intr_handle); > >>> + rte_eal_alarm_set(I40EVF_ALARM_INTERVAL, > >>> + i40evf_dev_alarm_handler, eth_dev); > >>> > >>> /* configure and enable device interrupt */ > >>> i40evf_enable_irq0(hw); > >>> @@ -1836,7 +1836,7 @@ i40evf_dev_rx_queue_intr_enable(struct > >> rte_eth_dev *dev, uint16_t queue_id) > >>> struct rte_intr_handle *intr_handle = &pci_dev->intr_handle; > >>> struct i40e_hw *hw = > >> I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); > >>> uint16_t interval = > >>> - i40e_calc_itr_interval(RTE_LIBRTE_I40E_ITR_INTERVAL, 0, 0); > >>> + i40e_calc_itr_interval(0, 0); > >>> uint16_t msix_intr; > >>> > >>> msix_intr = intr_handle->intr_vec[queue_id]; @@ -1859,8 +1859,6 > @@ > >>> i40evf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t > >>> queue_id) > >>> > >>> I40EVF_WRITE_FLUSH(hw); > >>> > >>> - rte_intr_enable(&pci_dev->intr_handle); > >>> - > >>> return 0; > >>> } > >>> > >>> @@ -2023,10 +2021,8 @@ i40evf_dev_start(struct rte_eth_dev *dev) > >>> * queue interrupt to other VFIO vectors. > >>> * So clear uio/vfio intr/evevnfd first to avoid failure. 
> >>> */ > >>> - if (dev->data->dev_conf.intr_conf.rxq != 0) { > >>> - rte_intr_disable(intr_handle); > >>> + if (dev->data->dev_conf.intr_conf.rxq != 0) > >>> rte_intr_enable(intr_handle); > >>> - } > >>> > >>> i40evf_enable_queues_intr(dev); > >>> > >>> @@ -2050,6 +2046,9 @@ i40evf_dev_stop(struct rte_eth_dev *dev) > >>> > >>> PMD_INIT_FUNC_TRACE(); > >>> > >>> + if (dev->data->dev_conf.intr_conf.rxq != 0) > >>> + rte_intr_disable(intr_handle); > >>> + > >>> if (hw->adapter_stopped == 1) > >>> return; > >>> i40evf_stop_queues(dev); > >>> @@ -2285,9 +2284,8 @@ static void > >>> i40evf_dev_close(struct rte_eth_dev *dev) { > >>> struct i40e_hw *hw = > >> I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); > >>> - struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev); > >>> - struct rte_intr_handle *intr_handle = &pci_dev->intr_handle; > >>> > >>> + rte_eal_alarm_cancel(i40evf_dev_alarm_handler, dev); > >>> i40evf_dev_stop(dev); > >>> i40e_dev_free_queues(dev); > >>> /* > >>> @@ -2300,12 +2298,6 @@ i40evf_dev_close(struct rte_eth_dev *dev) > >>> > >>> i40evf_reset_vf(hw); > >>> i40e_shutdown_adminq(hw); > >>> - /* disable uio intr before callback unregister */ > >>> - rte_intr_disable(intr_handle); > >>> - > >>> - /* unregister callback func from eal lib */ > >>> - rte_intr_callback_unregister(intr_handle, > >>> - i40evf_dev_interrupt_handler, dev); > >>> i40evf_disable_irq0(hw); > >>> } > >>> > >> > >> Rather than adding a polling routine internally, why not change the > >> driver to not support Link State or receive interrupts. Better yet, > >> let the application decide. > >> Keep the interrupt logic but only enable interrupts if application > >> has requested LSC or recveive interrupt mode. > > > > The interrupt handler is not only for LSC (actually VF does not > > support LSC) or rx interrupt mode, it is used for PF to VF message through > admin queue which is always required. > > I guess the question is, is it possible to disable Rx interrupts? 
And if > possible > can user control this enable/disable per interrupt source?
The problem is that the Rx interrupt is shared with the admin queue interrupt; they are enabled/disabled together. So if we want to get admin queue messages via interrupt, we have to suffer the massive interrupt handling. But the admin queue used for the PF/VF channel is not supposed to be closed (though it can be delayed a little bit). Regards Qi