On 6/26/2018 11:04 AM, Zhang, Qi Z wrote: > > >> -----Original Message----- >> From: Yigit, Ferruh >> Sent: Tuesday, June 26, 2018 5:15 PM >> To: Zhang, Qi Z <qi.z.zh...@intel.com>; Stephen Hemminger >> <step...@networkplumber.org> >> Cc: Xing, Beilei <beilei.x...@intel.com>; Wu, Jingjing >> <jingjing...@intel.com>; >> Yu, De <de...@intel.com>; dev@dpdk.org >> Subject: Re: [dpdk-dev] [PATCH v2] net/i40e: remove VF interrupt handler >> >> On 6/24/2018 11:56 AM, Zhang, Qi Z wrote: >>> Hi Stephen: >>> >>>> -----Original Message----- >>>> From: Stephen Hemminger [mailto:step...@networkplumber.org] >>>> Sent: Friday, June 22, 2018 11:44 PM >>>> To: Zhang, Qi Z <qi.z.zh...@intel.com> >>>> Cc: Xing, Beilei <beilei.x...@intel.com>; Wu, Jingjing >>>> <jingjing...@intel.com>; Yu, De <de...@intel.com>; dev@dpdk.org >>>> Subject: Re: [dpdk-dev] [PATCH v2] net/i40e: remove VF interrupt >>>> handler >>>> >>>> On Fri, 22 Jun 2018 08:44:14 +0800 >>>> Qi Zhang <qi.z.zh...@intel.com> wrote: >>>> >>>>> For i40evf, internal rx interrupt and adminq interrupt share the >>>>> same source, that cause a lot cpu cycles be wasted on interrupt >>>>> handler on rx path. This is complained by customers which require >>>>> low latency (when set I40E_ITR_INTERVAL to small value), but have to >>>>> be sufferred by tremendous interrupts handling that eat significant CPU >> resources. >>>>> >>>>> The patch disable pci interrupt and remove the interrupt handler, >>>>> replace it with a low frequency (50ms) interrupt polling daemon >>>>> which is implemented by registering a alarm callback periodly, this >>>>> save CPU time significently: On a typical x86 server with 2.1GHz >>>>> CPU, with low latency configure (32us) we saw CPU usage from top >>>>> commmand reduced from 20% to 0% on management core in testpmd). >>>>> >>>>> Also with the new method we can remove compile option: >>>>> I40E_ITR_INTERVAL which is used to balance between low latency and >>>>> low >>>> CPU usage previously. 
>>>>> Now we don't need it since we can reach both at same time. >>>>> >>>>> Suggested-by: Jingjing Wu <jingjing...@intel.com> >>>>> Signed-off-by: Qi Zhang <qi.z.zh...@intel.com> >>>>> --- >>>>> >>>>> v2: >>>>> - update doc >>>>> >>>>> config/common_base | 2 -- >>>>> doc/guides/nics/i40e.rst | 5 ----- >>>>> drivers/net/i40e/i40e_ethdev.c | 3 +-- >>>>> drivers/net/i40e/i40e_ethdev.h | 22 +++++++++++----------- >>>>> drivers/net/i40e/i40e_ethdev_vf.c | 36 >>>>> ++++++++++++++---------------------- >>>>> 5 files changed, 26 insertions(+), 42 deletions(-) >>>>> >>>>> diff --git a/config/common_base b/config/common_base index >>>>> 6b0d1cbbb..9e21c6865 100644 >>>>> --- a/config/common_base >>>>> +++ b/config/common_base >>>>> @@ -264,8 +264,6 @@ CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y >>>>> CONFIG_RTE_LIBRTE_I40E_16BYTE_RX_DESC=n >>>>> CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_PF=64 >>>>> CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM=4 >>>>> -# interval up to 8160 us, aligned to 2 (or default value) >>>>> -CONFIG_RTE_LIBRTE_I40E_ITR_INTERVAL=-1 >>>>> >>>>> # >>>>> # Compile burst-oriented FM10K PMD >>>>> diff --git a/doc/guides/nics/i40e.rst b/doc/guides/nics/i40e.rst >>>>> index >>>>> 18549bf5a..3fc4ceac7 100644 >>>>> --- a/doc/guides/nics/i40e.rst >>>>> +++ b/doc/guides/nics/i40e.rst >>>>> @@ -96,11 +96,6 @@ Please note that enabling debugging options may >>>> affect system performance. >>>>> >>>>> Number of queues reserved for each VMDQ Pool. >>>>> >>>>> -- ``CONFIG_RTE_LIBRTE_I40E_ITR_INTERVAL`` (default ``-1``) >>>>> - >>>>> - Interrupt Throttling interval. 
>>>>> - >>>>> - >>>>> Runtime Config Options >>>>> ~~~~~~~~~~~~~~~~~~~~~~ >>>>> >>>>> diff --git a/drivers/net/i40e/i40e_ethdev.c >>>>> b/drivers/net/i40e/i40e_ethdev.c index 13c5d3296..c8f9566e0 100644 >>>>> --- a/drivers/net/i40e/i40e_ethdev.c >>>>> +++ b/drivers/net/i40e/i40e_ethdev.c >>>>> @@ -1829,8 +1829,7 @@ __vsi_queues_bind_intr(struct i40e_vsi *vsi, >>>> uint16_t msix_vect, >>>>> /* Write first RX queue to Link list register as the head element */ >>>>> if (vsi->type != I40E_VSI_SRIOV) { >>>>> uint16_t interval = >>>>> - i40e_calc_itr_interval(RTE_LIBRTE_I40E_ITR_INTERVAL, 1, >>>>> - pf->support_multi_driver); >>>>> + i40e_calc_itr_interval(1, pf->support_multi_driver); >>>>> >>>>> if (msix_vect == I40E_MISC_VEC_ID) { >>>>> I40E_WRITE_REG(hw, I40E_PFINT_LNKLST0, diff --git >>>>> a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h >>>>> index 11c4c76bd..599993dac 100644 >>>>> --- a/drivers/net/i40e/i40e_ethdev.h >>>>> +++ b/drivers/net/i40e/i40e_ethdev.h >>>>> @@ -178,7 +178,7 @@ enum i40e_flxpld_layer_idx { >>>>> #define I40E_ITR_INDEX_NONE 3 >>>>> #define I40E_QUEUE_ITR_INTERVAL_DEFAULT 32 /* 32 us */ >>>>> #define I40E_QUEUE_ITR_INTERVAL_MAX 8160 /* 8160 us */ >>>>> -#define I40E_VF_QUEUE_ITR_INTERVAL_DEFAULT 8160 /* 8160 us */ >>>>> +#define I40E_VF_QUEUE_ITR_INTERVAL_DEFAULT 32 /* 32 us */ >>>>> /* Special FW support this floating VEB feature */ #define >>>>> FLOATING_VEB_SUPPORTED_FW_MAJ 5 #define >>>> FLOATING_VEB_SUPPORTED_FW_MIN >>>>> 0 @@ -1328,17 +1328,17 @@ i40e_align_floor(int n) } >>>>> >>>>> static inline uint16_t >>>>> -i40e_calc_itr_interval(int16_t interval, bool is_pf, bool >>>>> is_multi_drv) >>>>> +i40e_calc_itr_interval(bool is_pf, bool is_multi_drv) >>>>> { >>>>> - if (interval < 0 || interval > I40E_QUEUE_ITR_INTERVAL_MAX) { >>>>> - if (is_multi_drv) { >>>>> - interval = I40E_QUEUE_ITR_INTERVAL_MAX; >>>>> - } else { >>>>> - if (is_pf) >>>>> - interval = I40E_QUEUE_ITR_INTERVAL_DEFAULT; >>>>> - else >>>>> - interval = 
I40E_VF_QUEUE_ITR_INTERVAL_DEFAULT; >>>>> - } >>>>> + uint16_t interval = 0; >>>>> + >>>>> + if (is_multi_drv) { >>>>> + interval = I40E_QUEUE_ITR_INTERVAL_MAX; >>>>> + } else { >>>>> + if (is_pf) >>>>> + interval = I40E_QUEUE_ITR_INTERVAL_DEFAULT; >>>>> + else >>>>> + interval = I40E_VF_QUEUE_ITR_INTERVAL_DEFAULT; >>>>> } >>>>> >>>>> /* Convert to hardware count, as writing each 1 represents 2 us */ >>>>> diff --git a/drivers/net/i40e/i40e_ethdev_vf.c >>>>> b/drivers/net/i40e/i40e_ethdev_vf.c >>>>> index 804e44530..ad5c069e8 100644 >>>>> --- a/drivers/net/i40e/i40e_ethdev_vf.c >>>>> +++ b/drivers/net/i40e/i40e_ethdev_vf.c >>>>> @@ -44,6 +44,8 @@ >>>>> #define I40EVF_BUSY_WAIT_COUNT 50 >>>>> #define MAX_RESET_WAIT_CNT 20 >>>>> >>>>> +#define I40EVF_ALARM_INTERVAL 50000 /* us */ >>>>> + >>>>> struct i40evf_arq_msg_info { >>>>> enum virtchnl_ops ops; >>>>> enum i40e_status_code result; >>>>> @@ -1133,7 +1135,7 @@ i40evf_init_vf(struct rte_eth_dev *dev) >>>>> struct i40e_hw *hw = >>>> I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); >>>>> struct i40e_vf *vf = >>>> I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); >>>>> uint16_t interval = >>>>> - i40e_calc_itr_interval(RTE_LIBRTE_I40E_ITR_INTERVAL, 0, 0); >>>>> + i40e_calc_itr_interval(0, 0); >>>>> >>>>> vf->adapter = >> I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private); >>>>> vf->dev_data = dev->data; >>>>> @@ -1370,7 +1372,7 @@ i40evf_handle_aq_msg(struct rte_eth_dev *dev) >>>>> * void >>>>> */ >>>>> static void >>>>> -i40evf_dev_interrupt_handler(void *param) >>>>> +i40evf_dev_alarm_handler(void *param) >>>>> { >>>>> struct rte_eth_dev *dev = (struct rte_eth_dev *)param; >>>>> struct i40e_hw *hw = >>>> I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); >>>>> @@ -1399,6 +1401,8 @@ i40evf_dev_interrupt_handler(void *param) >>>>> >>>>> done: >>>>> i40evf_enable_irq0(hw); >>>>> + rte_eal_alarm_set(I40EVF_ALARM_INTERVAL, >>>>> + i40evf_dev_alarm_handler, dev); >>>>> } >>>>> >>>>> static int >>>>> @@ -1442,12 +1446,8 @@ 
i40evf_dev_init(struct rte_eth_dev *eth_dev) >>>>> return -1; >>>>> } >>>>> >>>>> - /* register callback func to eal lib */ >>>>> - rte_intr_callback_register(&pci_dev->intr_handle, >>>>> - i40evf_dev_interrupt_handler, (void *)eth_dev); >>>>> - >>>>> - /* enable uio intr after callback register */ >>>>> - rte_intr_enable(&pci_dev->intr_handle); >>>>> + rte_eal_alarm_set(I40EVF_ALARM_INTERVAL, >>>>> + i40evf_dev_alarm_handler, eth_dev); >>>>> >>>>> /* configure and enable device interrupt */ >>>>> i40evf_enable_irq0(hw); >>>>> @@ -1836,7 +1836,7 @@ i40evf_dev_rx_queue_intr_enable(struct >>>> rte_eth_dev *dev, uint16_t queue_id) >>>>> struct rte_intr_handle *intr_handle = &pci_dev->intr_handle; >>>>> struct i40e_hw *hw = >>>> I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); >>>>> uint16_t interval = >>>>> - i40e_calc_itr_interval(RTE_LIBRTE_I40E_ITR_INTERVAL, 0, 0); >>>>> + i40e_calc_itr_interval(0, 0); >>>>> uint16_t msix_intr; >>>>> >>>>> msix_intr = intr_handle->intr_vec[queue_id]; @@ -1859,8 +1859,6 >> @@ >>>>> i40evf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t >>>>> queue_id) >>>>> >>>>> I40EVF_WRITE_FLUSH(hw); >>>>> >>>>> - rte_intr_enable(&pci_dev->intr_handle); >>>>> - >>>>> return 0; >>>>> } >>>>> >>>>> @@ -2023,10 +2021,8 @@ i40evf_dev_start(struct rte_eth_dev *dev) >>>>> * queue interrupt to other VFIO vectors. >>>>> * So clear uio/vfio intr/evevnfd first to avoid failure. 
>>>>> */ >>>>> - if (dev->data->dev_conf.intr_conf.rxq != 0) { >>>>> - rte_intr_disable(intr_handle); >>>>> + if (dev->data->dev_conf.intr_conf.rxq != 0) >>>>> rte_intr_enable(intr_handle); >>>>> - } >>>>> >>>>> i40evf_enable_queues_intr(dev); >>>>> >>>>> @@ -2050,6 +2046,9 @@ i40evf_dev_stop(struct rte_eth_dev *dev) >>>>> >>>>> PMD_INIT_FUNC_TRACE(); >>>>> >>>>> + if (dev->data->dev_conf.intr_conf.rxq != 0) >>>>> + rte_intr_disable(intr_handle); >>>>> + >>>>> if (hw->adapter_stopped == 1) >>>>> return; >>>>> i40evf_stop_queues(dev); >>>>> @@ -2285,9 +2284,8 @@ static void >>>>> i40evf_dev_close(struct rte_eth_dev *dev) { >>>>> struct i40e_hw *hw = >>>> I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); >>>>> - struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev); >>>>> - struct rte_intr_handle *intr_handle = &pci_dev->intr_handle; >>>>> >>>>> + rte_eal_alarm_cancel(i40evf_dev_alarm_handler, dev); >>>>> i40evf_dev_stop(dev); >>>>> i40e_dev_free_queues(dev); >>>>> /* >>>>> @@ -2300,12 +2298,6 @@ i40evf_dev_close(struct rte_eth_dev *dev) >>>>> >>>>> i40evf_reset_vf(hw); >>>>> i40e_shutdown_adminq(hw); >>>>> - /* disable uio intr before callback unregister */ >>>>> - rte_intr_disable(intr_handle); >>>>> - >>>>> - /* unregister callback func from eal lib */ >>>>> - rte_intr_callback_unregister(intr_handle, >>>>> - i40evf_dev_interrupt_handler, dev); >>>>> i40evf_disable_irq0(hw); >>>>> } >>>>> >>>> >>>> Rather than adding a polling routine internally, why not change the >>>> driver to not support Link State or receive interrupts. Better yet, >>>> let the application decide. >>>> Keep the interrupt logic but only enable interrupts if application >>>> has requested LSC or recveive interrupt mode. >>> >>> The interrupt handler is not only for LSC (actually VF does not >>> support LSC) or rx interrupt mode, it is used for PF to VF message through >> admin queue which is always required. >> >> I guess the question is, is it possible to disable Rx interrupts? 
And if >> possible >> can the user control this enable/disable per interrupt source? > > The problem is that the Rx interrupt is shared with the admin queue interrupt; they are > enabled/disabled together.
OK, thanks, this clarifies. > So if we want to get admin queue messages from the interrupt, we have to suffer > the massive interrupt handling. > But the admin queue used for the pf/vf channel is not supposed to be closed (though > it can be delayed a little bit). > > Regards > Qi > >