> -----Original Message-----
> From: Burakov, Anatoly <anatoly.bura...@intel.com>
> Sent: Thursday, July 8, 2021 9:14 AM
> To: dev@dpdk.org; McDaniel, Timothy <timothy.mcdan...@intel.com>; Xing,
> Beilei <beilei.x...@intel.com>; Wu, Jingjing <jingjing...@intel.com>; Yang,
> Qiming <qiming.y...@intel.com>; Zhang, Qi Z <qi.z.zh...@intel.com>; Wang,
> Haiyue <haiyue.w...@intel.com>; Matan Azrad <ma...@nvidia.com>; Shahaf
> Shuler <shah...@nvidia.com>; Viacheslav Ovsiienko <viachesl...@nvidia.com>;
> Richardson, Bruce <bruce.richard...@intel.com>; Ananyev, Konstantin
> <konstantin.anan...@intel.com>
> Cc: Loftus, Ciara <ciara.lof...@intel.com>; Hunt, David <david.h...@intel.com>
> Subject: [PATCH v8 1/7] power_intrinsics: use callbacks for comparison
> 
> Previously, the semantics of power monitor were such that we were
> checking current value against the expected value, and if they matched,
> then the sleep was aborted. This is somewhat inflexible, because it only
> allowed us to check for a specific value in a specific way.
> 
> This commit replaces the comparison with a user callback mechanism, so
> that any PMD (or other code) using `rte_power_monitor()` can define
> their own comparison semantics and decision making on how to detect the
> need to abort the entering of power optimized state.
> 
> Existing implementations are adjusted to follow the new semantics.
> 
> Suggested-by: Konstantin Ananyev <konstantin.anan...@intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.anan...@intel.com>
> ---
> 
> Notes:
>     v4:
>     - Return error if callback is set to NULL
>     - Replace raw number with a macro in monitor condition opaque data
> 
>     v2:
>     - Use callback mechanism for more flexibility
>     - Address feedback from Konstantin
> 
>  doc/guides/rel_notes/release_21_08.rst        |  2 ++
>  drivers/event/dlb2/dlb2.c                     | 17 ++++++++--
>  drivers/net/i40e/i40e_rxtx.c                  | 20 +++++++----
>  drivers/net/iavf/iavf_rxtx.c                  | 20 +++++++----
>  drivers/net/ice/ice_rxtx.c                    | 20 +++++++----
>  drivers/net/ixgbe/ixgbe_rxtx.c                | 20 +++++++----
>  drivers/net/mlx5/mlx5_rx.c                    | 17 ++++++++--
>  .../include/generic/rte_power_intrinsics.h    | 33 +++++++++++++++----
>  lib/eal/x86/rte_power_intrinsics.c            | 17 +++++-----
>  9 files changed, 122 insertions(+), 44 deletions(-)
> 
> diff --git a/doc/guides/rel_notes/release_21_08.rst
> b/doc/guides/rel_notes/release_21_08.rst
> index c92e016783..65910de348 100644
> --- a/doc/guides/rel_notes/release_21_08.rst
> +++ b/doc/guides/rel_notes/release_21_08.rst
> @@ -135,6 +135,8 @@ API Changes
>  * eal: ``rte_strscpy`` sets ``rte_errno`` to ``E2BIG`` in case of string
>    truncation.
> 
> +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
> +
> 
>  ABI Changes
>  -----------
> diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> index eca183753f..252bbd8d5e 100644
> --- a/drivers/event/dlb2/dlb2.c
> +++ b/drivers/event/dlb2/dlb2.c
> @@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port,
> int num)
>       }
>  }
> 
> +#define CLB_MASK_IDX 0
> +#define CLB_VAL_IDX 1
> +static int
> +dlb2_monitor_callback(const uint64_t val,
> +             const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
> +{
> +     /* abort if the value matches */
> +     return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 :
> 0;
> +}
> +
>  static inline int
>  dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
>                 struct dlb2_eventdev_port *ev_port,
> @@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
>                       expected_value = 0;
> 
>               pmc.addr = monitor_addr;
> -             pmc.val = expected_value;
> -             pmc.mask = qe_mask.raw_qe[1];
> +             /* store expected value and comparison mask in opaque data */
> +             pmc.opaque[CLB_VAL_IDX] = expected_value;
> +             pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1];
> +             /* set up callback */
> +             pmc.fn = dlb2_monitor_callback;
>               pmc.size = sizeof(uint64_t);
> 
>               rte_power_monitor(&pmc, timeout + start_ticks);
> diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
> index 8d65f287f4..65f325ede1 100644
> --- a/drivers/net/i40e/i40e_rxtx.c
> +++ b/drivers/net/i40e/i40e_rxtx.c
> @@ -81,6 +81,18 @@
>  #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \
>               (PKT_TX_OFFLOAD_MASK ^
> I40E_TX_OFFLOAD_SIMPLE_SUP_MASK)
> 
> +static int
> +i40e_monitor_callback(const uint64_t value,
> +             const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> +     const uint64_t m = rte_cpu_to_le_64(1 <<
> I40E_RX_DESC_STATUS_DD_SHIFT);
> +     /*
> +      * we expect the DD bit to be set to 1 if this descriptor was already
> +      * written to.
> +      */
> +     return (value & m) == m ? -1 : 0;
> +}
> +
>  int
>  i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond
> *pmc)
>  {
> @@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>       /* watch for changes in status bit */
>       pmc->addr = &rxdp->wb.qword1.status_error_len;
> 
> -     /*
> -      * we expect the DD bit to be set to 1 if this descriptor was already
> -      * written to.
> -      */
> -     pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
> -     pmc->mask = rte_cpu_to_le_64(1 <<
> I40E_RX_DESC_STATUS_DD_SHIFT);
> +     /* comparison callback */
> +     pmc->fn = i40e_monitor_callback;
> 
>       /* registers are 64-bit */
>       pmc->size = sizeof(uint64_t);
> diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
> index f817fbc49b..d61b32fcee 100644
> --- a/drivers/net/iavf/iavf_rxtx.c
> +++ b/drivers/net/iavf/iavf_rxtx.c
> @@ -57,6 +57,18 @@ iavf_proto_xtr_type_to_rxdid(uint8_t flex_type)
>                               rxdid_map[flex_type] :
> IAVF_RXDID_COMMS_OVS_1;
>  }
> 
> +static int
> +iavf_monitor_callback(const uint64_t value,
> +             const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> +     const uint64_t m = rte_cpu_to_le_64(1 <<
> IAVF_RX_DESC_STATUS_DD_SHIFT);
> +     /*
> +      * we expect the DD bit to be set to 1 if this descriptor was already
> +      * written to.
> +      */
> +     return (value & m) == m ? -1 : 0;
> +}
> +
>  int
>  iavf_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
>  {
> @@ -69,12 +81,8 @@ iavf_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>       /* watch for changes in status bit */
>       pmc->addr = &rxdp->wb.qword1.status_error_len;
> 
> -     /*
> -      * we expect the DD bit to be set to 1 if this descriptor was already
> -      * written to.
> -      */
> -     pmc->val = rte_cpu_to_le_64(1 << IAVF_RX_DESC_STATUS_DD_SHIFT);
> -     pmc->mask = rte_cpu_to_le_64(1 <<
> IAVF_RX_DESC_STATUS_DD_SHIFT);
> +     /* comparison callback */
> +     pmc->fn = iavf_monitor_callback;
> 
>       /* registers are 64-bit */
>       pmc->size = sizeof(uint64_t);
> diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
> index 3f6e735984..5d7ab4f047 100644
> --- a/drivers/net/ice/ice_rxtx.c
> +++ b/drivers/net/ice/ice_rxtx.c
> @@ -27,6 +27,18 @@ uint64_t
> rte_net_ice_dynflag_proto_xtr_ipv6_flow_mask;
>  uint64_t rte_net_ice_dynflag_proto_xtr_tcp_mask;
>  uint64_t rte_net_ice_dynflag_proto_xtr_ip_offset_mask;
> 
> +static int
> +ice_monitor_callback(const uint64_t value,
> +             const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> +     const uint64_t m = rte_cpu_to_le_16(1 <<
> ICE_RX_FLEX_DESC_STATUS0_DD_S);
> +     /*
> +      * we expect the DD bit to be set to 1 if this descriptor was already
> +      * written to.
> +      */
> +     return (value & m) == m ? -1 : 0;
> +}
> +
>  int
>  ice_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
>  {
> @@ -39,12 +51,8 @@ ice_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>       /* watch for changes in status bit */
>       pmc->addr = &rxdp->wb.status_error0;
> 
> -     /*
> -      * we expect the DD bit to be set to 1 if this descriptor was already
> -      * written to.
> -      */
> -     pmc->val = rte_cpu_to_le_16(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S);
> -     pmc->mask = rte_cpu_to_le_16(1 <<
> ICE_RX_FLEX_DESC_STATUS0_DD_S);
> +     /* comparison callback */
> +     pmc->fn = ice_monitor_callback;
> 
>       /* register is 16-bit */
>       pmc->size = sizeof(uint16_t);
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index d69f36e977..c814a28cb4 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -1369,6 +1369,18 @@ const uint32_t
>               RTE_PTYPE_INNER_L3_IPV4_EXT |
> RTE_PTYPE_INNER_L4_UDP,
>  };
> 
> +static int
> +ixgbe_monitor_callback(const uint64_t value,
> +             const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> +     const uint64_t m = rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD);
> +     /*
> +      * we expect the DD bit to be set to 1 if this descriptor was already
> +      * written to.
> +      */
> +     return (value & m) == m ? -1 : 0;
> +}
> +
>  int
>  ixgbe_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond
> *pmc)
>  {
> @@ -1381,12 +1393,8 @@ ixgbe_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>       /* watch for changes in status bit */
>       pmc->addr = &rxdp->wb.upper.status_error;
> 
> -     /*
> -      * we expect the DD bit to be set to 1 if this descriptor was already
> -      * written to.
> -      */
> -     pmc->val = rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD);
> -     pmc->mask = rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD);
> +     /* comparison callback */
> +     pmc->fn = ixgbe_monitor_callback;
> 
>       /* the registers are 32-bit */
>       pmc->size = sizeof(uint32_t);
> diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
> index 777a1d6e45..17370b77dc 100644
> --- a/drivers/net/mlx5/mlx5_rx.c
> +++ b/drivers/net/mlx5/mlx5_rx.c
> @@ -269,6 +269,18 @@ mlx5_rx_queue_count(struct rte_eth_dev *dev,
> uint16_t rx_queue_id)
>       return rx_queue_count(rxq);
>  }
> 
> +#define CLB_VAL_IDX 0
> +#define CLB_MSK_IDX 1
> +static int
> +mlx_monitor_callback(const uint64_t value,
> +             const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
> +{
> +     const uint64_t m = opaque[CLB_MSK_IDX];
> +     const uint64_t v = opaque[CLB_VAL_IDX];
> +
> +     return (value & m) == v ? -1 : 0;
> +}
> +
>  int mlx5_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond
> *pmc)
>  {
>       struct mlx5_rxq_data *rxq = rx_queue;
> @@ -282,8 +294,9 @@ int mlx5_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
>               return -rte_errno;
>       }
>       pmc->addr = &cqe->op_own;
> -     pmc->val =  !!idx;
> -     pmc->mask = MLX5_CQE_OWNER_MASK;
> +     pmc->opaque[CLB_VAL_IDX] = !!idx;
> +     pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK;
> +     pmc->fn = mlx_monitor_callback;
>       pmc->size = sizeof(uint8_t);
>       return 0;
>  }
> diff --git a/lib/eal/include/generic/rte_power_intrinsics.h
> b/lib/eal/include/generic/rte_power_intrinsics.h
> index dddca3d41c..c9aa52a86d 100644
> --- a/lib/eal/include/generic/rte_power_intrinsics.h
> +++ b/lib/eal/include/generic/rte_power_intrinsics.h
> @@ -18,19 +18,38 @@
>   * which are architecture-dependent.
>   */
> 
> +/** Size of the opaque data in monitor condition */
> +#define RTE_POWER_MONITOR_OPAQUE_SZ 4
> +
> +/**
> + * Callback definition for monitoring conditions. Callbacks with this 
> signature
> + * will be used by `rte_power_monitor()` to check if the entering of power
> + * optimized state should be aborted.
> + *
> + * @param val
> + *   The value read from memory.
> + * @param opaque
> + *   Callback-specific data.
> + *
> + * @return
> + *   0 if entering of power optimized state should proceed
> + *   -1 if entering of power optimized state should be aborted
> + */
> +typedef int (*rte_power_monitor_clb_t)(const uint64_t val,
> +             const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]);
>  struct rte_power_monitor_cond {
>       volatile void *addr;  /**< Address to monitor for changes */
> -     uint64_t val;         /**< If the `mask` is non-zero, location pointed
> -                            *   to by `addr` will be read and compared
> -                            *   against this value.
> -                            */
> -     uint64_t mask;   /**< 64-bit mask to extract value read from `addr` */
> -     uint8_t size;    /**< Data size (in bytes) that will be used to compare
> -                       *   expected value (`val`) with data read from the
> +     uint8_t size;    /**< Data size (in bytes) that will be read from the
>                         *   monitored memory location (`addr`). Can be 1, 2,
>                         *   4, or 8. Supplying any other value will result in
>                         *   an error.
>                         */
> +     rte_power_monitor_clb_t fn; /**< Callback to be used to check if
> +                                  *   entering power optimized state should
> +                                  *   be aborted.
> +                                  */
> +     uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ];
> +     /**< Callback-specific data */
>  };
> 
>  /**
> diff --git a/lib/eal/x86/rte_power_intrinsics.c
> b/lib/eal/x86/rte_power_intrinsics.c
> index 39ea9fdecd..66fea28897 100644
> --- a/lib/eal/x86/rte_power_intrinsics.c
> +++ b/lib/eal/x86/rte_power_intrinsics.c
> @@ -76,6 +76,7 @@ rte_power_monitor(const struct
> rte_power_monitor_cond *pmc,
>       const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
>       const unsigned int lcore_id = rte_lcore_id();
>       struct power_wait_status *s;
> +     uint64_t cur_value;
> 
>       /* prevent user from running this instruction if it's not supported */
>       if (!wait_supported)
> @@ -91,6 +92,9 @@ rte_power_monitor(const struct
> rte_power_monitor_cond *pmc,
>       if (__check_val_size(pmc->size) < 0)
>               return -EINVAL;
> 
> +     if (pmc->fn == NULL)
> +             return -EINVAL;
> +
>       s = &wait_status[lcore_id];
> 
>       /* update sleep address */
> @@ -110,16 +114,11 @@ rte_power_monitor(const struct
> rte_power_monitor_cond *pmc,
>       /* now that we've put this address into monitor, we can unlock */
>       rte_spinlock_unlock(&s->lock);
> 
> -     /* if we have a comparison mask, we might not need to sleep at all */
> -     if (pmc->mask) {
> -             const uint64_t cur_value = __get_umwait_val(
> -                             pmc->addr, pmc->size);
> -             const uint64_t masked = cur_value & pmc->mask;
> +     cur_value = __get_umwait_val(pmc->addr, pmc->size);
> 
> -             /* if the masked value is already matching, abort */
> -             if (masked == pmc->val)
> -                     goto end;
> -     }
> +     /* check if callback indicates we should abort */
> +     if (pmc->fn(cur_value, pmc->opaque) != 0)
> +             goto end;
> 
>       /* execute UMWAIT */
>       asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
> --
> 2.25.1

DLB changes look good to me

Acked-by: timothy.mcdan...@intel.com

Reply via email to