> -----Original Message-----
> From: Burakov, Anatoly <anatoly.bura...@intel.com>
> Sent: Thursday, July 8, 2021 9:14 AM
> To: dev@dpdk.org; McDaniel, Timothy <timothy.mcdan...@intel.com>; Xing,
> Beilei <beilei.x...@intel.com>; Wu, Jingjing <jingjing...@intel.com>; Yang,
> Qiming <qiming.y...@intel.com>; Zhang, Qi Z <qi.z.zh...@intel.com>; Wang,
> Haiyue <haiyue.w...@intel.com>; Matan Azrad <ma...@nvidia.com>; Shahaf
> Shuler <shah...@nvidia.com>; Viacheslav Ovsiienko <viachesl...@nvidia.com>;
> Richardson, Bruce <bruce.richard...@intel.com>; Ananyev, Konstantin
> <konstantin.anan...@intel.com>
> Cc: Loftus, Ciara <ciara.lof...@intel.com>; Hunt, David <david.h...@intel.com>
> Subject: [PATCH v8 1/7] power_intrinsics: use callbacks for comparison
>
> Previously, the semantics of power monitor were such that we were
> checking current value against the expected value, and if they matched,
> then the sleep was aborted. This is somewhat inflexible, because it only
> allowed us to check for a specific value in a specific way.
>
> This commit replaces the comparison with a user callback mechanism, so
> that any PMD (or other code) using `rte_power_monitor()` can define
> their own comparison semantics and decision making on how to detect the
> need to abort the entering of power optimized state.
>
> Existing implementations are adjusted to follow the new semantics.
>
> Suggested-by: Konstantin Ananyev <konstantin.anan...@intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.anan...@intel.com>
> ---
>
> Notes:
> v4:
> - Return error if callback is set to NULL
> - Replace raw number with a macro in monitor condition opaque data
>
> v2:
> - Use callback mechanism for more flexibility
> - Address feedback from Konstantin
>
> doc/guides/rel_notes/release_21_08.rst | 2 ++
> drivers/event/dlb2/dlb2.c | 17 ++++++++--
> drivers/net/i40e/i40e_rxtx.c | 20 +++++++----
> drivers/net/iavf/iavf_rxtx.c | 20 +++++++----
> drivers/net/ice/ice_rxtx.c | 20 +++++++----
> drivers/net/ixgbe/ixgbe_rxtx.c | 20 +++++++----
> drivers/net/mlx5/mlx5_rx.c | 17 ++++++++--
> .../include/generic/rte_power_intrinsics.h | 33 +++++++++++++++----
> lib/eal/x86/rte_power_intrinsics.c | 17 +++++-----
> 9 files changed, 122 insertions(+), 44 deletions(-)
>
> diff --git a/doc/guides/rel_notes/release_21_08.rst
> b/doc/guides/rel_notes/release_21_08.rst
> index c92e016783..65910de348 100644
> --- a/doc/guides/rel_notes/release_21_08.rst
> +++ b/doc/guides/rel_notes/release_21_08.rst
> @@ -135,6 +135,8 @@ API Changes
> * eal: ``rte_strscpy`` sets ``rte_errno`` to ``E2BIG`` in case of string
> truncation.
>
> +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
> +
>
> ABI Changes
> -----------
> diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> index eca183753f..252bbd8d5e 100644
> --- a/drivers/event/dlb2/dlb2.c
> +++ b/drivers/event/dlb2/dlb2.c
> @@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port,
> int num)
> }
> }
>
> +#define CLB_MASK_IDX 0
> +#define CLB_VAL_IDX 1
> +static int
> +dlb2_monitor_callback(const uint64_t val,
> + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
> +{
> + /* abort if the value matches */
> + return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 :
> 0;
> +}
> +
> static inline int
> dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
> struct dlb2_eventdev_port *ev_port,
> @@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
> expected_value = 0;
>
> pmc.addr = monitor_addr;
> - pmc.val = expected_value;
> - pmc.mask = qe_mask.raw_qe[1];
> + /* store expected value and comparison mask in opaque data */
> + pmc.opaque[CLB_VAL_IDX] = expected_value;
> + pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1];
> + /* set up callback */
> + pmc.fn = dlb2_monitor_callback;
> pmc.size = sizeof(uint64_t);
>
> rte_power_monitor(&pmc, timeout + start_ticks);
> diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
> index 8d65f287f4..65f325ede1 100644
> --- a/drivers/net/i40e/i40e_rxtx.c
> +++ b/drivers/net/i40e/i40e_rxtx.c
> @@ -81,6 +81,18 @@
> #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \
> (PKT_TX_OFFLOAD_MASK ^
> I40E_TX_OFFLOAD_SIMPLE_SUP_MASK)
>
> +static int
> +i40e_monitor_callback(const uint64_t value,
> + const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> + const uint64_t m = rte_cpu_to_le_64(1 <<
> I40E_RX_DESC_STATUS_DD_SHIFT);
> + /*
> + * we expect the DD bit to be set to 1 if this descriptor was already
> + * written to.
> + */
> + return (value & m) == m ? -1 : 0;
> +}
> +
> int
> i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond
> *pmc)
> {
> @@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
> /* watch for changes in status bit */
> pmc->addr = &rxdp->wb.qword1.status_error_len;
>
> - /*
> - * we expect the DD bit to be set to 1 if this descriptor was already
> - * written to.
> - */
> - pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
> - pmc->mask = rte_cpu_to_le_64(1 <<
> I40E_RX_DESC_STATUS_DD_SHIFT);
> + /* comparison callback */
> + pmc->fn = i40e_monitor_callback;
>
> /* registers are 64-bit */
> pmc->size = sizeof(uint64_t);
> diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
> index f817fbc49b..d61b32fcee 100644
> --- a/drivers/net/iavf/iavf_rxtx.c
> +++ b/drivers/net/iavf/iavf_rxtx.c
> @@ -57,6 +57,18 @@ iavf_proto_xtr_type_to_rxdid(uint8_t flex_type)
> rxdid_map[flex_type] :
> IAVF_RXDID_COMMS_OVS_1;
> }
>
> +static int
> +iavf_monitor_callback(const uint64_t value,
> + const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> + const uint64_t m = rte_cpu_to_le_64(1 <<
> IAVF_RX_DESC_STATUS_DD_SHIFT);
> + /*
> + * we expect the DD bit to be set to 1 if this descriptor was already
> + * written to.
> + */
> + return (value & m) == m ? -1 : 0;
> +}
> +
> int
> iavf_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
> {
> @@ -69,12 +81,8 @@ iavf_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
> /* watch for changes in status bit */
> pmc->addr = &rxdp->wb.qword1.status_error_len;
>
> - /*
> - * we expect the DD bit to be set to 1 if this descriptor was already
> - * written to.
> - */
> - pmc->val = rte_cpu_to_le_64(1 << IAVF_RX_DESC_STATUS_DD_SHIFT);
> - pmc->mask = rte_cpu_to_le_64(1 <<
> IAVF_RX_DESC_STATUS_DD_SHIFT);
> + /* comparison callback */
> + pmc->fn = iavf_monitor_callback;
>
> /* registers are 64-bit */
> pmc->size = sizeof(uint64_t);
> diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
> index 3f6e735984..5d7ab4f047 100644
> --- a/drivers/net/ice/ice_rxtx.c
> +++ b/drivers/net/ice/ice_rxtx.c
> @@ -27,6 +27,18 @@ uint64_t
> rte_net_ice_dynflag_proto_xtr_ipv6_flow_mask;
> uint64_t rte_net_ice_dynflag_proto_xtr_tcp_mask;
> uint64_t rte_net_ice_dynflag_proto_xtr_ip_offset_mask;
>
> +static int
> +ice_monitor_callback(const uint64_t value,
> + const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> + const uint64_t m = rte_cpu_to_le_16(1 <<
> ICE_RX_FLEX_DESC_STATUS0_DD_S);
> + /*
> + * we expect the DD bit to be set to 1 if this descriptor was already
> + * written to.
> + */
> + return (value & m) == m ? -1 : 0;
> +}
> +
> int
> ice_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
> {
> @@ -39,12 +51,8 @@ ice_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
> /* watch for changes in status bit */
> pmc->addr = &rxdp->wb.status_error0;
>
> - /*
> - * we expect the DD bit to be set to 1 if this descriptor was already
> - * written to.
> - */
> - pmc->val = rte_cpu_to_le_16(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S);
> - pmc->mask = rte_cpu_to_le_16(1 <<
> ICE_RX_FLEX_DESC_STATUS0_DD_S);
> + /* comparison callback */
> + pmc->fn = ice_monitor_callback;
>
> /* register is 16-bit */
> pmc->size = sizeof(uint16_t);
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index d69f36e977..c814a28cb4 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -1369,6 +1369,18 @@ const uint32_t
> RTE_PTYPE_INNER_L3_IPV4_EXT |
> RTE_PTYPE_INNER_L4_UDP,
> };
>
> +static int
> +ixgbe_monitor_callback(const uint64_t value,
> + const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ]
> __rte_unused)
> +{
> + const uint64_t m = rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD);
> + /*
> + * we expect the DD bit to be set to 1 if this descriptor was already
> + * written to.
> + */
> + return (value & m) == m ? -1 : 0;
> +}
> +
> int
> ixgbe_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond
> *pmc)
> {
> @@ -1381,12 +1393,8 @@ ixgbe_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
> /* watch for changes in status bit */
> pmc->addr = &rxdp->wb.upper.status_error;
>
> - /*
> - * we expect the DD bit to be set to 1 if this descriptor was already
> - * written to.
> - */
> - pmc->val = rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD);
> - pmc->mask = rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD);
> + /* comparison callback */
> + pmc->fn = ixgbe_monitor_callback;
>
> /* the registers are 32-bit */
> pmc->size = sizeof(uint32_t);
> diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
> index 777a1d6e45..17370b77dc 100644
> --- a/drivers/net/mlx5/mlx5_rx.c
> +++ b/drivers/net/mlx5/mlx5_rx.c
> @@ -269,6 +269,18 @@ mlx5_rx_queue_count(struct rte_eth_dev *dev,
> uint16_t rx_queue_id)
> return rx_queue_count(rxq);
> }
>
> +#define CLB_VAL_IDX 0
> +#define CLB_MSK_IDX 1
> +static int
> +mlx_monitor_callback(const uint64_t value,
> + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
> +{
> + const uint64_t m = opaque[CLB_MSK_IDX];
> + const uint64_t v = opaque[CLB_VAL_IDX];
> +
> + return (value & m) == v ? -1 : 0;
> +}
> +
> int mlx5_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond
> *pmc)
> {
> struct mlx5_rxq_data *rxq = rx_queue;
> @@ -282,8 +294,9 @@ int mlx5_get_monitor_addr(void *rx_queue, struct
> rte_power_monitor_cond *pmc)
> return -rte_errno;
> }
> pmc->addr = &cqe->op_own;
> - pmc->val = !!idx;
> - pmc->mask = MLX5_CQE_OWNER_MASK;
> + pmc->opaque[CLB_VAL_IDX] = !!idx;
> + pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK;
> + pmc->fn = mlx_monitor_callback;
> pmc->size = sizeof(uint8_t);
> return 0;
> }
> diff --git a/lib/eal/include/generic/rte_power_intrinsics.h
> b/lib/eal/include/generic/rte_power_intrinsics.h
> index dddca3d41c..c9aa52a86d 100644
> --- a/lib/eal/include/generic/rte_power_intrinsics.h
> +++ b/lib/eal/include/generic/rte_power_intrinsics.h
> @@ -18,19 +18,38 @@
> * which are architecture-dependent.
> */
>
> +/** Size of the opaque data in monitor condition */
> +#define RTE_POWER_MONITOR_OPAQUE_SZ 4
> +
> +/**
> + * Callback definition for monitoring conditions. Callbacks with this
> signature
> + * will be used by `rte_power_monitor()` to check if the entering of power
> + * optimized state should be aborted.
> + *
> + * @param val
> + * The value read from memory.
> + * @param opaque
> + * Callback-specific data.
> + *
> + * @return
> + * 0 if entering of power optimized state should proceed
> + * -1 if entering of power optimized state should be aborted
> + */
> +typedef int (*rte_power_monitor_clb_t)(const uint64_t val,
> + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]);
> struct rte_power_monitor_cond {
> volatile void *addr; /**< Address to monitor for changes */
> - uint64_t val; /**< If the `mask` is non-zero, location pointed
> - * to by `addr` will be read and compared
> - * against this value.
> - */
> - uint64_t mask; /**< 64-bit mask to extract value read from `addr` */
> - uint8_t size; /**< Data size (in bytes) that will be used to compare
> - * expected value (`val`) with data read from the
> + uint8_t size; /**< Data size (in bytes) that will be read from the
> * monitored memory location (`addr`). Can be 1, 2,
> * 4, or 8. Supplying any other value will result in
> * an error.
> */
> + rte_power_monitor_clb_t fn; /**< Callback to be used to check if
> + * entering power optimized state should
> + * be aborted.
> + */
> + uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ];
> + /**< Callback-specific data */
> };
>
> /**
> diff --git a/lib/eal/x86/rte_power_intrinsics.c
> b/lib/eal/x86/rte_power_intrinsics.c
> index 39ea9fdecd..66fea28897 100644
> --- a/lib/eal/x86/rte_power_intrinsics.c
> +++ b/lib/eal/x86/rte_power_intrinsics.c
> @@ -76,6 +76,7 @@ rte_power_monitor(const struct
> rte_power_monitor_cond *pmc,
> const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
> const unsigned int lcore_id = rte_lcore_id();
> struct power_wait_status *s;
> + uint64_t cur_value;
>
> /* prevent user from running this instruction if it's not supported */
> if (!wait_supported)
> @@ -91,6 +92,9 @@ rte_power_monitor(const struct
> rte_power_monitor_cond *pmc,
> if (__check_val_size(pmc->size) < 0)
> return -EINVAL;
>
> + if (pmc->fn == NULL)
> + return -EINVAL;
> +
> s = &wait_status[lcore_id];
>
> /* update sleep address */
> @@ -110,16 +114,11 @@ rte_power_monitor(const struct
> rte_power_monitor_cond *pmc,
> /* now that we've put this address into monitor, we can unlock */
> rte_spinlock_unlock(&s->lock);
>
> - /* if we have a comparison mask, we might not need to sleep at all */
> - if (pmc->mask) {
> - const uint64_t cur_value = __get_umwait_val(
> - pmc->addr, pmc->size);
> - const uint64_t masked = cur_value & pmc->mask;
> + cur_value = __get_umwait_val(pmc->addr, pmc->size);
>
> - /* if the masked value is already matching, abort */
> - if (masked == pmc->val)
> - goto end;
> - }
> + /* check if callback indicates we should abort */
> + if (pmc->fn(cur_value, pmc->opaque) != 0)
> + goto end;
>
> /* execute UMWAIT */
> asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
> --
> 2.25.1
DLB changes look good to me
Acked-by: timothy.mcdan...@intel.com