> -----Original Message----- > From: Neil Horman [mailto:nhorman at tuxdriver.com] > Sent: Thursday, August 28, 2014 9:38 PM > To: dev at dpdk.org > Cc: Neil Horman; Ananyev, Konstantin; thomas.monjalon at 6wind.com > Subject: [PATCHv4] librte_acl make it build/work for 'default' target > > Make ACL library to build/work on 'default' architecture: > - make rte_acl_classify_scalar really scalar > (make sure it wouldn't use sse4 instrincts through resolve_priority()). > - Provide two versions of rte_acl_classify code path: > rte_acl_classify_sse() - could be build and used only on systems with sse4.2 > and upper, return -ENOTSUP on lower arch. > rte_acl_classify_scalar() - a slower version, but could be build and used > on all systems. > - keep common code shared between these two codepaths. > > v2 chages: > run-time selection of most appropriate code-path for given ISA. > By default the highest supprted one is selected. > User can still override that selection by manually assigning new value to > the global function pointer rte_acl_default_classify. > rte_acl_classify() becomes a macro calling whatever rte_acl_default_classify > points to. > > V3 Changes > Updated classify pointer to be a function so as to better preserve ABI > REmoved macro definitions for match check functions to make them static > inline > > V4 Changes > Rewrote classification selection mechanim to use a function table, so that we > can just store the preferred alg in the rte_acl_ctx struct so that > multiprocess > access works. I understand that leaves us with an extra load instruction, > but I > think thats ok, because it also allows... > > Addition of a new function rte_acl_classify_alg. This function lets you > specify an enum value to override the acl contexts default algorith when > doing a > classification. This allows an application to specify a classification > algorithm without needing to pulicize each method. I know there was concern > over keeping those methods public, but we don't have a static ABI at the > moment, > so this seems to me a reasonable thing to do, as it gives us less of an ABI > surface to worry about.
Good way to overcome the problem. >From what I am seeing it adds a tiny slowdown (as expected) ... Though it provides a good flexibility and I don't have any better ideas. So I'd say let stick with that approach. Below are few technical comments. Thanks Konstantin > > Fixed misc missed static declarations > > Removed acl_match_check.h and moved match_check function to acl_run.h > > typdeffed function pointer to match check. > > Signed-off-by: Neil Horman <nhorman at tuxdriver.com> > CC: konstantin.ananyev at intel.com > CC: thomas.monjalon at 6wind.com > --- > app/test-acl/main.c | 13 +- > app/test/test_acl.c | 10 +- > lib/librte_acl/Makefile | 5 +- > lib/librte_acl/acl.h | 1 + > lib/librte_acl/acl_bld.c | 5 +- > lib/librte_acl/acl_run.c | 944 > ---------------------------------------- > lib/librte_acl/acl_run.h | 271 ++++++++++++ > lib/librte_acl/acl_run_scalar.c | 197 +++++++++ > lib/librte_acl/acl_run_sse.c | 630 +++++++++++++++++++++++++++ > lib/librte_acl/rte_acl.c | 62 +++ > lib/librte_acl/rte_acl.h | 66 ++- > 11 files changed, 1208 insertions(+), 996 deletions(-) > delete mode 100644 lib/librte_acl/acl_run.c > create mode 100644 lib/librte_acl/acl_run.h > create mode 100644 lib/librte_acl/acl_run_scalar.c > create mode 100644 lib/librte_acl/acl_run_sse.c > > diff --git a/app/test/test_acl.c b/app/test/test_acl.c > index 869f6d3..2169f59 100644 > --- a/app/test/test_acl.c > +++ b/app/test/test_acl.c > @@ -859,7 +859,7 @@ test_invalid_parameters(void) > } > > /* cover invalid but positive categories in classify */ > - result = rte_acl_classify_scalar(acx, NULL, NULL, 0, 3); > + result = rte_acl_classify(acx, NULL, NULL, 0, 3); Typo, should be: rte_acl_classify_alg(acx, RTE_ACL_CLASSIFY_SCALAR, NULL, NULL, 0, 3); > diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h > index b9d63fd..9236b7b 100644 > --- a/lib/librte_acl/acl.h > +++ b/lib/librte_acl/acl.h > @@ -168,6 +168,7 @@ struct rte_acl_ctx { > void *mem; > size_t mem_sz; > struct rte_acl_config config; /* copy of build config. */ > + enum rte_acl_classify_alg alg; > }; Each rte_acl_build() will reset all fields of rte_acl_ctx starting from num_categories and below. So we need to move alg somewhere above num_categories: --- a/lib/librte_acl/acl.h +++ b/lib/librte_acl/acl.h @@ -153,6 +153,7 @@ struct rte_acl_ctx { /** Name of the ACL context. */ int32_t socket_id; /** Socket ID to allocate memory from. */ + enum rte_acl_classify_alg alg; void *rules; uint32_t max_rules; uint32_t rule_sz; @@ -168,9 +169,11 @@ struct rte_acl_ctx { void *mem; size_t mem_sz; struct rte_acl_config config; /* copy of build config. */ - enum rte_acl_classify_alg alg; }; > diff --git a/lib/librte_acl/acl_run_scalar.c b/lib/librte_acl/acl_run_scalar.c > new file mode 100644 > index 0000000..4bf58c7 > --- /dev/null > +++ b/lib/librte_acl/acl_run_scalar.c > @@ -0,0 +1,197 @@ > + > +#include "acl_run.h" > + > +int > +rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data, > + uint32_t *results, uint32_t num, uint32_t categories); No need to put this declaration here. I think you can put both rte_acl_classify_sse(), rte_acl_classify_scalar() into acl.h (it is internal lib header). And remove another declarations of these functions from rte_acl.c. > diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c > new file mode 100644 > index 0000000..7ae63dd > --- /dev/null > +++ b/lib/librte_acl/acl_run_sse.c > +#include "acl_run.h" > + + +int +rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + Move to acl.h. > diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c > index 7c288bd..741bed4 100644 > --- a/lib/librte_acl/rte_acl.c > +++ b/lib/librte_acl/rte_acl.c > @@ -33,11 +33,72 @@ > > #include <rte_acl.h> > #include "acl.h" > +#include "acl_run.h" acl_run.h contains defintions for a lot of functions and should be included only by acl_run_*.c. I think it is better to move typedef int (*rte_acl_classify_t) into acl.h and don't include acl_run.h here. > > #define BIT_SIZEOF(x) (sizeof(x) * CHAR_BIT) > > TAILQ_HEAD(rte_acl_list, rte_tailq_entry); > > +extern int > +rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data, > + uint32_t *results, uint32_t num, uint32_t categories); > + > +extern int > +rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data, > + uint32_t *results, uint32_t num, uint32_t categories); > + As above: I think it is safe to move these declarations into acl.h. > +static rte_acl_classify_t classify_fns[] = { > + [RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar, > + [RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar, > + [RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse, > +}; static const static rte_acl_classify_t classify_fns[] ? > + > + > +extern int > +rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data, > + uint32_t *results, uint32_t num, uint32_t categories); Duplicate. > + > +/* by default, use always avaialbe scalar code path. */ > +static enum rte_acl_classify_alg rte_acl_default_classify = > RTE_ACL_CLASSIFY_SCALAR; Line is longer than 80 chars? > + > +void rte_acl_set_default_classify(enum rte_acl_classify_alg alg) > +{ > + rte_acl_default_classify = alg; > +} void rte_acls_set_default_classify(...) Though, I am not sure why do we need it to be public now. Users can setup ALG per context. > + > +void rte_acl_set_ctx_classify(struct rte_acl_ctx *ctx, enum > rte_acl_classify_alg alg) > +{ > + ctx->alg = alg; > +} Same as above: void rte_acl_set_ctx_classify(...) Plus probably add checking that alg is a valid argument: If ((uint32_t)alg < RTE_DIM(classify_fns)) {ctx->alg=alg; return 0;} return -EINVAL; > + > +static void __attribute__((constructor)) > +rte_acl_init(void) > +{ > + enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT; > + > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1)) > + alg = RTE_ACL_CLASSIFY_SSE; > + > + rte_acl_set_default_classify(alg); > +} > + > +int rte_acl_classify(const struct rte_acl_ctx *ctx, > + const uint8_t **data, > + uint32_t *results, uint32_t num, > + uint32_t categories) > +{ > + return classify_fns[ctx->alg](ctx, data, results, num, categories); > +} > + > +int rte_acl_classify_alg(const struct rte_acl_ctx *ctx, > + enum rte_acl_classify_alg alg, > + const uint8_t **data, > + uint32_t *results, uint32_t num, > + uint32_t categories) > +{ > + return classify_fns[alg](ctx, data, results, num, categories); > +} Can you move alg argument to be the last one? That would prevent copying parameters between registers. Plus same thing with the function definition style. > + > struct rte_acl_ctx * > rte_acl_find_existing(const char *name) > { > @@ -165,6 +226,7 @@ rte_acl_create(const struct rte_acl_param *param) > ctx->max_rules = param->max_rule_num; > ctx->rule_sz = param->rule_size; > ctx->socket_id = param->socket_id; > + ctx->alg = rte_acl_default_classify; > snprintf(ctx->name, sizeof(ctx->name), "%s", param->name); > > te->data = (void *) ctx; > diff --git a/lib/librte_acl/rte_acl.h b/lib/librte_acl/rte_acl.h > index afc0f69..c092a49 100644 > --- a/lib/librte_acl/rte_acl.h > +++ b/lib/librte_acl/rte_acl.h > @@ -259,39 +259,6 @@ void > rte_acl_reset(struct rte_acl_ctx *ctx); > > /** > - * Search for a matching ACL rule for each input data buffer. > - * Each input data buffer can have up to *categories* matches. > - * That implies that results array should be big enough to hold > - * (categories * num) elements. > - * Also categories parameter should be either one or multiple of > - * RTE_ACL_RESULTS_MULTIPLIER and can't be bigger than > RTE_ACL_MAX_CATEGORIES. > - * If more than one rule is applicable for given input buffer and > - * given category, then rule with highest priority will be returned as a > match. > - * Note, that it is a caller responsibility to ensure that input parameters > - * are valid and point to correct memory locations. > - * > - * @param ctx > - * ACL context to search with. > - * @param data > - * Array of pointers to input data buffers to perform search. > - * Note that all fields in input data buffers supposed to be in network > - * byte order (MSB). > - * @param results > - * Array of search results, *categories* results per each input data > buffer. > - * @param num > - * Number of elements in the input data buffers array. > - * @param categories > - * Number of maximum possible matches for each input buffer, one possible > - * match per category. > - * @return > - * zero on successful completion. > - * -EINVAL for incorrect arguments. > - */ > -int > -rte_acl_classify(const struct rte_acl_ctx *ctx, const uint8_t **data, > - uint32_t *results, uint32_t num, uint32_t categories); > - > -/** > * Perform scalar search for a matching ACL rule for each input data buffer. > * Note, that while the search itself will avoid explicit use of SSE/AVX > * intrinsics, code for comparing matching results/priorities sill might use > @@ -323,9 +290,36 @@ rte_acl_classify(const struct rte_acl_ctx *ctx, const > uint8_t **data, > * zero on successful completion. > * -EINVAL for incorrect arguments. > */ > -int > -rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data, > - uint32_t *results, uint32_t num, uint32_t categories); > + > +enum rte_acl_classify_alg { > + RTE_ACL_CLASSIFY_DEFAULT = 0, > + RTE_ACL_CLASSIFY_SCALAR = 1, > + RTE_ACL_CLASSIFY_SSE = 2, > +}; > + I think you removed the wrong comment. All public API function declaration supposed to be preceded by formal doxygen style comment: Brief explanation, parameters and return value description, etc. Please restore the proper comment for it. BTW, two new functions above - they need a formal comments too. > +extern int > +rte_acl_classify(const struct rte_acl_ctx *ctx, > + const uint8_t **data, > + uint32_t *results, uint32_t num, > + uint32_t categories); > + > +extern int > +rte_acl_classify_alg(const struct rte_acl_ctx *ctx, > + enum rte_acl_classify_alg alg, > + const uint8_t **data, > + uint32_t *results, uint32_t num, > + uint32_t categories); > +/* > + * Set the default classify algorithm for newly allocated classify contexts > + */ > +extern void > +rte_acl_set_default_classify(enum rte_acl_classify_alg alg); > + > +/* > + * Override the default classifier function for a given ctx > + */ > +extern void > +rte_acl_set_ctx_classify(struct rte_acl_ctx *ctx, enum rte_acl_classify_alg > alg); > > /** > * Dump an ACL context structure to the console. > -- > 1.9.3 Also, need to update examples/l3fwd-acl/ (remove rte_acl_classify_scalar() calls). Something like: diff --git a/examples/l3fwd-acl/main.c b/examples/l3fwd-acl/main.c index 9b2c21b..8cbf202 100644 --- a/examples/l3fwd-acl/main.c +++ b/examples/l3fwd-acl/main.c @@ -278,15 +278,6 @@ send_single_packet(struct rte_mbuf *m, uint8_t port); (in) = end + 1; \ } while (0) -#define CLASSIFY(context, data, res, num, cat) do { \ - if (scalar) \ - rte_acl_classify_scalar((context), (data), \ - (res), (num), (cat)); \ - else \ - rte_acl_classify((context), (data), \ - (res), (num), (cat)); \ -} while (0) - /* * ACL rules should have higher priorities than route ones to ensure ACL rule * always be found when input packets have multi-matches in the database. @@ -1253,6 +1244,9 @@ app_acl_init(void) dump_acl_config(); + if (parm_config.scalar) + rte_acl_set_default_classify(RTE_ACL_CLASSIFY_SCALAR); + /* Load rules from the input file */ if (add_rules(parm_config.rule_ipv4_name, &route_base_ipv4, &route_num_ipv4, &acl_base_ipv4, &acl_num_ipv4, @@ -1436,10 +1430,8 @@ main_loop(__attribute__((unused)) void *dummy) int socketid; const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; - int scalar = parm_config.scalar; prev_tsc = 0; - lcore_id = rte_lcore_id(); qconf = &lcore_conf[lcore_id]; socketid = rte_lcore_to_socket_id(lcore_id); @@ -1503,7 +1495,8 @@ main_loop(__attribute__((unused)) void *dummy) nb_rx); if (acl_search.num_ipv4) { - CLASSIFY(acl_config.acx_ipv4[socketid], + rte_acl_classify( + acl_config.acx_ipv4[socketid], acl_search.data_ipv4, acl_search.res_ipv4, acl_search.num_ipv4, @@ -1515,7 +1508,8 @@ main_loop(__attribute__((unused)) void *dummy) } if (acl_search.num_ipv6) { - CLASSIFY(acl_config.acx_ipv6[socketid], + rte_acl_classify( + acl_config.acx_ipv6[socketid], acl_search.data_ipv6, acl_search.res_ipv6, acl_search.num_ipv6,