The ACL library makes extensive use of SSE4.1 and SSE4.2 intrinsics, which
means the default build can't compile this library.  Work around the problem
by testing the __SSE4_1__ macro in the acl_vect.h file and defining the macros
there either as intrinsics or as C-level equivalents.  Note this is a minimal
patch, adjusting only the definitions that are currently used in the ACL
library.

This is only compile-tested so far, but I wanted to post it for early review
so that others could help with unit testing.

Signed-off-by: Neil Horman <nhorman at tuxdriver.com>
CC: Thomas Monjalon <thomas.monjalon at 6wind.com>
CC: "Konstantin Ananyev" <konstantin.ananyev at intel.com>
CC: Bruce Richardson <bruce.richardson at intel.com>
---
 lib/librte_acl/acl_bld.c  |   3 +-
 lib/librte_acl/acl_vect.h |  77 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
index 873447b..de974a4 100644
--- a/lib/librte_acl/acl_bld.c
+++ b/lib/librte_acl/acl_bld.c
@@ -31,7 +31,6 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

-#include <nmmintrin.h>
 #include <rte_acl.h>
 #include "tb_mem.h"
 #include "acl.h"
@@ -1481,7 +1480,7 @@ acl_calc_wildness(struct rte_acl_build_rule *head,
                        switch (rule->config->defs[n].type) {
                        case RTE_ACL_FIELD_TYPE_BITMASK:
                                wild = (size -
-                                       _mm_popcnt_u32(fld->mask_range.u8)) /
+                                       __builtin_popcountl(fld->mask_range.u8)) /
                                        size;
                                break;

diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h
index d813600..e5f391b 100644
--- a/lib/librte_acl/acl_vect.h
+++ b/lib/librte_acl/acl_vect.h
@@ -34,6 +34,10 @@
 #ifndef _RTE_ACL_VECT_H_
 #define _RTE_ACL_VECT_H_

+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+
 /**
  * @file
  *
@@ -44,12 +48,12 @@
 extern "C" {
 #endif

+
 #define        MM_ADD16(a, b)          _mm_add_epi16(a, b)
 #define        MM_ADD32(a, b)          _mm_add_epi32(a, b)
 #define        MM_ALIGNR8(a, b, c)     _mm_alignr_epi8(a, b, c)
 #define        MM_AND(a, b)            _mm_and_si128(a, b)
 #define MM_ANDNOT(a, b)                _mm_andnot_si128(a, b)
-#define MM_BLENDV8(a, b, c)    _mm_blendv_epi8(a, b, c)
 #define MM_CMPEQ16(a, b)       _mm_cmpeq_epi16(a, b)
 #define MM_CMPEQ32(a, b)       _mm_cmpeq_epi32(a, b)
 #define        MM_CMPEQ8(a, b)         _mm_cmpeq_epi8(a, b)
@@ -59,7 +63,6 @@ extern "C" {
 #define        MM_CVT32(a)             _mm_cvtsi128_si32(a)
 #define MM_CVTU32(a)           _mm_cvtsi32_si128(a)
 #define        MM_INSERT16(a, c, b)    _mm_insert_epi16(a, c, b)
-#define        MM_INSERT32(a, c, b)    _mm_insert_epi32(a, c, b)
 #define        MM_LOAD(a)              _mm_load_si128(a)
 #define        MM_LOADH_PI(a, b)       _mm_loadh_pi(a, b)
 #define        MM_LOADU(a)             _mm_loadu_si128(a)
@@ -82,7 +85,6 @@ extern "C" {
 #define        MM_SRL32(a, b)          _mm_srli_epi32(a, b)
 #define        MM_STORE(a, b)          _mm_store_si128(a, b)
 #define        MM_STOREU(a, b)         _mm_storeu_si128(a, b)
-#define        MM_TESTZ(a, b)          _mm_testz_si128(a, b)
 #define        MM_XOR(a, b)            _mm_xor_si128(a, b)

 #define        MM_SET16(a, b, c, d, e, f, g, h)        \
@@ -93,6 +95,75 @@
        _mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7,    \
                c8, c9, cA, cB, cC, cD, cE, cF)

+
+#ifndef __SSE4_1__
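+/*
+ * Plain C fallbacks for the SSE4.1 intrinsics used below, each one
+ * intended to be semantically equivalent to the intrinsic it replaces.
+ */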
+static inline xmm_t pblendvb(xmm_t dst, xmm_t src, xmm_t mask)
+{
+       unsigned char tmpd[16], tmps[16], tmpm[16];
+       int i;
+
+       MM_STOREU((xmm_t *)tmpd, dst);
+       MM_STOREU((xmm_t *)tmps, src);
+       MM_STOREU((xmm_t *)tmpm, mask);
+
+       /*
+        * _mm_blendv_epi8 copies a src byte whenever the most
+        * significant bit of the corresponding mask byte is set.
+        */
+       for (i = 0; i < 16; i++)
+               if (tmpm[i] & 0x80)
+                       tmpd[i] = tmps[i];
+
+       return MM_LOADU((xmm_t *)tmpd);
+}
+
+#define MM_BLENDV8(a, b, c)    pblendvb(a, b, c)
+
+static inline int ptestz(xmm_t a, xmm_t b)
+{
+       unsigned long long tmpa[2], tmpb[2];
+
+       MM_STOREU((xmm_t *)tmpa, a);
+       MM_STOREU((xmm_t *)tmpb, b);
+
+       /*
+        * _mm_testz_si128 returns 1 only when (a AND b) is all zeros,
+        * so any overlapping bit makes the result 0.
+        */
+       if ((tmpa[0] & tmpb[0]) || (tmpa[1] & tmpb[1]))
+               return 0;
+
+       return 1;
+}
+
+#define        MM_TESTZ(a, b)          ptestz(a, b)
+
+static inline xmm_t pinsrd(xmm_t dst, int32_t val, int idx)
+{
+       int32_t tmpa[4];
+
+       /*
+        * _mm_insert_epi32 overwrites the dword selected by the
+        * immediate (0-3), so spill the register to memory, patch
+        * the selected dword and reload.
+        */
+       MM_STOREU((xmm_t *)tmpa, dst);
+       tmpa[idx & 0x3] = val;
+       return MM_LOADU((xmm_t *)tmpa);
+}
+
+#define        MM_INSERT32(a, c, b)    pinsrd(a, c, b)
+
+#else
+#define        MM_BLENDV8(a, b, c)     _mm_blendv_epi8(a, b, c)
+#define        MM_TESTZ(a, b)          _mm_testz_si128(a, b)
+#define        MM_INSERT32(a, c, b)    _mm_insert_epi32(a, c, b)
+#endif
+
 #ifdef RTE_ARCH_X86_64

 #define        MM_CVT64(a)             _mm_cvtsi128_si64(a)
-- 
1.8.3.1
