clang version 14.0.5

On 12/5/24 2:34 PM, Wathsala Wathawana Vithanage wrote:
> What version of CLANG are you using?
>
>> -----Original Message-----
>> From: Roger Melton (rmelton) <rmel...@cisco.com>
>> Sent: Wednesday, December 4, 2024 11:24 AM
>> To: Ruifeng Wang <ruifeng.w...@arm.com>; dev@dpdk.org
>> Cc: Wathsala Wathawana Vithanage <wathsala.vithan...@arm.com>; nd
>> <n...@arm.com>
>> Subject: Re: lib/eal/arm/include/rte_vect.h fails to compile with clang14 for
>> 32bit ARM
>>
>> Considering this problem further, I don't see a way to avoid the CLANG
>> compiler error with a function implementation.  We would need a macro
>> implementation similar to CLANGS arm_neon.h.  In addition, it may be
>> necessary to provide separate implementations for CLANG and non-CLANG
>> compilers since the builtins between the toolchains are different.  One way 
>> to
>> address this would be keep the existing function implementation, and add a
>> new macro implementation for CLANG.
>>
>> For example, something like:
>>
>>
>>
>>      #if !defined(RTE_CC_CLANG)
>>      #if (defined(RTE_ARCH_ARM) && defined(RTE_ARCH_32)) || \
>>      (defined(RTE_ARCH_ARM64) && RTE_CC_IS_GNU && (GCC_VERSION
>> < 70000))
>>      /* NEON intrinsic vcopyq_laneq_u32() is not supported in ARMv7-
>> A(AArch32)
>>       * On AArch64, this intrinsic is supported since GCC version 7.
>>       */
>>      static inline uint32x4_t
>>      vcopyq_laneq_u32(uint32x4_t a, const int lane_a,
>>               uint32x4_t b, const int lane_b)
>>      {
>>          return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a);
>>      }
>>      #endif
>>      #else
>>      #if defined(RTE_ARCH_ARM) && defined(RTE_ARCH_32)
>>      /* NEON intrinsic vcopyq_laneq_u32() is not supported in ARMv7-
>> A(AArch32)
>>       * On AArch64, this intrinsic is supported
>>       */
>>      #ifdef LITTLE_ENDIAN
>>      #define vcopyq_laneq_u32(__arg1, __arg2, __arg3, __arg4)
>> __extension__ ({ \
>>        uint32x4_t __ret; \
>>        uint32x4_t __lcl_arg1 = __arg1; \
>>        uint32x4_t __lcl_arg3 = __arg3; \
>>        __ret = vsetq_lane_u32(vgetq_lane_u32(__lcl_arg3, __arg4),
>> __lcl_arg1, __arg2); \
>>        __ret; \
>>      })
>>      #else
>>      #define __noswap_vsetq_lane_u32(__arg1, __arg2, __arg3)
>> __extension__ ({ \
>>        uint32x4_t __ret; \
>>        uint32_t __lcl_arg1 = __arg1; \
>>        uint32x4_t __lcl_arg2 = __arg2; \
>>        __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__lcl_arg1,
>> (int32x4_t)__lcl_arg2, __arg3); \
>>        __ret; \
>>      })
>>      #define __noswap_vgetq_lane_u32(__arg1, __arg2) __extension__ ({
>> \
>>        uint32_t __ret; \
>>        uint32x4_t __lcl_arg1 = __arg1; \
>>        __ret = (uint32_t)
>> __builtin_neon_vgetq_lane_i32((int32x4_t)__lcl_arg1, __arg2); \
>>        __ret; \
>>      })
>>      #define vcopyq_laneq_u32(__arg1, __arg2, __arg3, __arg4)
>> __extension__ ({ \
>>        uint32x4_t __ret; \
>>        uint32x4_t __lcl_arg1 = __arg1; \
>>        uint32x4_t __lcl_arg3 = __arg3; \
>>        uint32x4_t __rev1; \
>>        uint32x4_t __rev3; \
>>        __rev1 = __builtin_shufflevector(__lcl_arg1, __lcl_arg1, 3, 2, 1, 0); 
>> \
>>        __rev3 = __builtin_shufflevector(__lcl_arg3, __lcl_arg3, 3, 2, 1, 0); 
>> \
>>        __ret =
>> __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev3, __arg4),
>> __rev1, __arg2); \
>>        __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
>>        __ret; \
>>      })
>>      #endif
>>      #endif
>>      #endif
>>
>>
>>
>> NOTE1:  I saw no reason the CLANG arm_neon.h AARCH64 macros would not
>> work for AARCH32, so the macros in this sample implementation are copies
>> CLANG originals modified for (my) readability.  I'm not an attorney, but if 
>> used,
>> it may be necessary to include the banner from the CLANG arm_neon.h.
>>
>> NOTE2: While I can build the CLANG ARM implementation, I lack the hardware
>> to test it.
>>
>>
>> Regards,
>> Roger
>>
>> On 12/3/24 7:37 PM, Roger Melton (rmelton) wrote:
>>
>>
>>      After looking at this a bit closer today, I realize that my assertion 
>> that
>> CLANG14 does support vcopyq_laneq_u32() for 32bit ARM was incorrect.  It
>> does not.  The reason that disabling the implementation in rte_vect.h works
>> for our clang builds is that we do not build the l3fwd app nor the ixgbe PMD
>> for our application, and they are the only libraries that reference that 
>> function.
>>
>>      The clang compile errors appear to be related to how clang handles
>> compile time constants, but I'm am again unsure how to resolve them in a way
>> that would work for both GNU and clang.
>>
>>      Any suggestions?
>>
>>
>>      Regards,
>>      Roger
>>
>>
>>      On 12/2/24 8:26 PM, Ruifeng Wang wrote:
>>
>>
>>              +Arm folks.
>>
>>
>>
>>              From: Roger Melton (rmelton) <rmel...@cisco.com>
>> <mailto:rmel...@cisco.com>
>>              Date: Tuesday, December 3, 2024 at 3:39 AM
>>              To: dev@dpdk.org <mailto:dev@dpdk.org>  <dev@dpdk.org>
>> <mailto:dev@dpdk.org> , Ruifeng Wang <ruifeng.w...@arm.com>
>> <mailto:ruifeng.w...@arm.com>
>>              Subject: lib/eal/arm/include/rte_vect.h fails to compile with
>> clang14 for 32bit ARM
>>
>>              Hey folks,
>>
>>              We are building DPDK with clang14 for a 32bit armv8-a based
>> CPU and ran into a compile error with the following from
>> lib/eal/arm/include/rte_vect.h:
>>
>>
>>
>>
>>
>>                      #if (defined(RTE_ARCH_ARM) &&
>> defined(RTE_ARCH_32)) || \
>>                      (defined(RTE_ARCH_ARM64) && RTE_CC_IS_GNU
>> <https://elixir.bootlin.com/dpdk/v24.11/C/ident/RTE_CC_IS_GNU>  &&
>> (GCC_VERSION
>> <https://elixir.bootlin.com/dpdk/v24.11/C/ident/GCC_VERSION>  < 70000))
>>                      /* NEON intrinsic vcopyq_laneq_u32() is not
>> supported in ARMv7-A(AArch32)
>>                       * On AArch64, this intrinsic is supported since GCC
>> version 7.
>>                       */
>>                      static inline uint32x4_t
>>                      vcopyq_laneq_u32
>> <https://elixir.bootlin.com/dpdk/v24.11/C/ident/vcopyq_laneq_u32>
>> (uint32x4_t a, const int lane_a,
>>                                uint32x4_t b, const int lane_b)
>>                      {
>>                        return vsetq_lane_u32(vgetq_lane_u32(b, lane_b),
>> a, lane_a);
>>                      }
>>                      #endif
>>
>>
>>              clang14 compile fails as follows:
>>
>>
>>
>>                      In file included from ../../../../../../cisco-dpdk-
>> upstream-arm-clang-fixes.git/lib/eal/common/eal_common_options.c:36:
>>                      ../../../../../../cisco-dpdk-upstream-arm-clang-
>> fixes.git/lib/eal/arm/include/rte_vect.h:80:24: error: argument to
>> '__builtin_neon_vgetq_lane_i32' must be a constant integer
>>                      return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a,
>> lane_a);
>>                      ^ ~~~~~~
>>                      /auto/binos-tools/llvm14/llvm-14.0-
>> p24/lib/clang/14.0.5/include/arm_neon.h:7697:22: note: expanded from
>> macro 'vgetq_lane_u32'
>>                      __ret = (uint32_t)
>> __builtin_neon_vgetq_lane_i32((int32x4_t)__s0, __p1); \
>>                      ^ ~~~~
>>                      /auto/binos-tools/llvm14/llvm-14.0-
>> p24/lib/clang/14.0.5/include/arm_neon.h:24148:19: note: expanded from
>> macro 'vsetq_lane_u32'
>>                      uint32_t __s0 = __p0; \
>>                      ^~~~
>>                      In file included from ../../../../../../cisco-dpdk-
>> upstream-arm-clang-fixes.git/lib/eal/common/eal_common_options.c:36:
>>                      ../../../../../../cisco-dpdk-upstream-arm-clang-
>> fixes.git/lib/eal/arm/include/rte_vect.h:80:9: error: argument to
>> '__builtin_neon_vsetq_lane_i32' must be a constant integer
>>                      return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a,
>> lane_a);
>>                      ^ ~~~~~~
>>                      /auto/binos-tools/llvm14/llvm-14.0-
>> p24/lib/clang/14.0.5/include/arm_neon.h:24150:24: note: expanded from
>> macro 'vsetq_lane_u32'
>>                      __ret = (uint32x4_t)
>> __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__s1, __p2); \
>>                      ^ ~~~~
>>                      2 errors generated.
>>
>>
>>
>>              clang14 does appear to support the vcopyq_laneq_u32()
>> intrinsic, s0 we want to skip the conditional implementation.
>>
>>              Two approaches I have tested to resolve the error are:
>>
>>              1) skip if building with clang:
>>
>>
>>                      #if !defined(__clang__) &&
>> ((defined(RTE_ARCH_ARM) && defined(RTE_ARCH_32)) || \
>>                      72 (defined(RTE_ARCH_ARM64) && RTE_CC_IS_GNU
>> && (GCC_VERSION < 70000)))
>>
>>
>>
>>
>>              2) skip if not building for ARMv7:
>>
>>
>>
>>
>>                      #if (defined(RTE_ARCH_ARMv7) &&
>> defined(RTE_ARCH_32)) || \
>>                      (defined(RTE_ARCH_ARM64) && RTE_CC_IS_GNU &&
>> (GCC_VERSION < 70000))
>>
>>
>>
>>              Both address our immediate problem, but may not be a
>> appropriate for all cases.
>>
>>              Can anyone suggest the proper way to address this?  I'll be
>> submitting an patch once I have a solution that is acceptable to the
>> community.
>>
>>              Regards,
>>              Roger
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>

Reply via email to