Module Name:    src
Committed By:   rin
Date:           Mon Aug 7 01:07:36 UTC 2023
Modified Files:
	src/sys/crypto/aes/arch/arm: aes_neon_impl.h
	src/sys/crypto/aes/arch/x86: aes_sse2_impl.h aes_ssse3_impl.h
	src/sys/crypto/chacha/arch/arm: chacha_neon.c
	src/sys/crypto/chacha/arch/x86: chacha_sse2.c
Added Files:
	src/sys/crypto/arch/arm: arm_neon.h arm_neon_imm.h
	src/sys/crypto/arch/x86: immintrin.h immintrin_ext.h
Removed Files:
	src/sys/crypto/aes/arch/arm: arm_neon.h arm_neon_imm.h
	src/sys/crypto/aes/arch/x86: immintrin.h immintrin_ext.h
	src/sys/crypto/chacha/arch/arm: arm_neon.h arm_neon_imm.h
	src/sys/crypto/chacha/arch/x86: immintrin.h

Log Message:
sys/crypto: Introduce arch/{arm,x86} to share common MD headers

Dedup between aes and chacha.  No binary changes.


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
cvs rdiff -u -r1.12 -r0 src/sys/crypto/aes/arch/arm/arm_neon.h
cvs rdiff -u -r1.2 -r0 src/sys/crypto/aes/arch/arm/arm_neon_imm.h
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/x86/aes_sse2_impl.h
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h
cvs rdiff -u -r1.5 -r0 src/sys/crypto/aes/arch/x86/immintrin.h
cvs rdiff -u -r1.1 -r0 src/sys/crypto/aes/arch/x86/immintrin_ext.h
cvs rdiff -u -r0 -r1.1 src/sys/crypto/arch/arm/arm_neon.h \
    src/sys/crypto/arch/arm/arm_neon_imm.h
cvs rdiff -u -r0 -r1.1 src/sys/crypto/arch/x86/immintrin.h \
    src/sys/crypto/arch/x86/immintrin_ext.h
cvs rdiff -u -r1.7 -r0 src/sys/crypto/chacha/arch/arm/arm_neon.h
cvs rdiff -u -r1.2 -r0 src/sys/crypto/chacha/arch/arm/arm_neon_imm.h
cvs rdiff -u -r1.8 -r1.9 src/sys/crypto/chacha/arch/arm/chacha_neon.c
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/chacha/arch/x86/chacha_sse2.c
cvs rdiff -u -r1.1 -r0 src/sys/crypto/chacha/arch/x86/immintrin.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
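For context, the dedup amounts to the aes and chacha consumers switching from
file-relative "quoted" includes of per-directory header copies to the new
shared <crypto/arch/{arm,x86}/...> paths, as the diffs below show.  A minimal
before/after sketch (illustrative only, not part of the committed diff):

    /* Before: each of aes and chacha carried its own copy of the MD headers. */
    #include "arm_neon.h"
    #include "arm_neon_imm.h"

    /* After: one shared copy under sys/crypto/arch/, included by full path. */
    #include <crypto/arch/arm/arm_neon.h>
    #include <crypto/arch/arm/arm_neon_imm.h>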
Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_impl.h diff -u src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.3 src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.4 --- src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.3 Sat Aug 8 14:47:01 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h Mon Aug 7 01:07:35 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_impl.h,v 1.3 2020/08/08 14:47:01 riastradh Exp $ */ +/* $NetBSD: aes_neon_impl.h,v 1.4 2023/08/07 01:07:35 rin Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -31,8 +31,8 @@ #include <sys/types.h> -#include "arm_neon.h" -#include "arm_neon_imm.h" +#include <crypto/arch/arm/arm_neon.h> +#include <crypto/arch/arm/arm_neon_imm.h> #include <crypto/aes/aes.h> #include <crypto/aes/arch/arm/aes_neon.h> Index: src/sys/crypto/aes/arch/x86/aes_sse2_impl.h diff -u src/sys/crypto/aes/arch/x86/aes_sse2_impl.h:1.2 src/sys/crypto/aes/arch/x86/aes_sse2_impl.h:1.3 --- src/sys/crypto/aes/arch/x86/aes_sse2_impl.h:1.2 Mon Jun 29 23:50:05 2020 +++ src/sys/crypto/aes/arch/x86/aes_sse2_impl.h Mon Aug 7 01:07:36 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_sse2_impl.h,v 1.2 2020/06/29 23:50:05 riastradh Exp $ */ +/* $NetBSD: aes_sse2_impl.h,v 1.3 2023/08/07 01:07:36 rin Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -33,8 +33,8 @@ #include <crypto/aes/aes.h> #include <crypto/aes/arch/x86/aes_sse2.h> -#include <crypto/aes/arch/x86/immintrin.h> -#include <crypto/aes/arch/x86/immintrin_ext.h> +#include <crypto/arch/x86/immintrin.h> +#include <crypto/arch/x86/immintrin_ext.h> void aes_sse2_bitslice_Sbox(__m128i[static 4]); void aes_sse2_bitslice_invSbox(__m128i[static 4]); Index: src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h diff -u src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h:1.1 src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h:1.2 --- src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h:1.1 Mon Jun 29 23:51:35 2020 +++ src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h Mon Aug 7 01:07:36 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_ssse3_impl.h,v 1.1 2020/06/29 23:51:35 riastradh Exp $ */ +/* $NetBSD: aes_ssse3_impl.h,v 1.2 2023/08/07 01:07:36 rin Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -33,8 +33,8 @@ #include <crypto/aes/aes.h> #include <crypto/aes/arch/x86/aes_ssse3.h> -#include <crypto/aes/arch/x86/immintrin.h> -#include <crypto/aes/arch/x86/immintrin_ext.h> +#include <crypto/arch/x86/immintrin.h> +#include <crypto/arch/x86/immintrin_ext.h> __m128i aes_ssse3_enc1(const struct aesenc *, __m128i, unsigned); __m128i aes_ssse3_dec1(const struct aesdec *, __m128i, unsigned); Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.8 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.9 --- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.8 Sat Aug 8 14:47:01 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon.c Mon Aug 7 01:07:36 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon.c,v 1.8 2020/08/08 14:47:01 riastradh Exp $ */ +/* $NetBSD: chacha_neon.c,v 1.9 2023/08/07 01:07:36 rin Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -29,8 +29,8 @@ #include <sys/types.h> #include <sys/endian.h> -#include "arm_neon.h" -#include "arm_neon_imm.h" +#include <crypto/arch/arm/arm_neon.h> +#include <crypto/arch/arm/arm_neon_imm.h> #include "chacha_neon.h" /* Index: src/sys/crypto/chacha/arch/x86/chacha_sse2.c diff -u src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.2 src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.3 --- src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.2 Mon Jul 27 20:48:18 2020 +++ src/sys/crypto/chacha/arch/x86/chacha_sse2.c Mon Aug 7 01:07:36 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_sse2.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $ */ +/* $NetBSD: chacha_sse2.c,v 1.3 2023/08/07 01:07:36 rin Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -29,7 +29,7 @@ #include <sys/types.h> #include <sys/endian.h> -#include "immintrin.h" +#include <crypto/arch/x86/immintrin.h> #include "chacha_sse2.h" Added files: Index: src/sys/crypto/arch/arm/arm_neon.h diff -u /dev/null src/sys/crypto/arch/arm/arm_neon.h:1.1 --- /dev/null Mon Aug 7 01:07:36 2023 +++ src/sys/crypto/arch/arm/arm_neon.h Mon Aug 7 01:07:36 2023 @@ -0,0 +1,717 @@ +/* $NetBSD: arm_neon.h,v 1.1 2023/08/07 01:07:36 rin Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H +#define _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H + +#if defined(__GNUC__) && !defined(__clang__) + +#define _INTRINSATTR \ + __extension__ \ + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + +#ifdef __aarch64__ +typedef __Int32x4_t int32x4_t; +typedef __Int64x2_t int64x2_t; +typedef __Int8x16_t int8x16_t; +typedef __Uint16x8_t uint16x8_t; +typedef __Uint32x4_t uint32x4_t; +typedef __Uint64x2_t uint64x2_t; +typedef __Uint8x16_t uint8x16_t; +typedef struct { uint8x16_t val[2]; } uint8x16x2_t; +#else +typedef __simd128_int32_t int32x4_t; +typedef __simd128_int64_t int64x2_t; +typedef __simd128_int8_t int8x16_t; +typedef __simd128_uint16_t uint16x8_t; +typedef __simd128_uint32_t uint32x4_t; +typedef __simd128_uint64_t uint64x2_t; +typedef __simd128_uint8_t uint8x16_t; + +typedef __simd64_int8_t int8x8_t; +typedef __simd64_uint8_t uint8x8_t; +typedef __builtin_neon_udi uint64x1_t; +typedef struct { uint8x8_t val[2]; } uint8x8x2_t; +typedef struct { uint8x16_t val[2]; } uint8x16x2_t; +#endif + +#if defined(__AARCH64EB__) +#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - (__i)) +#define __neon_laneq_index(__v, __i) (__arraycount(__v) - 1 - (__i)) +#elif defined(__ARM_BIG_ENDIAN) +#define __neon_lane_index(__v, __i) ((__i) ^ (__arraycount(__v) - 1)) +#define __neon_laneq_index(__v, __i) ((__i) ^ (__arraycount(__v)/2 - 1)) +#else +#define __neon_lane_index(__v, __i) (__i) +#define __neon_laneq_index(__v, __i) (__i) +#endif + +#elif defined(__clang__) + +#define _INTRINSATTR \ + __attribute__((__always_inline__, __nodebug__)) + +typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; +typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; +typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; + +typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; +typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; +typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; +typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; + +typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t; + +typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; + +typedef struct { uint8x8_t val[2]; } uint8x8x2_t; +typedef struct { uint8x16_t val[2]; } uint8x16x2_t; + +#ifdef __LITTLE_ENDIAN__ +#define __neon_lane_index(__v, __i) __i +#define __neon_laneq_index(__v, __i) __i +#else +#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#define __neon_laneq_index(__v, __i) (__arraycount(__v) - 1 - __i) +#endif + +#else + +#error Teach me how to neon in your compile! 
+ +#endif + +_INTRINSATTR +static __inline uint32x4_t +vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) +{ + return __v0 + __v1; +} + +_INTRINSATTR +static __inline uint32x4_t +vcltq_s32(int32x4_t __v0, int32x4_t __v1) +{ + return (uint32x4_t)(__v0 < __v1); +} + +_INTRINSATTR +static __inline int32x4_t +vdupq_n_s32(int32_t __x) +{ + return (int32x4_t) { __x, __x, __x, __x }; +} + +_INTRINSATTR +static __inline uint32x4_t +vdupq_n_u32(uint32_t __x) +{ + return (uint32x4_t) { __x, __x, __x, __x }; +} + +_INTRINSATTR +static __inline uint8x16_t +vdupq_n_u8(uint8_t __x) +{ + return (uint8x16_t) { + __x, __x, __x, __x, __x, __x, __x, __x, + __x, __x, __x, __x, __x, __x, __x, __x, + }; +} + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) +{ +#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) + return __builtin_shuffle(__hi, __lo, + (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); +#else + return __builtin_shuffle(__lo, __hi, + (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); +#endif +} +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define vextq_u32(__lo, __hi, __i) \ + (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ + (int8x16_t)(__hi), (__i), 50) +#else +#define vextq_u32(__lo, __hi, __i) ( \ +{ \ + uint32x4_t __tlo = (__lo); \ + uint32x4_t __thi = (__hi); \ + uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ + uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ + uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ + (int8x16_t)__hi_r, __i, 50); \ + __builtin_shufflevector(__r, __r, 3,2,1,0); \ +}) +#endif /* __LITTLE_ENDIAN__ */ +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint8x16_t +vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) +{ +#ifdef __aarch64__ +#if defined(__AARCH64EB__) + return __builtin_shuffle(__hi, __lo, + (uint8x16_t) { + 16 - __i, 17 - __i, 18 - __i, 19 - __i, + 20 - __i, 21 - __i, 22 - __i, 23 - __i, + 24 - __i, 25 - __i, 26 - __i, 27 - __i, + 28 - __i, 29 - __i, 30 - __i, 31 - __i, + }); +#else + return __builtin_shuffle(__lo, __hi, + (uint8x16_t) { + __i + 0, __i + 1, __i + 2, __i + 3, + __i + 4, __i + 5, __i + 6, __i + 7, + __i + 8, __i + 9, __i + 10, __i + 11, + __i + 12, __i + 13, __i + 14, __i + 15, + }); +#endif +#else + return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo, + (int8x16_t)__hi, __i); +#endif +} +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define vextq_u8(__lo, __hi, __i) \ + (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ + (int8x16_t)(__hi), (__i), 48) +#else +#define vextq_u8(__lo, __hi, __i) ( \ +{ \ + uint8x16_t __tlo = (__lo); \ + uint8x16_t __thi = (__hi); \ + uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ + uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ + uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ + (int8x16_t)__hi_r, (__i), 48); \ + __builtin_shufflevector(__r, __r, \ + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ +}) +#endif /* __LITTLE_ENDIAN */ +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32_t +vgetq_lane_u32(uint32x4_t __v, uint8_t __i) +{ +#ifdef __aarch64__ + return __v[__neon_laneq_index(__v, __i)]; +#else + return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); +#endif +} +#elif defined(__clang__) +#define 
vgetq_lane_u32(__v, __i) \ + (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ + __neon_laneq_index(__v, __i)) +#endif + +_INTRINSATTR +static __inline uint32x4_t +vld1q_u32(const uint32_t *__p32) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + const __builtin_aarch64_simd_si *__p = + (const __builtin_aarch64_simd_si *)__p32; + + return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); +#else + const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; + + return (uint32x4_t)__builtin_neon_vld1v4si(__p); +#endif +#elif defined(__clang__) + uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, 3,2,1,0); +#endif + return __v; +#endif +} + +_INTRINSATTR +static __inline uint8x16_t +vld1q_u8(const uint8_t *__p8) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + const __builtin_aarch64_simd_qi *__p = + (const __builtin_aarch64_simd_qi *)__p8; + + return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); +#else + const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; + + return (uint8x16_t)__builtin_neon_vld1v16qi(__p); +#endif +#elif defined(__clang__) + uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); +#endif + return __v; +#endif +} + +_INTRINSATTR +static __inline uint8x16_t +vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + uint8x16_t __res; + __asm__("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(__res) : "w"(__tab), "w"(__idx)); + return __res; +#else + /* + * No native ARMv7 NEON instruction for this, so do it via two + * half-width TBLs instead (vtbl2_u8 equivalent). 
+ */ + uint64x2_t __tab64 = (uint64x2_t)__tab; + uint8x8_t __tablo = (uint8x8_t)__tab64[0]; + uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; + uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; + union { + uint8x8x2_t __u8x8x2; + __builtin_neon_ti __ti; + } __u = { __tab8x8x2 }; + uint64x2_t __idx64, __out64; + int8x8_t __idxlo, __idxhi, __outlo, __outhi; + + __idx64 = (uint64x2_t)__idx; + __idxlo = (int8x8_t)__idx64[0]; + __idxhi = (int8x8_t)__idx64[1]; + __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); + __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); + __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; + + return (uint8x16_t)__out64; +#endif +#elif defined(__clang__) +#ifndef __LITTLE_ENDIAN__ + __tab = __builtin_shufflevector(__tab, __tab, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + __idx = __builtin_shufflevector(__idx, __idx, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); +#endif + uint8x16_t __r; +#ifdef __aarch64__ + __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, (int8x16_t)__idx, 48); +#else + uint64x2_t __tab64 = (uint64x2_t)__tab; + uint8x8_t __tablo = (uint8x8_t)__tab64[0]; + uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; + uint64x2_t __idx64, __out64; + int8x8_t __idxlo, __idxhi, __outlo, __outhi; + + __idx64 = (uint64x2_t)__idx; + __idxlo = (int8x8_t)__idx64[0]; + __idxhi = (int8x8_t)__idx64[1]; + __outlo = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo, + (int8x8_t)__tabhi, (int8x8_t)__idxlo, 16); + __outhi = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo, + (int8x8_t)__tabhi, (int8x8_t)__idxhi, 16); + __out64 = (uint64x2_t) { (uint64_t)__outlo, (uint64_t)__outhi }; + __r = (uint8x16_t)__out64; +#endif +#ifndef __LITTLE_ENDIAN__ + __r = __builtin_shufflevector(__r, __r, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); +#endif + return __r; +#endif +} + +_INTRINSATTR +static __inline int32x4_t +vreinterpretq_s32_u8(uint8x16_t __v) +{ + return (int32x4_t)__v; +} + +_INTRINSATTR +static __inline uint16x8_t +vreinterpretq_u16_u32(uint32x4_t __v) +{ + return (uint16x8_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vreinterpretq_u32_u16(uint16x8_t __v) +{ + return (uint32x4_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vreinterpretq_u32_u64(uint64x2_t __v) +{ + return (uint32x4_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vreinterpretq_u32_u8(uint8x16_t __v) +{ + return (uint32x4_t)__v; +} + +_INTRINSATTR +static __inline uint64x2_t +vreinterpretq_u64_u32(uint32x4_t __v) +{ + return (uint64x2_t)__v; +} + +_INTRINSATTR +static __inline uint64x2_t +vreinterpretq_u64_u8(uint8x16_t __v) +{ + return (uint64x2_t)__v; +} + +_INTRINSATTR +static __inline uint8x16_t +vreinterpretq_u8_s32(int32x4_t __v) +{ + return (uint8x16_t)__v; +} + +_INTRINSATTR +static __inline uint8x16_t +vreinterpretq_u8_u32(uint32x4_t __v) +{ + return (uint8x16_t)__v; +} + +_INTRINSATTR +static __inline uint8x16_t +vreinterpretq_u8_u64(uint64x2_t __v) +{ + return (uint8x16_t)__v; +} + +_INTRINSATTR +static __inline uint16x8_t +vrev32q_u16(uint16x8_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) + return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 }); +#elif defined(__clang__) + return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6); +#endif +} + +_INTRINSATTR +static __inline uint8x16_t +vrev32q_u8(uint8x16_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) + return __builtin_shuffle(__v, + (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); +#elif defined(__clang__) + return __builtin_shufflevector(__v, 
__v, + 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); +#endif +} + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) +{ + __v[__neon_laneq_index(__v, __i)] = __x; + return __v; +} +#elif defined(__clang__) +#define vsetq_lane_u32(__x, __v, __i) \ + (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ + __neon_laneq_index(__v, __i)) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint64x2_t +vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) +{ + __v[__neon_laneq_index(__v, __i)] = __x; + return __v; +} +#elif defined(__clang__) +#define vsetq_lane_u64(__x, __v, __i) \ + (uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \ + __neon_laneq_index(__v, __i)); +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline int32x4_t +vshlq_n_s32(int32x4_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (int32x4_t)__builtin_aarch64_ashlv4si(__v, __bits); +#else + return (int32x4_t)__builtin_neon_vshl_nv4si(__v, __bits); +#endif +} +#elif defined(__clang__) +#define vshlq_n_s32(__v, __bits) \ + (int32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 34) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vshlq_n_u32(uint32x4_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); +#else + return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); +#endif +} +#elif defined(__clang__) +#define vshlq_n_u32(__v, __bits) \ + (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vshrq_n_u32(uint32x4_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); +#else + return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); +#endif +} +#elif defined(__clang__) +#define vshrq_n_u32(__v, __bits) \ + (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint8x16_t +vshrq_n_u8(uint8x16_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); +#else + return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); +#endif +} +#elif defined(__clang__) +#define vshrq_n_u8(__v, __bits) \ + (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline int32x4_t +vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) +{ +#ifdef __aarch64__ + return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); +#else + return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); +#endif +} +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define vsliq_n_s32(__vins, __vsh, __bits) \ + (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ + (int32x4_t)(__vsh), (__bits), 34) +#else +#define vsliq_n_s32(__vins, __vsh, __bits) ( \ +{ \ + int32x4_t __tvins = (__vins); \ + int32x4_t __tvsh = (__vsh); \ + uint8_t __tbits = (__bits); \ + int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ + 3,2,1,0); \ + int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ + 3,2,1,0); \ + int32x4_t __r = 
__builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \ + 34); \ + __builtin_shufflevector(__r, __r, 3,2,1,0); \ +}) +#endif /* __LITTLE_ENDIAN__ */ +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits) +{ +#ifdef __aarch64__ + return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits); +#else + return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins, + (int32x4_t)__vsh, __bits); +#endif +} +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define vsriq_n_u32(__vins, __vsh, __bits) \ + (int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \ + (int32x4_t)(__vsh), (__bits), 34) +#else +#define vsriq_n_s32(__vins, __vsh, __bits) ( \ +{ \ + int32x4_t __tvins = (__vins); \ + int32x4_t __tvsh = (__vsh); \ + uint8_t __tbits = (__bits); \ + int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ + 3,2,1,0); \ + int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ + 3,2,1,0); \ + int32x4_t __r = __builtin_neon_vsriq_n_v(__tvins, __tvsh, __tbits, \ + 34); \ + __builtin_shufflevector(__r, __r, 3,2,1,0); \ +}) +#endif +#endif + +_INTRINSATTR +static __inline void +vst1q_u32(uint32_t *__p32, uint32x4_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; + + __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); +#else + __builtin_neon_si *__p = (__builtin_neon_si *)__p32; + + __builtin_neon_vst1v4si(__p, (int32x4_t)__v); +#endif +#elif defined(__clang__) +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, 3,2,1,0); +#endif + __builtin_neon_vst1q_v(__p32, __v, 50); +#endif +} + +_INTRINSATTR +static __inline void +vst1q_u8(uint8_t *__p8, uint8x16_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; + + __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); +#else + __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; + + __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); +#endif +#elif defined(__clang__) +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); +#endif + __builtin_neon_vst1q_v(__p8, __v, 48); +#endif +} + +#ifndef __aarch64__ /* XXX */ + +_INTRINSATTR +static __inline uint8x8_t +vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab, + (int8x8_t)__idx); +#elif defined(__clang__) + uint8x8_t __ret; +#ifndef __LITTLE_ENDIAN__ + __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0); + __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); +#endif + __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab, + (int8x8_t)__idx, 16); +#ifndef __LITTLE_ENDIAN__ + __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); +#endif + return __ret; +#endif +} + +_INTRINSATTR +static __inline uint8x8_t +vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx) +{ +#if defined(__GNUC__) && !defined(__clang__) + union { + uint8x8x2_t __u8x8x82; + __builtin_neon_ti __ti; + } __u = { __tab }; + return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx); +#elif defined(__clang__) + uint8x8_t __ret; +#ifndef __LITTLE_ENDIAN__ + __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0], + 7,6,5,4,3,2,1,0); + __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1], + 7,6,5,4,3,2,1,0); + __idx = 
__builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); +#endif + __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0], + (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16); +#ifndef __LITTLE_ENDIAN__ + __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); +#endif + return __ret; +#endif +} + +#endif /* !defined(__aarch64__) */ + +#endif /* _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H */ Index: src/sys/crypto/arch/arm/arm_neon_imm.h diff -u /dev/null src/sys/crypto/arch/arm/arm_neon_imm.h:1.1 --- /dev/null Mon Aug 7 01:07:36 2023 +++ src/sys/crypto/arch/arm/arm_neon_imm.h Mon Aug 7 01:07:36 2023 @@ -0,0 +1,80 @@ +/* $NetBSD: arm_neon_imm.h,v 1.1 2023/08/07 01:07:36 rin Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_CRYPTO_ARCH_ARM_ARM_NEON_IMM_H +#define _SYS_CRYPTO_ARCH_ARM_ARM_NEON_IMM_H + +/* + * Non-standard macros for writing ARM NEON vector literals. Needed + * because apparently GCC and Clang disagree wildly on the ordering of + * vector literal components -- and both disagree with the + * architectural indexing! 
+ */ + +#if defined(__GNUC__) && !defined(__clang__) +#if defined(__AARCH64EB__) +#define V_N_U8(a,b,c,d,e,f,g,h) \ + {h,g,f,e,d,c,b,a} +#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \ + {p,o,n,m,l,k,j,i, h,g,f,e,d,c,b,a} +#define VQ_N_U32(a,b,c,d) \ + {d,c, b,a} +#elif defined(__ARM_BIG_ENDIAN) +#define V_N_U8(a,b,c,d,e,f,g,h) \ + {h,g,f,e,d,c,b,a} +#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \ + {h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i} +#define VQ_N_U32(a,b,c,d) \ + {b,a, d,c} +#else +#define V_N_U8(a,b,c,d,e,f,g,h) \ + {a,b,c,d,e,f,g,h} +#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \ + {a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p} +#define VQ_N_U32(a,b,c,d) \ + {a,b, c,d} +#endif +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define V_N_U8(a,b,c,d,e,f,g,h) \ + {a,b,c,d,e,f,g,h} +#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \ + {a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p} +#define VQ_N_U32(a,b,c,d) \ + {a,b, c,d} +#else +#define V_N_U8(a,b,c,d,e,f,g,h) \ + {h,g,f,e,d,c,b,a} +#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \ + {p,o,n,m,l,k,j,i, h,g,f,e,d,c,b,a} +#define VQ_N_U32(a,b,c,d) \ + {d,c, b,a} +#endif +#endif + +#endif /* _SYS_CRYPTO_ARCH_ARM_ARM_NEON_IMM_H */ Index: src/sys/crypto/arch/x86/immintrin.h diff -u /dev/null src/sys/crypto/arch/x86/immintrin.h:1.1 --- /dev/null Mon Aug 7 01:07:36 2023 +++ src/sys/crypto/arch/x86/immintrin.h Mon Aug 7 01:07:36 2023 @@ -0,0 +1,351 @@ +/* $NetBSD: immintrin.h,v 1.1 2023/08/07 01:07:36 rin Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H +#define _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H + +#include <sys/types.h> + +/* + * This kludgerous header file provides definitions for the Intel + * intrinsics that work with GCC and Clang, because <immintrin.h> is + * not available during the kernel build and arranging to make it + * available is complicated. Please fix this properly! 
+ */ + +#if defined(__GNUC__) && !defined(__clang__) + +#define _INTRINSATTR \ + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +#define _PACKALIAS + +typedef float __m128 __attribute__((__vector_size__(16), __may_alias__)); +typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__)); +typedef long long __m128i_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); +typedef long long __v2di __attribute__((__vector_size__(16))); +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef unsigned __v4su __attribute__((__vector_size__(16))); +typedef float __v4sf __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); + +#elif defined(__clang__) + +typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m128i + __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m128i_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); +typedef long long __v2di __attribute__((__vector_size__(16))); +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef unsigned __v4su __attribute__((__vector_size__(16))); +typedef float __v4sf __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); + +#define _INTRINSATTR \ + __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ + __min_vector_width__(128))) +#define _PACKALIAS \ + __attribute__((__packed__, __may_alias__)) + +#else + +#error Please teach me how to do Intel intrinsics for your compiler! 
+ +#endif + +#define _SSSE3_ATTR __attribute__((target("ssse3"))) + +_INTRINSATTR +static __inline __m128i +_mm_add_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a + (__v4su)__b); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_alignr_epi8(hi,lo,bytes) \ + (__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi), \ + (__v2di)(__m128i)(lo), 8*(int)(bytes)) +#elif defined(__clang__) +#define _mm_alignr_epi8(hi,lo,bytes) \ + (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi), \ + (__v16qi)(__m128i)(lo), (int)(bytes)) +#endif + +_INTRINSATTR +static __inline __m128 +_mm_load1_ps(const float *__p) +{ + return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p }; +} + +_INTRINSATTR +static __inline __m128i +_mm_loadu_si128(const __m128i_u *__p) +{ + return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v; +} + +_INTRINSATTR +static __inline __m128i +_mm_loadu_si32(const void *__p) +{ + int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v; + return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_loadu_si64(const void *__p) +{ + int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v; + return __extension__ (__m128i)(__v2di){ __v, 0 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_load_si128(const __m128i *__p) +{ + return *__p; +} + +_INTRINSATTR +static __inline __m128 +_mm_movehl_ps(__m128 __v0, __m128 __v1) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1); +#elif defined(__clang__) + return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3); +#endif +} + +_INTRINSATTR +static __inline __m128 +_mm_movelh_ps(__m128 __v0, __m128 __v1) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1); +#elif defined(__clang__) + return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5); +#endif +} + +_INTRINSATTR +static __inline __m128i +_mm_set1_epi16(int16_t __v) +{ + return __extension__ (__m128i)(__v8hi){ + __v, __v, __v, __v, __v, __v, __v, __v + }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set1_epi32(int32_t __v) +{ + return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set1_epi64x(int64_t __v) +{ + return __extension__ (__m128i)(__v2di){ __v, __v }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0) +{ + return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set_epi64x(int64_t __v1, int64_t __v0) +{ + return __extension__ (__m128i)(__v2di){ __v0, __v1 }; +} + +_INTRINSATTR +static __inline __m128 +_mm_setzero_ps(void) +{ + return __extension__ (__m128){ 0, 0, 0, 0 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_setzero_si128(void) +{ + return _mm_set1_epi64x(0); +} + +_INTRINSATTR _SSSE3_ATTR +static __inline __m128i +_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx) +{ + return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl, + (__v16qi)__vidx); +} + +#define _mm_shuffle_epi32(v,m) \ + (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m)) + +#define _mm_shuffle_ps(x,y,m) \ + (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \ + (__v4sf)(__m128)(y), (int)(m)) \ + +_INTRINSATTR +static __inline __m128i +_mm_slli_epi32(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits); +} 
+ +_INTRINSATTR +static __inline __m128i +_mm_slli_epi64(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_slli_si128(v,bytes) \ + (__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v), \ + 8*(int)(bytes)) +#elif defined(__clang__) +#define _mm_slli_si128(v,bytes) \ + (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \ + (int)(bytes)) +#endif + +_INTRINSATTR +static __inline __m128i +_mm_srli_epi32(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits); +} + +_INTRINSATTR +static __inline __m128i +_mm_srli_epi64(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_srli_si128(v,bytes) \ + (__m128i)__builtin_ia32_psrldqi128((__m128i)(v), 8*(int)(bytes)) +#elif defined(__clang__) +#define _mm_srli_si128(v,bytes) \ + (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \ + (int)(bytes)); +#endif + +_INTRINSATTR +static __inline void +_mm_storeu_si128(__m128i_u *__p, __m128i __v) +{ + ((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v; +} + +_INTRINSATTR +static __inline void +_mm_storeu_si32(void *__p, __m128i __v) +{ + ((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0]; +} + +_INTRINSATTR +static __inline void +_mm_storeu_si64(void *__p, __m128i __v) +{ + ((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0]; +} + +_INTRINSATTR +static __inline void +_mm_store_si128(__m128i *__p, __m128i __v) +{ + *__p = __v; +} + +_INTRINSATTR +static __inline __m128i +_mm_sub_epi64(__m128i __x, __m128i __y) +{ + return (__m128i)((__v2du)__x - (__v2du)__y); +} + +_INTRINSATTR +static __inline __m128i +_mm_unpackhi_epi32(__m128i __lo, __m128i __hi) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo, + (__v4si)__hi); +#elif defined(__clang__) + return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi, + 2,6,3,7); +#endif +} + +_INTRINSATTR +static __inline __m128i +_mm_unpacklo_epi32(__m128i __lo, __m128i __hi) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo, + (__v4si)__hi); +#elif defined(__clang__) + return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi, + 0,4,1,5); +#endif +} + +_INTRINSATTR +static __inline __m128i +_mm_unpacklo_epi64(__m128i __lo, __m128i __hi) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo, + (__v2di)__hi); +#elif defined(__clang__) + return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi, + 0,2); +#endif +} + +#endif /* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */ Index: src/sys/crypto/arch/x86/immintrin_ext.h diff -u /dev/null src/sys/crypto/arch/x86/immintrin_ext.h:1.1 --- /dev/null Mon Aug 7 01:07:36 2023 +++ src/sys/crypto/arch/x86/immintrin_ext.h Mon Aug 7 01:07:36 2023 @@ -0,0 +1,48 @@ +/* $NetBSD: immintrin_ext.h,v 1.1 2023/08/07 01:07:36 rin Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_CRYPTO_ARCH_X86_IMMINTRIN_EXT_H +#define _SYS_CRYPTO_ARCH_X86_IMMINTRIN_EXT_H + +#include <crypto/arch/x86/immintrin.h> + +_INTRINSATTR +static __inline __m128i +_mm_loadu_epi8(const void *__p) +{ + return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v; +} + +_INTRINSATTR +static __inline void +_mm_storeu_epi8(void *__p, __m128i __v) +{ + ((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v; +} + +#endif /* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_EXT_H */
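For reference, a minimal sketch of how kernel code built against the shared
x86 shim would use the unaligned load/store helpers above (hypothetical
function and buffer names, not from the commit):

    #include <sys/types.h>

    #include <crypto/arch/x86/immintrin.h>
    #include <crypto/arch/x86/immintrin_ext.h>

    /* Copy 16 bytes between possibly unaligned buffers via the SSE2 shim. */
    static __inline void
    copy16(void *dst, const void *src)
    {
    	/* _mm_loadu_epi8/_mm_storeu_epi8 tolerate unaligned pointers. */
    	_mm_storeu_epi8(dst, _mm_loadu_epi8(src));
    }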