[1]
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load
[2]
https://developer.arm.com/documentation/100076/0100/A64-Instruction-Set-Reference/A64-Floating-point-Instructions/LDNP--SIMD-and-FP-
V2:
- Only copy from non-temporal source to non-temporal destination.
I.e. remove the two variants with only source and/or destination being
non-temporal.
- Do not require alignment.
Instead, offer additional 4 and 16 byte aligned functions for performance
purposes.
- Implemented two of the functions for x86.
- Remove memset function.
Signed-off-by: Morten Brørup <m...@smartsharesystems.com>
---
/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Copy data from non-temporal source to non-temporal destination.
 *
 * @param dst
 *   Pointer to the non-temporal destination of the data.
 *   Should be 4 byte aligned, for optimal performance.
 * @param src
 *   Pointer to the non-temporal source data.
 *   No alignment requirements.
 * @param len
 *   Number of bytes to copy.
 *   Should be divisible by 4, for optimal performance.
 */
__rte_experimental
static __rte_always_inline
__attribute__((__nonnull__(1, 2), __access__(write_only, 1, 3),
__access__(read_only, 2, 3)))
void rte_memcpy_nt(void * __rte_restrict dst, const void * __rte_restrict src,
size_t len)
/* Implementation T.B.D. */
/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Copy data in blocks of 16 byte from aligned non-temporal source
 * to aligned non-temporal destination.
 *
 * Data is moved with _mm_stream_load_si128() / _mm_stream_si128():
 * four 16 byte blocks per iteration while at least 64 bytes remain,
 * then one 16 byte block per iteration for the rest.
 *
 * @param dst
 *   Pointer to the non-temporal destination of the data.
 *   Must be 16 byte aligned.
 * @param src
 *   Pointer to the non-temporal source data.
 *   Must be 16 byte aligned.
 * @param len
 *   Number of bytes to copy.
 *   Must be divisible by 16.
 */
__rte_experimental
static __rte_always_inline
__attribute__((__nonnull__(1, 2), __access__(write_only, 1, 3),
		__access__(read_only, 2, 3)))
void rte_memcpy_nt16a(void * __rte_restrict dst, const void * __rte_restrict src,
		size_t len)
{
	/* One past the last source byte; the loops below run until src reaches it. */
	const void * const end = RTE_PTR_ADD(src, len);

	RTE_ASSERT(rte_is_aligned(dst, sizeof(__m128i)));
	RTE_ASSERT(rte_is_aligned(src, sizeof(__m128i)));
	/*
	 * len is an integer, not a pointer, so it must not be passed to
	 * rte_is_aligned(); test its low bits directly instead.
	 */
	RTE_ASSERT((len & (sizeof(__m128i) - 1)) == 0);

	/* Copy large portion of data, 4 blocks of 16 byte per iteration. */
	while (RTE_PTR_DIFF(end, src) >= 4 * sizeof(__m128i)) {
		register __m128i xmm0, xmm1, xmm2, xmm3;

		/*
		 * Note: Workaround for _mm_stream_load_si128() not taking
		 * a const pointer as parameter.
		 */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
		xmm0 = _mm_stream_load_si128(RTE_PTR_ADD(src, 0 * sizeof(__m128i)));
		xmm1 = _mm_stream_load_si128(RTE_PTR_ADD(src, 1 * sizeof(__m128i)));
		xmm2 = _mm_stream_load_si128(RTE_PTR_ADD(src, 2 * sizeof(__m128i)));
		xmm3 = _mm_stream_load_si128(RTE_PTR_ADD(src, 3 * sizeof(__m128i)));
#pragma GCC diagnostic pop
		_mm_stream_si128(RTE_PTR_ADD(dst, 0 * sizeof(__m128i)), xmm0);
		_mm_stream_si128(RTE_PTR_ADD(dst, 1 * sizeof(__m128i)), xmm1);
		_mm_stream_si128(RTE_PTR_ADD(dst, 2 * sizeof(__m128i)), xmm2);
		_mm_stream_si128(RTE_PTR_ADD(dst, 3 * sizeof(__m128i)), xmm3);
		src = RTE_PTR_ADD(src, 4 * sizeof(__m128i));
		dst = RTE_PTR_ADD(dst, 4 * sizeof(__m128i));
	}

	/* Copy remaining data, one block of 16 byte per iteration. */
	while (src != end) {
		register __m128i xmm;

		/*
		 * Note: Workaround for _mm_stream_load_si128() not taking
		 * a const pointer as parameter.
		 */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
		xmm = _mm_stream_load_si128(src);
#pragma GCC diagnostic pop
		_mm_stream_si128(dst, xmm);
		src = RTE_PTR_ADD(src, sizeof(__m128i));
		dst = RTE_PTR_ADD(dst, sizeof(__m128i));
	}
}
/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Copy data in blocks of 4 byte from aligned non-temporal source
 * to aligned non-temporal destination.
 *
 * The source is always read in whole, 16 byte aligned blocks with
 * _mm_stream_load_si128(); the wanted 32 bit words of each block are
 * then written to the destination with _mm_stream_si32().
 * Consequently, up to 12 bytes before @p src and up to 12 bytes after
 * the end of the source data may be read (never written); these bytes
 * lie in the same 16 byte aligned blocks as the first and last byte
 * of the source data.
 *
 * @param dst
 *   Pointer to the non-temporal destination of the data.
 *   Must be 4 byte aligned.
 * @param src
 *   Pointer to the non-temporal source data.
 *   Must be 4 byte aligned.
 * @param len
 *   Number of bytes to copy.
 *   Must be divisible by 4.
 */
__rte_experimental
static __rte_always_inline
__attribute__((__nonnull__(1, 2), __access__(write_only, 1, 3),
		__access__(read_only, 2, 3)))
void rte_memcpy_nt4a(void * __rte_restrict dst, const void * __rte_restrict src,
		size_t len)
{
	/** Bounce buffer giving 32 bit access to one loaded 16 byte block. */
	int32_t buf[sizeof(__m128i) / sizeof(int32_t)]
			__rte_aligned(sizeof(__m128i));
	/** Address of source data, rounded down to achieve alignment. */
	const void *srca = RTE_PTR_ALIGN_FLOOR(src, sizeof(__m128i));
	/** Address of end of source data (one past the last byte). */
	const void * const srcend = RTE_PTR_ADD(src, len);
	/** Address of end of source data, rounded down to achieve alignment. */
	const void * const srcenda = RTE_PTR_ALIGN_FLOOR(srcend,
			sizeof(__m128i));
	/** Number of 32 bit words to skip in the first aligned block (0..3). */
	const size_t offset = RTE_PTR_DIFF(src, srca) / sizeof(int32_t);
	/** Number of 32 bit words to copy from the last aligned block (0..3). */
	const size_t tail = RTE_PTR_DIFF(srcend, srcenda) / sizeof(int32_t);
	register __m128i xmm0;
	size_t i;

	RTE_ASSERT(rte_is_aligned(dst, sizeof(int32_t)));
	RTE_ASSERT(rte_is_aligned(src, sizeof(int32_t)));
	/*
	 * len is an integer, not a pointer, so it must not be passed to
	 * rte_is_aligned(); test its low bits directly instead.
	 */
	RTE_ASSERT((len & (sizeof(int32_t) - 1)) == 0);

	if (unlikely(len == 0))
		return;

	/* Copy first, non-__m128i aligned, part of source data. */
	if (offset) {
		/** Number of source words from src up to the next block boundary. */
		const size_t head = sizeof(buf) / sizeof(buf[0]) - offset;
		/** Total number of words requested. */
		const size_t words = len / sizeof(int32_t);
		/** Number of words to take from the first block. */
		const size_t n = words < head ? words : head;

		/*
		 * Note: Workaround for _mm_stream_load_si128() not taking
		 * a const pointer as parameter.
		 */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
		xmm0 = _mm_stream_load_si128(srca);
		_mm_store_si128((void *)buf, xmm0);
#pragma GCC diagnostic pop
		for (i = 0; i < n; i++)
			_mm_stream_si32(RTE_PTR_ADD(dst, i * sizeof(int32_t)),
					buf[offset + i]);

		/* All data was located in the first block; nothing remains. */
		if (unlikely(words <= head))
			return;

		/*
		 * The whole first block has been consumed, so the aligned
		 * source pointer advances by a full block (keeping it
		 * 16 byte aligned), while the destination advances only by
		 * the number of words actually written.
		 */
		srca = RTE_PTR_ADD(srca, sizeof(__m128i));
		dst = RTE_PTR_ADD(dst, head * sizeof(int32_t));
	}

	/* Copy middle, __m128i aligned, part of source data. */
	while (srca != srcenda) {
		/*
		 * Note: Workaround for _mm_stream_load_si128() not taking
		 * a const pointer as parameter.
		 */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
		xmm0 = _mm_stream_load_si128(srca);
#pragma GCC diagnostic pop
		_mm_store_si128((void *)buf, xmm0);
		_mm_stream_si32(RTE_PTR_ADD(dst, 0 * sizeof(int32_t)), buf[0]);
		_mm_stream_si32(RTE_PTR_ADD(dst, 1 * sizeof(int32_t)), buf[1]);
		_mm_stream_si32(RTE_PTR_ADD(dst, 2 * sizeof(int32_t)), buf[2]);
		_mm_stream_si32(RTE_PTR_ADD(dst, 3 * sizeof(int32_t)), buf[3]);
		srca = RTE_PTR_ADD(srca, sizeof(__m128i));
		dst = RTE_PTR_ADD(dst, 4 * sizeof(int32_t));
	}

	/*
	 * Copy last, non-__m128i aligned, part of source data:
	 * the words located between the last aligned block boundary
	 * and the end of the source data.
	 */
	if (tail) {
		/*
		 * Note: Workaround for _mm_stream_load_si128() not taking
		 * a const pointer as parameter.
		 */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
		xmm0 = _mm_stream_load_si128(srca);
		_mm_store_si128((void *)buf, xmm0);
#pragma GCC diagnostic pop
		for (i = 0; i < tail; i++)
			_mm_stream_si32(RTE_PTR_ADD(dst, i * sizeof(int32_t)),
					buf[i]);
	}
}