                                    A55               A76
deinterleave_bytes_c:             70342.0           34497.5
deinterleave_bytes_neon:          21594.5 ( 3.26x)   5535.2 ( 6.23x)
deinterleave_bytes_aligned_c:     71340.8           34651.2
deinterleave_bytes_aligned_neon:   8616.8 ( 8.28x)   3996.2 ( 8.67x)
---
 libswscale/aarch64/rgb2rgb.c      |  4 ++
 libswscale/aarch64/rgb2rgb_neon.S | 59 +++++++++++++++++++++++
 tests/checkasm/sw_rgb.c           | 77 +++++++++++++++++++++++++++++++
 3 files changed, 140 insertions(+)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index a9bf6ff9e0..31db23bff4 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -30,6 +30,9 @@
 void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
                               uint8_t *dest, int width, int height,
                               int src1Stride, int src2Stride, int dstStride);
+void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
+                                int width, int height, int srcStride,
+                                int dst1Stride, int dst2Stride);
 
 av_cold void rgb2rgb_init_aarch64(void)
 {
@@ -37,5 +40,6 @@ av_cold void rgb2rgb_init_aarch64(void)
 
     if (have_neon(cpu_flags)) {
         interleaveBytes = ff_interleave_bytes_neon;
+        deinterleaveBytes = ff_deinterleave_bytes_neon;
     }
 }
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index d81110ec57..2e4f2fb766 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -77,3 +77,62 @@ function ff_interleave_bytes_neon, export=1
 0:
         ret
 endfunc
+
+// void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
+//                                 int width, int height, int srcStride,
+//                                 int dst1Stride, int dst2Stride);
+function ff_deinterleave_bytes_neon, export=1
+        sub             w5, w5, w3, lsl #1      // srcStride  -= 2*width (loads post-increment x0)
+        sub             w6, w6, w3              // dst1Stride -= width   (stores post-increment x1)
+        sub             w7, w7, w3              // dst2Stride -= width   (stores post-increment x2)
+1:                                              // start of one row
+        ands            w8, w3, #0xfffffff0     // & ~15: byte pairs handled 16 at a time
+        b.eq            3f                      // no full group of 16 pairs in this row
+2:
+        ld2             {v0.16b, v1.16b}, [x0], #32 // first byte of each pair -> v0, second -> v1
+        subs            w8, w8, #16
+        st1             {v0.16b}, [x1], #16
+        st1             {v1.16b}, [x2], #16
+        b.gt            2b
+
+        tst             w3, #15
+        b.eq            9f                      // width is a multiple of 16: row finished
+
+3:                                              // 8 pairs, if bit 3 of width is set
+        tst             w3, #8
+        b.eq            4f
+        ld2             {v0.8b, v1.8b}, [x0], #16
+        st1             {v0.8b}, [x1], #8
+        st1             {v1.8b}, [x2], #8
+4:                                              // 4 pairs, if bit 2 of width is set
+        tst             w3, #4
+        b.eq            5f
+
+        ld1             {v0.8b}, [x0], #8       // 4 pairs loaded as 16-bit lanes
+        shrn            v1.8b, v0.8h, #8        // upper byte of each 16-bit lane
+        xtn             v0.8b, v0.8h            // lower byte of each 16-bit lane
+        st1             {v0.s}[0], [x1], #4     // only the low 4 result bytes are stored
+        st1             {v1.s}[0], [x2], #4
+
+5:                                              // remaining 0-3 pairs, one per iteration
+        ands            w8, w3, #3
+        b.eq            9f
+6:
+        ldrh            w9, [x0], #2
+        subs            w8, w8, #1
+        ubfx            w10, w9, #8, #8         // bits 15:8 of the loaded halfword
+        strb            w9, [x1], #1            // bits 7:0 go to dst1
+        strb            w10, [x2], #1
+        b.gt            6b
+
+9:                                              // end of row
+        subs            w4, w4, #1
+        b.eq            0f                      // last row done
+        add             x0, x0, w5, sxtw        // sign-extended, so negative strides step backwards
+        add             x1, x1, w6, sxtw
+        add             x2, x2, w7, sxtw
+        b               1b
+
+0:
+        ret
+endfunc
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index f278454d3d..987841a54f 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -182,6 +182,80 @@ static void check_interleave_bytes(void)
     }
 }
 
+static void check_deinterleave_bytes(void)
+{
+    LOCAL_ALIGNED_16(uint8_t, src_buf, [2*MAX_STRIDE*MAX_HEIGHT+2]);
+    LOCAL_ALIGNED_16(uint8_t, dst0_u_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, dst0_v_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, dst1_u_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, dst1_v_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    // Intentionally using unaligned buffers, as this function doesn't have
+    // any alignment requirements.
+    uint8_t *src = src_buf + 2;
+    uint8_t *dst0_u = dst0_u_buf + 1;
+    uint8_t *dst0_v = dst0_v_buf + 1;
+    uint8_t *dst1_u = dst1_u_buf + 1;
+    uint8_t *dst1_v = dst1_v_buf + 1;
+
+    declare_func(void, const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
+                 int width, int height, int srcStride,
+                 int dst1Stride, int dst2Stride);
+
+    randomize_buffers(src, 2*MAX_STRIDE*MAX_HEIGHT); // src is 2 bytes into src_buf; a larger size overruns it
+
+    if (check_func(deinterleaveBytes, "deinterleave_bytes")) {
+        for (int i = 0; i <= 16; i++) {
+            // Try all widths [1,16], and try one random width.
+
+            int w = i > 0 ? i : (1 + (rnd() % (MAX_STRIDE-2)));
+            int h = 1 + (rnd() % (MAX_HEIGHT-2));
+
+            int src_offset = 0, src_stride = 2 * MAX_STRIDE;
+            int dst_u_offset = 0, dst_u_stride = MAX_STRIDE;
+            int dst_v_offset = 0, dst_v_stride = MAX_STRIDE;
+
+            memset(dst0_u, 0, MAX_STRIDE * MAX_HEIGHT);
+            memset(dst0_v, 0, MAX_STRIDE * MAX_HEIGHT);
+            memset(dst1_u, 0, MAX_STRIDE * MAX_HEIGHT);
+            memset(dst1_v, 0, MAX_STRIDE * MAX_HEIGHT);
+
+            // Try different combinations of negative strides
+            if (i & 1) {
+                src_offset = (h-1)*src_stride;
+                src_stride = -src_stride;
+            }
+            if (i & 2) {
+                dst_u_offset = (h-1)*dst_u_stride;
+                dst_u_stride = -dst_u_stride;
+            }
+            if (i & 4) {
+                dst_v_offset = (h-1)*dst_v_stride;
+                dst_v_stride = -dst_v_stride;
+            }
+
+            call_ref(src + src_offset, dst0_u + dst_u_offset, dst0_v + dst_v_offset,
+                     w, h, src_stride, dst_u_stride, dst_v_stride);
+            call_new(src + src_offset, dst1_u + dst_u_offset, dst1_v + dst_v_offset,
+                     w, h, src_stride, dst_u_stride, dst_v_stride);
+            // Check a one pixel-pair edge around the destination area,
+            // to catch overwrites past the end.
+            checkasm_check(uint8_t, dst0_u, MAX_STRIDE, dst1_u, MAX_STRIDE,
+                           w + 1, h + 1, "dst_u");
+            checkasm_check(uint8_t, dst0_v, MAX_STRIDE, dst1_v, MAX_STRIDE,
+                           w + 1, h + 1, "dst_v");
+        }
+
+        bench_new(src, dst1_u, dst1_v, 127, MAX_HEIGHT,
+                  2*MAX_STRIDE, MAX_STRIDE, MAX_STRIDE);
+    }
+    if (check_func(deinterleaveBytes, "deinterleave_bytes_aligned")) {
+        // Bench the function in a more typical case, with aligned
+        // buffers and widths.
+        bench_new(src_buf, dst1_u_buf, dst1_v_buf, 128, MAX_HEIGHT,
+                  2*MAX_STRIDE, MAX_STRIDE, MAX_STRIDE);
+    }
+}
+
 #define MAX_LINE_SIZE 1920
 static const int input_sizes[] = {8, 128, 1080, MAX_LINE_SIZE};
 static const enum AVPixelFormat rgb_formats[] = {
@@ -315,6 +389,9 @@ void checkasm_check_sw_rgb(void)
     check_interleave_bytes();
     report("interleave_bytes");
 
+    check_deinterleave_bytes();
+    report("deinterleave_bytes");
+
     ctx = sws_getContext(MAX_LINE_SIZE, MAX_LINE_SIZE, AV_PIX_FMT_RGB24,
                          MAX_LINE_SIZE, MAX_LINE_SIZE, AV_PIX_FMT_YUV420P,
                          SWS_ACCURATE_RND | SWS_BITEXACT, NULL, NULL, NULL);
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".