Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time

Martin Storsjö Thu, 29 May 2025 12:09:59 -0700

On Tue, 27 May 2025, Dmitriy Kovalenko wrote:

This patches integrates so called double bufferring when we are loading
2 batch elements at a time and then processing them in parallel. On the
moden arm processors especially Apple Silicon it gives a visible
benefit, for subsampled pixel processing it is especially nice because
it allows to read elements w/ 2 instructions and write with a single one
(which is usually the slowest part).


Including the previous patch in a stack on macbook pro m4 max rgb_to_yuv_half
in checkasm goes up 2x of the c version
---
libswscale/aarch64/input.S | 332 ++++++++++++++++++++++++++++++++++---
1 file changed, 309 insertions(+), 23 deletions(-)

diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index ee8eb24c14..59d66d0022 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -194,40 +194,94 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
        ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv
        ldp             w14, w15, [x6, #28]     // w14: gv, w15: bv
4:
-        cmp             w5, #8
        rgb_set_uv_coeff half=1
-        b.lt            2f
-1:  // load 16 pixels and prefetch memory for the next block
+
+        cmp             w5, #16
+        b.lt            2f                      // Go directly to scalar if < 
16
+
+1:
    .if \element == 3
-        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
-        prfm            pldl1strm, [x3, #48]
+        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48  // First 16 
pixels
+        ld3             { v26.16b, v27.16b, v28.16b }, [x3], #48  // Second 16 
pixels
+        prfm            pldl1keep, [x3, #96]
    .else
-        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
-        prfm            pldl1strm, [x3, #64]
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64  // 
First 16 pixels
+        ld4             { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64  // 
Second 16 pixels
+        prfm            pldl1keep, [x3, #128]
    .endif

+    // **Sum adjacent pixel pairs**
    .if \alpha_first
-        uaddlp          v21.8h, v19.16b         // v21: summed b pairs
-        uaddlp          v20.8h, v18.16b         // v20: summed g pairs
-        uaddlp          v19.8h, v17.16b         // v19: summed r pairs
+        uaddlp          v21.8h, v19.16b         // Block 1: B sums
+        uaddlp          v20.8h, v18.16b         // Block 1: G sums
+        uaddlp          v19.8h, v17.16b         // Block 1: R sums
+        uaddlp          v31.8h, v29.16b         // Block 2: B sums
+        uaddlp          v30.8h, v28.16b         // Block 2: G sums
+        uaddlp          v29.8h, v27.16b         // Block 2: R sums
    .else
-        uaddlp          v19.8h, v16.16b         // v19: summed r pairs
-        uaddlp          v20.8h, v17.16b         // v20: summed g pairs
-        uaddlp          v21.8h, v18.16b         // v21: summed b pairs
+        uaddlp          v19.8h, v16.16b         // Block 1: R sums
+        uaddlp          v20.8h, v17.16b         // Block 1: G sums
+        uaddlp          v21.8h, v18.16b         // Block 1: B sums
+        uaddlp          v29.8h, v26.16b         // Block 2: R sums
+        uaddlp          v30.8h, v27.16b         // Block 2: G sums
+        uaddlp          v31.8h, v28.16b         // Block 2: B sums
    .endif

-        mov             v22.16b, v6.16b         // U first half
-        mov             v23.16b, v6.16b         // U second half
-        mov             v24.16b, v6.16b         // V first half
-        mov             v25.16b, v6.16b         // V second half
-
-        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, 
v22, v23, v24, v25, v16, v17, #10
+        // init accumulatos for both blocks
+        mov             v7.16b, v6.16b          //  U_low
+        mov             v8.16b, v6.16b          //  U_high
+        mov             v9.16b, v6.16b          //  V_low
+        mov             v10.16b, v6.16b         //  V_high
+        mov             v11.16b, v6.16b         //  U_low
+        mov             v12.16b, v6.16b         //  U_high
+        mov             v13.16b, v6.16b         //  V_low
+        mov             v14.16b, v6.16b         //  V_high
+
+        smlal           v7.4s, v0.4h, v19.4h    // U += ru * r (0-3)
+        smlal           v9.4s, v3.4h, v19.4h    // V += rv * r (0-3)
+        smlal           v11.4s, v0.4h, v29.4h   // U += ru * r (0-3)
+        smlal           v13.4s, v3.4h, v29.4h   // V += rv * r (0-3)
+
+        smlal2          v8.4s, v0.8h, v19.8h    // U += ru * r (4-7)
+        smlal2          v10.4s, v3.8h, v19.8h   // V += rv * r (4-7)
+        smlal2          v12.4s, v0.8h, v29.8h   // U += ru * r (4-7)
+        smlal2          v14.4s, v3.8h, v29.8h   // V += rv * r (4-7)
+
+        smlal           v7.4s, v1.4h, v20.4h    // U += gu * g (0-3)
+        smlal           v9.4s, v4.4h, v20.4h    // V += gv * g (0-3)
+        smlal           v11.4s, v1.4h, v30.4h   // U += gu * g (0-3)
+        smlal           v13.4s, v4.4h, v30.4h   // V += gv * g (0-3)
+
+        smlal2          v8.4s, v1.8h, v20.8h    // U += gu * g (4-7)
+        smlal2          v10.4s, v4.8h, v20.8h   // V += gv * g (4-7)
+        smlal2          v12.4s, v1.8h, v30.8h   // U += gu * g (4-7)
+        smlal2          v14.4s, v4.8h, v30.8h   // V += gv * g (4-7)
+
+        smlal           v7.4s, v2.4h, v21.4h    // U += bu * b (0-3)
+        smlal           v9.4s, v5.4h, v21.4h    // V += bv * b (0-3)
+        smlal           v11.4s, v2.4h, v31.4h   // U += bu * b (0-3)
+        smlal           v13.4s, v5.4h, v31.4h   // V += bv * b (0-3)
+
+        smlal2          v8.4s, v2.8h, v21.8h    // U += bu * b (4-7)
+        smlal2          v10.4s, v5.8h, v21.8h   // V += bv * b (4-7)
+        smlal2          v12.4s, v2.8h, v31.8h   // U += bu * b (4-7)
+        smlal2          v14.4s, v5.8h, v31.8h   // V += bv * b (4-7)
+
+        sqshrn          v16.4h, v7.4s, #10      // U (0-3)
+        sqshrn          v17.4h, v9.4s, #10      // V (0-3)
+        sqshrn          v22.4h, v11.4s, #10     // U (0-3)
+        sqshrn          v23.4h, v13.4s, #10     // V (0-3)
+
+        sqshrn2         v16.8h, v8.4s, #10      // U (0-7)
+        sqshrn2         v17.8h, v10.4s, #10     // V (0-7)
+        sqshrn2         v22.8h, v12.4s, #10     // U (0-7)
+        sqshrn2         v23.8h, v14.4s, #10     // V (0-7)

-        str             q16, [x0], #16          // store dst_u
-        str             q17, [x1], #16          // store dst_v
+        stp             q16, q22, [x0], #32     // Store all 16 U values
+        stp             q17, q23, [x1], #32     // Store all 16 V values

-        sub             w5, w5, #8              // width -= 8
-        cmp             w5, #8                  // width >= 8 ?
+        sub             w5, w5, #16             // width -= 16
+        cmp             w5, #16                 // width >= 16 ?
        b.ge            1b
        cbz             w5, 3f                  // No pixels left? Exit

@@ -459,3 +513,235 @@ endfunc

DISABLE_DOTPROD
#endif
+
+.macro rgbToUV_half_neon_double fmt_bgr, fmt_rgb, element, alpha_first=0
+function ff_\fmt_bgr\()ToUV_half_neon_double, export=1
+        cbz             w5, 9f                  // exit immediately if width 
is 0
+        cmp             w5, #16                 // check if we have at least 
16 pixels
+        b.lt            _ff_\fmt_bgr\()ToUV_half_neon

This fails to link on anything other than Darwin targets; other platformsdon't have an underscore prefix on symbols. Use the X() macro aroundsymbol names to get the right external symbol name for the function.

Also, with that fixed, this fails to properly back up and restoreregisters v8-v15; checkasm doesn't notice this on macOS, but on Linux andwindows, checkasm has a call wrapper which does detect such issues.

I have set up a set of test configurations for aarch64 assembly on github;if you fetch the branchhttps://github.com/mstorsjo/ffmpeg/commits/gha-aarch64, append your owncommits on top, and push this to your own fork on github, it'll testbuilding it in all the relevant configurations (most relevantplatforms/toolchains, including rare ones that not everybody may haveavailalbe). (You may need to activate the actions by visitinghttp://github.com/<yourusername>/ffmpeg/actions.) It also does check thatthe indentation of the assembly matches the common style.


// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time

Reply via email to