This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit ec959e20c5852e288f3d7bbecb8b09303a53c61b
Author:     Niklas Haas <[email protected]>
AuthorDate: Tue Feb 24 12:41:38 2026 +0100
Commit:     Niklas Haas <[email protected]>
CommitDate: Thu Feb 26 10:15:52 2026 +0000

    swscale/x86/ops: add special case for expanding bits to bytes/words
    
    Not super useful but also not expensive to carry.
    
    monob -> gbrp:
     Before: time=84 us, ref=137 us, speedup=1.618x faster
     After:  time=23 us, ref=185 us, speedup=7.773x faster
    
    monob -> gray16le:
     Before: time=75 us, ref=108 us, speedup=1.440x faster
     After:  time=20 us, ref=108 us, speedup=5.192x faster
    
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c       | 10 ++++++++++
 libswscale/x86/ops_int.asm | 19 +++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index ba8d987fc2..0db6837c3a 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -185,6 +185,12 @@ static int setup_shift(const SwsOp *op, SwsOpPriv *out)
         .flexible = true,                                                      \
     );
 
+#define DECL_EXPAND_BITS(EXT, BITS)                                            \
+    DECL_ASM(U##BITS, expand_bits##BITS##EXT,                                  \
+        .op = SWS_OP_SCALE,                                                    \
+        .scale = Q((1 << (BITS)) - 1),                                         \
+    );
+
 static int setup_dither(const SwsOp *op, SwsOpPriv *out)
 {
     /* 1x1 matrix / single constant */
@@ -268,6 +274,7 @@ static int setup_linear(const SwsOp *op, SwsOpPriv *out)
     DECL_RW(EXT, U8, read_nibbles,  READ,  1, false, 1)                        \
     DECL_RW(EXT, U8, read_bits,     READ,  1, false, 3)                        \
     DECL_RW(EXT, U8, write_bits,    WRITE, 1, false, 3)                        \
+    DECL_EXPAND_BITS(EXT, 8)                                                   \
     DECL_PACKED_RW(EXT, 8)                                                     \
     DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0)                                      \
     DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0)                                      \
@@ -336,6 +343,7 @@ static const SwsOpTable ops8##EXT = {
         &op_read_nibbles1##EXT,                                                \
         &op_read_bits1##EXT,                                                   \
         &op_write_bits1##EXT,                                                  \
+        &op_expand_bits8##EXT,                                                 \
         &op_pack_1210##EXT,                                                    \
         &op_pack_3320##EXT,                                                    \
         &op_pack_2330##EXT,                                                    \
@@ -386,6 +394,7 @@ static const SwsOpTable ops8##EXT = {
 
 #define DECL_FUNCS_16(SIZE, EXT, FLAG)                                         \
     DECL_PACKED_RW(EXT, 16)                                                    \
+    DECL_EXPAND_BITS(EXT, 16)                                                  \
     DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0)                                     \
     DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0)                                     \
     DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0)                                     \
@@ -414,6 +423,7 @@ static const SwsOpTable ops16##EXT = {
         &op_unpack_4440##EXT,                                                  \
         &op_unpack_5550##EXT,                                                  \
         &op_unpack_5650##EXT,                                                  \
+        &op_expand_bits16##EXT,                                                \
         REF_COMMON_PATTERNS(swap_bytes_U16##EXT),                              \
         REF_COMMON_PATTERNS(convert_U8_U16##EXT),                              \
         REF_COMMON_PATTERNS(convert_U16_U8##EXT),                              \
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
index 44af92a7da..bc9e43a098 100644
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@@ -52,6 +52,9 @@ mask2: times 32 db 0x03
 mask3: times 32 db 0x07
 mask4: times 32 db 0x0F
 
+const1b equ mask1
+const1w: times 16 dw 0x01
+
 SECTION .text
 
 ;---------------------------------------------------------
@@ -456,7 +459,7 @@ IF V2,  movd mx2, [in0q + 2]
 %endif
         mova m8, [bits_shuf]
         VBROADCASTI128 m9,  [bits_mask]
-        VBROADCASTI128 m10, [mask1]
+        VBROADCASTI128 m10, [const1b]
         LOAD_CONT tmp0q
         add in0q, (mmsize >> 3) * (1 + V2)
         pshufb mx,  m8
@@ -947,7 +950,7 @@ IF W,   vpermq mw, mw, q3120
 %endmacro
 
 ;---------------------------------------------------------
-; Shifting
+; Shifting and scaling
 
 %macro lshift16 0
 op lshift16
@@ -983,6 +986,16 @@ IF W,   psrlw mw2, xm8
         CONTINUE tmp0q
 %endmacro
 
+; special cases for expanding bits to full range
+%macro expand_bits 2 ; bits, suffix
+op expand_bits%1
+        mova m8, [const1%2]
+        LOAD_CONT tmp0q
+        pcmpeq%2 mx, m8
+IF V2,  pcmpeq%2 mx2, m8
+        CONTINUE tmp0q
+%endmacro
+
 ;---------------------------------------------------------
 ; Macro instantiations for kernel functions
 
@@ -1000,6 +1013,7 @@ IF W,   psrlw mw2, xm8
     read_nibbles
     read_bits
     write_bits
+    expand_bits 8, b
 
     pack_generic 1, 2, 1
     pack_generic 3, 3, 2
@@ -1022,6 +1036,7 @@ IF W,   psrlw mw2, xm8
 
 %macro funcs_u16 0
     rw_packed 16
+    expand_bits 16, w
     pack_generic  4, 4, 4
     pack_generic  5, 5, 5
     pack_generic  5, 6, 5

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to