On Wed, Dec 4, 2019 at 4:03 AM Ting Fu <ting...@intel.com> wrote:
> +    VBROADCASTSD y_offset, [pointer_c_ditherq + 8  * 8]
> +    VBROADCASTSD u_offset, [pointer_c_ditherq + 9  * 8]
> +    VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8]
> +    VBROADCASTSD ug_coff,  [pointer_c_ditherq + 7  * 8]
> +    VBROADCASTSD vg_coff,  [pointer_c_ditherq + 6  * 8]
> +    VBROADCASTSD y_coff,   [pointer_c_ditherq + 3  * 8]
> +    VBROADCASTSD ub_coff,  [pointer_c_ditherq + 5  * 8]
> +    VBROADCASTSD vr_coff,  [pointer_c_ditherq + 4  * 8]
[...]
> +    vpbroadcastq m2, mu_offset
> +    vpbroadcastq m3, mv_offset
> +    vpbroadcastq m4, my_offset

Use movddup instead of VBROADCASTSD/vpbroadcastq for these 64-bit broadcasts.

> +    mova m2, m0
> +    mova m3, m1
> +    vpbroadcastq m4, mug_coff
> +    vpbroadcastq m5, mvg_coff
> +    pmulhw m2, m4
> +    pmulhw m3, m5

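The register-register moves can be eliminated by broadcasting the
coefficients straight into the destination registers (pmulhw is
commutative, so the operand order doesn't matter):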
    movddup m2, mug_coff
    movddup m3, mvg_coff
    pmulhw m2, m0
    pmulhw m3, m1

> +    mova m0, m3
> +    pshufb m0, [mask_evenword] ; R2 G2 R6 G6 R10 G10 R14 G14 -- -- -- -- -- -- -- --
> +    mova m1, m2
> +    pshufb m1, [mask_oddword]  ; G1 B1 G5 B5 G9 B9 G13 B13 -- -- -- -- -- -- -- --
> +    punpcklwd m1, m0           ; G1 B1 R2 G2 G5 B5 R6 G6 G9 B9 R10 G10 G13 B13 R14 G14
> +    mova m0,m6
> +    pshufb m0, [mask_evenword] ; B2 R3 B6 R7 B10 R11 B14 R15 -- -- -- -- -- -- -- --
> +    mova m4, m2
> +    pshufb m4, [mask_evenword] ; G3 B3 G7 B7 G11 B11 G15 B15 -- -- -- -- -- -- -- --
> +    punpcklwd m0, m4
> +    pshufb m3, [mask_oddword]  ; R0 G0 R4 G4 R8 G8 R12 G12 -- -- -- -- -- -- -- --
> +    pshufb m6, [mask_oddword]  ; B0 R1 B4 R5 B8 R9 B12 R13 -- -- -- -- -- -- -- --
> +    mova m5, m0
> +    mova m7, m1
> +    punpcklwd m3, m6 ; R0  G0  B0  R1  R4  G4  B4  R5  R8  G8  B8  R9  R12 G12 B12 R13
> +    punpckldq m7, m5 ; G1  B1  R2  G2  B2  R3  G3  B3  G5  B5  R6  G6  B6  R7  G7  B7
> +    punpckhdq m1, m0 ; G9  B9  R10 G10 B10 R11 G11 B11 G13 B13 R14 G14 B14 R15 G15 B15
> +    mova m0, m3
> +    mova m2, m7
> +    pshufb m0, [mask_dw01to03] ; R0 G0 B0 R1 -- -- -- -- -- -- -- -- R4 G4 B4 R5
> +    pshufb m2, [mask_dw01to12] ; -- -- -- -- G1 B1 R2 G2 B2 R3 G3 B3 -- -- -- --
> +    por m0, m2                 ; R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
> +    mova m2, m3
> +    mova m4, m7
> +    pshufb m2, [mask_dw2to2]   ; -- -- -- -- -- -- -- -- R8 G8 B8 R9 -- -- -- --
> +    pshufb m4, [mask_dw23to01] ; G5 B5 R6 G6 B6 R7 G7 B7 -- -- -- -- -- -- -- --
> +    por m2, m4
> +    mova m4, m1
> +    pshufb m4, [mask_dw0to3]   ; -- -- -- -- -- -- -- -- -- -- -- -- G9 B9 R10 G10
> +    por m2, m4                 ; G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 R10 G10
> +    pshufb m3, [mask_dw3to1]     ; --- --- --- --- R12 G12 B12 R13 --- --- --- --- --- --- --- ---
> +    pshufb m1, [mask_dw123to023] ; B10 R11 G11 B11 --- --- --- --- G13 B13 R14 G14 B14 R15 G15 B15
> +    por m1, m3                   ; B10 R11 G11 B11 R12 G12 B12 R13 G13 B13 R14 G14 B14 R15 G15 B15

It's probably faster to use fewer shuffles and combine the results with
masking instead, e.g. something along the lines of:

rgb_shuf1: db  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5, 10, 11
rgb_shuf2: db 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5
rgb_shuf3: db  4,  5, 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15
rgb_mask1: db -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0
rgb_mask2: db  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1
rgb_mask3: db  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0
[...]
pshufb m3, [rgb_shuf1] ; r0  g0  r6  g6  r12 g12 r2  g2  r8  g8  r14 g14 r4  g4  r10 g10
pshufb m6, [rgb_shuf2] ; b10 r11 b0  r1  b6  r7  b12 r13 b2  r3  b8  r9  b14 r15 b4  r5
pshufb m2, [rgb_shuf3] ; g5  b5  g11 b11 g1  b1  g7  b7  g13 b13 g3  b3  g9  b9  g15 b15
mova   m7, [rgb_mask1]
mova   m4, [rgb_mask2]
mova   m5, [rgb_mask3]
pand   m0, m7, m3      ; r0  g0  ___ ___ ___ ___ r2  g2  ___ ___ ___ ___ r4  g4  ___ ___
pand   m1, m4, m6      ; ___ ___ b0  r1  ___ ___ ___ ___ b2  r3  ___ ___ ___ ___ b4  r5
por    m0, m1
pand   m1, m5, m2      ; ___ ___ ___ ___ g1  b1  ___ ___ ___ ___ g3  b3  ___ ___ ___ ___
por    m0, m1          ; r0  g0  b0  r1  g1  b1  r2  g2  b2  r3  g3  b3  r4  g4  b4  r5
pand   m1, m7, m2      ; g5  b5  ___ ___ ___ ___ g7  b7  ___ ___ ___ ___ g9  b9  ___ ___
pand   m7, m6          ; b10 r11 ___ ___ ___ ___ b12 r13 ___ ___ ___ ___ b14 r15 ___ ___
pand   m6, m5          ; ___ ___ ___ ___ b6  r7  ___ ___ ___ ___ b8  r9  ___ ___ ___ ___
por    m1, m6
pand   m6, m4, m3      ; ___ ___ r6  g6  ___ ___ ___ ___ r8  g8  ___ ___ ___ ___ r10 g10
pand   m2, m4          ; ___ ___ g11 b11 ___ ___ ___ ___ g13 b13 ___ ___ ___ ___ g15 b15
pand   m3, m5          ; ___ ___ ___ ___ r12 g12 ___ ___ ___ ___ r14 g14 ___ ___ ___ ___
por    m2, m7
por    m1, m6          ; g5  b5  r6  g6  b6  r7  g7  b7  r8  g8  b8  r9  g9  b9  r10 g10
por    m2, m3          ; b10 r11 g11 b11 r12 g12 b12 r13 g13 b13 r14 g14 b14 r15 g15 b15
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to