On Wed, Dec 4, 2019 at 4:03 AM Ting Fu <[email protected]> wrote:
> + VBROADCASTSD y_offset, [pointer_c_ditherq + 8 * 8]
> + VBROADCASTSD u_offset, [pointer_c_ditherq + 9 * 8]
> + VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8]
> + VBROADCASTSD ug_coff, [pointer_c_ditherq + 7 * 8]
> + VBROADCASTSD vg_coff, [pointer_c_ditherq + 6 * 8]
> + VBROADCASTSD y_coff, [pointer_c_ditherq + 3 * 8]
> + VBROADCASTSD ub_coff, [pointer_c_ditherq + 5 * 8]
> + VBROADCASTSD vr_coff, [pointer_c_ditherq + 4 * 8]
[...]
> + vpbroadcastq m2, mu_offset
> + vpbroadcastq m3, mv_offset
> + vpbroadcastq m4, my_offset
Use movddup here instead of VBROADCASTSD/vpbroadcastq.
> + mova m2, m0
> + mova m3, m1
> + vpbroadcastq m4, mug_coff
> + vpbroadcastq m5, mvg_coff
> + pmulhw m2, m4
> + pmulhw m3, m5
The register-register moves can be eliminated by loading each coefficient
directly into the destination register (pmulhw is commutative):
movddup m2, mug_coff
movddup m3, mvg_coff
pmulhw m2, m0
pmulhw m3, m1
> + mova m0, m3
> + pshufb m0, [mask_evenword] ; R2 G2 R6 G6 R10 G10 R14 G14 -- -- -- -- --
> -- -- --
> + mova m1, m2
> + pshufb m1, [mask_oddword] ; G1 B1 G5 B5 G9 B9 G13 B13 -- -- -- -- -- --
> -- --
> + punpcklwd m1, m0 ; G1 B1 R2 G2 G5 B5 R6 G6 G9 B9 R10 G10 G13
> B13 R14 G14
> + mova m0,m6
> + pshufb m0, [mask_evenword] ; B2 R3 B6 R7 B10 R11 B14 R15 -- -- -- -- --
> -- -- --
> + mova m4, m2
> + pshufb m4, [mask_evenword] ; G3 B3 G7 B7 G11 B11 G15 B15 -- -- -- -- --
> -- -- --
> + punpcklwd m0, m4
> + pshufb m3, [mask_oddword] ; R0 G0 R4 G4 R8 G8 R12 G12 -- -- -- -- -- --
> -- --
> + pshufb m6, [mask_oddword] ; B0 R1 B4 R5 B8 R9 B12 R13 -- -- -- -- -- --
> -- --
> + mova m5, m0
> + mova m7, m1
> + punpcklwd m3, m6 ; R0 G0 B0 R1 R4 G4 B4 R5 R8 G8 B8 R9 R12
> G12 B12 R13
> + punpckldq m7, m5 ; G1 B1 R2 G2 B2 R3 G3 B3 G5 B5 R6 G6 B6
> R7 G7 B7
> + punpckhdq m1, m0 ; G9 B9 R10 G10 B10 R11 G11 B11 G13 B13 R14 G14 B14
> R15 G15 B15
> + mova m0, m3
> + mova m2, m7
> + pshufb m0, [mask_dw01to03] ; R0 G0 B0 R1 -- -- -- -- -- -- -- -- R4 G4
> B4 R5
> + pshufb m2, [mask_dw01to12] ; -- -- -- -- G1 B1 R2 G2 B2 R3 G3 B3 -- --
> -- --
> + por m0, m2 ; R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4
> B4 R5
> + mova m2, m3
> + mova m4, m7
> + pshufb m2, [mask_dw2to2] ; -- -- -- -- -- -- -- -- R8 G8 B8 R9 -- --
> -- --
> + pshufb m4, [mask_dw23to01] ; G5 B5 R6 G6 B6 R7 G7 B7 -- -- -- -- -- --
> -- --
> + por m2, m4
> + mova m4, m1
> + pshufb m4, [mask_dw0to3] ; -- -- -- -- -- -- -- -- -- -- -- -- G9 B9
> R10 G10
> + por m2, m4 ; G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9
> R10 G10
> + pshufb m3, [mask_dw3to1] ; --- --- --- --- R12 G12 B12 R13 --- ---
> --- --- --- --- --- ---
> + pshufb m1, [mask_dw123to023] ; B10 R11 G11 B11 --- --- --- --- G13 B13
> R14 G14 B14 R15 G15 B15
> + por m1, m3 ; B10 R11 G11 B11 R12 G12 B12 R13 G13 B13
> R14 G14 B14 R15 G15 B15
It is probably faster to use fewer shuffles and rely on masking instead,
e.g. something along the lines of:
rgb_shuf1: db 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11
rgb_shuf2: db 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5
rgb_shuf3: db 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15
rgb_mask1: db -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0
rgb_mask2: db 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1
rgb_mask3: db 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0
[...]
pshufb m3, [rgb_shuf1] ; r0 g0 r6 g6 r12 g12 r2 g2 r8 g8 r14 g14 r4 g4 r10 g10
pshufb m6, [rgb_shuf2] ; b10 r11 b0 r1 b6 r7 b12 r13 b2 r3 b8 r9 b14 r15 b4 r5
pshufb m2, [rgb_shuf3] ; g5 b5 g11 b11 g1 b1 g7 b7 g13 b13 g3 b3 g9 b9 g15 b15
mova m7, [rgb_mask1]
mova m4, [rgb_mask2]
mova m5, [rgb_mask3]
pand m0, m7, m3 ; r0 g0 ___ ___ ___ ___ r2 g2 ___ ___ ___ ___ r4 g4 ___ ___
pand m1, m4, m6 ; ___ ___ b0 r1 ___ ___ ___ ___ b2 r3 ___ ___ ___ ___ b4 r5
por m0, m1
pand m1, m5, m2 ; ___ ___ ___ ___ g1 b1 ___ ___ ___ ___ g3 b3 ___ ___ ___ ___
por m0, m1 ; r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5
pand m1, m7, m2 ; g5 b5 ___ ___ ___ ___ g7 b7 ___ ___ ___ ___ g9 b9 ___ ___
pand m7, m6 ; b10 r11 ___ ___ ___ ___ b12 r13 ___ ___ ___ ___ b14 r15 ___ ___
pand m6, m5 ; ___ ___ ___ ___ b6 r7 ___ ___ ___ ___ b8 r9 ___ ___ ___ ___
por m1, m6
pand m6, m4, m3 ; ___ ___ r6 g6 ___ ___ ___ ___ r8 g8 ___ ___ ___ ___ r10 g10
pand m2, m4 ; ___ ___ g11 b11 ___ ___ ___ ___ g13 b13 ___ ___ ___ ___ g15 b15
pand m3, m5 ; ___ ___ ___ ___ r12 g12 ___ ___ ___ ___ r14 g14 ___ ___ ___ ___
por m2, m7
por m1, m6 ; g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 g9 b9 r10 g10
por m2, m3 ; b10 r11 g11 b11 r12 g12 b12 r13 g13 b13 r14 g14 b14 r15 g15 b15
_______________________________________________
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".