Signed-off-by: Paul B Mahol <one...@gmail.com> --- libavfilter/x86/vf_v360.asm | 67 ++++++++++++++++++++++++++++++++++ libavfilter/x86/vf_v360_init.c | 12 ++++++ 2 files changed, 79 insertions(+)
diff --git a/libavfilter/x86/vf_v360.asm b/libavfilter/x86/vf_v360.asm index a0936eb6dc..5b241220d8 100644 --- a/libavfilter/x86/vf_v360.asm +++ b/libavfilter/x86/vf_v360.asm @@ -26,7 +26,9 @@ SECTION_RODATA pb_mask: db 0,4,8,12,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 +pw_mask: db 0,1,4, 5, 8, 9,12,13,-1,-1,-1,-1,-1,-1,-1,-1 pd_255: times 4 dd 255 +pd_65535: times 4 dd 65535 SECTION .text @@ -60,6 +62,34 @@ cglobal remap1_8bit_line, 6, 7, 6, dst, width, src, in_linesize, u, v, x jl .loop RET +INIT_YMM avx2 +cglobal remap1_16bit_line, 6, 7, 6, dst, width, src, in_linesize, u, v, x + movsxdifnidn widthq, widthd + xor xq, xq + movd xm0, in_linesized + pcmpeqw m4, m4 + VBROADCASTI128 m3, [pw_mask] + vpbroadcastd m0, xm0 + + .loop: + pmovsxwd m1, [vq + xq * 2] + pmovsxwd m2, [uq + xq * 2] + + pslld m2, 0x1 + pmulld m1, m0 + paddd m1, m2 + mova m2, m4 + vpgatherdd m5, [srcq + m1], m2 + pshufb m1, m5, m3 + vextracti128 xm2, m1, 1 + movq [dstq+xq*2], xm1 + movq [dstq+xq*2+8], xm2 + + add xq, mmsize / 4 + cmp xq, widthq + jl .loop + RET + INIT_YMM avx2 cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x movsxdifnidn widthq, widthd @@ -96,6 +126,43 @@ DEFINE_ARGS dst, width, src, x, u, v, ker jl .loop RET +INIT_YMM avx2 +cglobal remap2_16bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x + movsxdifnidn widthq, widthd + movd xm0, in_linesized +%if ARCH_X86_32 +DEFINE_ARGS dst, width, src, x, u, v, ker +%endif + xor xq, xq + pcmpeqw m7, m7 + vpbroadcastd m0, xm0 + vpbroadcastd m6, [pd_65535] + + .loop: + pmovsxwd m1, [kerq + xq * 8] + pmovsxwd m2, [vq + xq * 8] + pmovsxwd m3, [uq + xq * 8] + + pslld m3, 0x1 + pmulld m4, m2, m0 + paddd m4, m3 + mova m3, m7 + vpgatherdd m2, [srcq + m4], m3 + pand m2, m6 + pmulld m2, m1 + phaddd m2, m2 + phaddd m1, m2, m2 + psrld m1, m1, 0xe + vextracti128 xm2, m1, 1 + + pextrw [dstq+xq*2], xm1, 0 + pextrw [dstq+xq*2+2], xm2, 0 + + add xq, mmsize / 16 + cmp xq, widthq + jl .loop + RET + %if ARCH_X86_64 INIT_YMM avx2 diff --git a/libavfilter/x86/vf_v360_init.c b/libavfilter/x86/vf_v360_init.c index 8c1a10c705..c7f4a3dd6d 100644 --- a/libavfilter/x86/vf_v360_init.c +++ b/libavfilter/x86/vf_v360_init.c @@ -32,6 +32,12 @@ void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdi void ff_remap4_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, const uint16_t *u, const uint16_t *v, const int16_t *ker); +void ff_remap1_16bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, + const uint16_t *u, const uint16_t *v, const int16_t *ker); + +void ff_remap2_16bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, + const uint16_t *u, const uint16_t *v, const int16_t *ker); + av_cold void ff_v360_init_x86(V360Context *s, int depth) { int cpu_flags = av_get_cpu_flags(); @@ -42,6 +48,12 @@ av_cold void ff_v360_init_x86(V360Context *s, int depth) if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == BILINEAR && depth <= 8) s->remap_line = ff_remap2_8bit_line_avx2; + if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == NEAREST && depth > 8) + s->remap_line = ff_remap1_16bit_line_avx2; + + if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == BILINEAR && depth > 8) + s->remap_line = ff_remap2_16bit_line_avx2; + #if ARCH_X86_64 if (EXTERNAL_AVX2_FAST(cpu_flags) && (s->interp == BICUBIC || s->interp == LANCZOS) && depth <= 8) -- 2.17.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".