The branch, master has been updated
via 00225e9ebc943fdec451f826e342c821489a81f3 (commit)
via 3049694e9f51d883c54c2c5500c8fd9d388536f9 (commit)
via 3ed590c7b9fb0663903a2927f713bec8ad0232b6 (commit)
via 617c042093129758a53b01e6fc400356d8f4e566 (commit)
via 29f439077a1b7a0a93832fdaefddcc0b7577c5e7 (commit)
via 4539f7e4d4953fbf2086cac0e54575627f7df1e8 (commit)
via 3e2d9b73c13292a324d8846dc49d807b59224612 (commit)
via 15a4289b79d8c2e2453921544f7983248ef498bd (commit)
from c7815a4b707b7ae685e5809179094a275885759e (commit)
- Log -----------------------------------------------------------------
commit 00225e9ebc943fdec451f826e342c821489a81f3
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Oct 4 15:45:36 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Oct 7 18:06:40 2025 +0200
avcodec/x86/h264_qpel: Simplify macros
1. Remove the OP parameter from the QPEL_H264* macros. These are
remnants of inline assembly and were forgotten in
610e00b3594bf0f2a75713f20e9c4edf0d03a818.
2. Pass the instruction set extension for the shift5 function
explicitly in the macro instead of using magic #defines.
3. Likewise, avoid magic #defines for (8|16)_v_lowpass_ssse3.
Signed-off-by: Andreas Rheinhardt <[email protected]>
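For illustration only, here is a minimal, self-contained C sketch of the
pattern points 2 and 3 describe: a "magic" #define that silently redirects
the ssse3 name to the sse2 implementation versus passing the callee's
extension suffix as an explicit macro parameter. All names below are
hypothetical and not taken from the tree.

    #include <stdio.h>

    static void v_lowpass_sse2(void) { puts("v_lowpass_sse2"); }

    /* Old pattern: the ssse3 name is a magic alias for the sse2 version,
     * so the generating macro cannot see which implementation it calls. */
    #define v_lowpass_ssse3 v_lowpass_sse2
    #define MC_OLD(EXT)        v_lowpass_ ## EXT()

    /* New pattern: the suffix for the inner call is passed explicitly. */
    #define MC_NEW(EXT, V_EXT) v_lowpass_ ## V_EXT()

    int main(void)
    {
        MC_OLD(ssse3);        /* resolves only because of the alias #define */
        MC_NEW(ssse3, sse2);  /* explicit, no alias needed */
        return 0;
    }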
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 9b22c74286..f7596329e2 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -78,7 +78,7 @@ void ff_put_h264_qpel4_hv_lowpass_v_mmxext(const uint8_t
*src, int16_t *tmp, ptr
DEF_QPEL(avg)
DEF_QPEL(put)
-#define QPEL_H264(OPNAME, OP, MMX)\
+#define QPEL_H264(OPNAME, MMX)\
static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t
*dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t
srcStride)\
{\
src -= 2*srcStride+2;\
@@ -100,17 +100,17 @@ static av_always_inline void ff_ ## OPNAME ##
h264_qpel16_h_lowpass_l2_ ## EXT(u
#if ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+#define QPEL_H264_H16_XMM(OPNAME, MMX)\
void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src,
const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src,
const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);
#else // ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, EXT) QPEL_H264_H16(OPNAME, EXT)
+#define QPEL_H264_H16_XMM(OPNAME, EXT) QPEL_H264_H16(OPNAME, EXT)
#endif // ARCH_X86_64
-#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
-QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+#define QPEL_H264_H_XMM(OPNAME, MMX)\
+QPEL_H264_H16_XMM(OPNAME, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ##
MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride,
srcStride);\
@@ -121,15 +121,15 @@ static av_always_inline void ff_ ## OPNAME ##
h264_qpel16_h_lowpass_ ## MMX(uint
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride,
srcStride);\
}\
-#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ##
MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+#define QPEL_H264_V_XMM(OPNAME, XMM, XMM2)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ##
XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride,
srcStride, 8);\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## XMM2(dst , src ,
dstStride, srcStride, 8);\
}\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ##
MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ##
XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride,
srcStride, 16);\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride,
srcStride, 16);\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## XMM2(dst , src ,
dstStride, srcStride, 16);\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## XMM2(dst+8, src+8,
dstStride, srcStride, 16);\
}
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
@@ -146,7 +146,7 @@ static av_always_inline void
put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
}
}
-#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
+#define QPEL_H264_HV_XMM(OPNAME, MMX)\
static av_always_inline void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t
*dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t
srcStride)\
{\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, 8);\
@@ -158,18 +158,10 @@ static av_always_inline void OPNAME ##
h264_qpel16_hv_lowpass_ ## MMX(uint8_t *d
ff_ ## OPNAME ## h264_qpel16_hv2_lowpass_ ## MMX(dst, tmp, dstStride);\
}\
-#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
-#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
-#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
-#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
-
-#define ff_put_pixels4_l2_shift5_sse2 ff_put_pixels4_l2_shift5_mmxext
-#define ff_avg_pixels4_l2_shift5_sse2 ff_avg_pixels4_l2_shift5_mmxext
-
-#define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
-H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
+#define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT) \
+H264_MC_V(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT)\
+H264_MC_HV(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT)\
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
@@ -188,7 +180,7 @@ static void avg_h264_qpel8_mc00_mmxext(uint8_t *dst, const
uint8_t *src,
ff_avg_pixels8_mmxext(dst, src, stride, 8);
}
-#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
+#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN, UNUSED) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const
uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src,
stride, stride);\
@@ -204,7 +196,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ##
MMX(uint8_t *dst, const uin
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src,
src+1, stride, stride);\
}\
-#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
+#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN, UNUSED) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const
uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
@@ -224,7 +216,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ##
MMX(uint8_t *dst, const uin
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp,
stride, stride, SIZE);\
}\
-#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
+#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN, SHIFT5_EXT) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const
uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
@@ -286,7 +278,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ##
MMX(uint8_t *dst, const uin
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((uintptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE,
stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+2, halfHV,
stride);\
+ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_ ## SHIFT5_EXT(dst, halfV+2,
halfHV, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const
uint8_t *src, ptrdiff_t stride)\
@@ -296,35 +288,37 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ##
MMX(uint8_t *dst, const uin
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((uintptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE,
stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+3, halfHV,
stride);\
+ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_ ## SHIFT5_EXT(dst, halfV+3,
halfHV, stride);\
}\
-#define H264_MC(QPEL, SIZE, MMX, ALIGN)\
-QPEL(put_, SIZE, MMX, ALIGN) \
-QPEL(avg_, SIZE, MMX, ALIGN) \
-
-#define H264_MC_816(QPEL, XMM)\
-QPEL(put_, 8, XMM, 16)\
-QPEL(put_, 16,XMM, 16)\
-QPEL(avg_, 8, XMM, 16)\
-QPEL(avg_, 16,XMM, 16)\
-
-QPEL_H264(put_, PUT_OP, mmxext)
-QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
-QPEL_H264_V_XMM(put_, PUT_OP, sse2)
-QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
-QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
-QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
-QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
-QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
-
-H264_MC(H264_MC_V_H_HV, 4, mmxext, 8)
-H264_MC_816(H264_MC_V, sse2)
-H264_MC_816(H264_MC_HV, sse2)
-H264_MC_816(H264_MC_H, ssse3)
-H264_MC_816(H264_MC_HV, ssse3)
+#define H264_MC(QPEL, SIZE, MMX, ALIGN, SHIFT5_EXT)\
+QPEL(put_, SIZE, MMX, ALIGN, SHIFT5_EXT) \
+QPEL(avg_, SIZE, MMX, ALIGN, SHIFT5_EXT) \
+
+#define H264_MC_816(QPEL, XMM, SHIFT5_EXT)\
+QPEL(put_, 8, XMM, 16, SHIFT5_EXT)\
+QPEL(put_, 16,XMM, 16, SHIFT5_EXT)\
+QPEL(avg_, 8, XMM, 16, SHIFT5_EXT)\
+QPEL(avg_, 16,XMM, 16, SHIFT5_EXT)\
+
+QPEL_H264(put_, mmxext)
+QPEL_H264(avg_, mmxext)
+QPEL_H264_V_XMM(put_, sse2, sse2)
+QPEL_H264_V_XMM(avg_, sse2, sse2)
+QPEL_H264_HV_XMM(put_, sse2)
+QPEL_H264_HV_XMM(avg_, sse2)
+QPEL_H264_H_XMM(put_, ssse3)
+QPEL_H264_H_XMM(avg_, ssse3)
+QPEL_H264_V_XMM(put_, ssse3, sse2)
+QPEL_H264_V_XMM(avg_, ssse3, sse2)
+QPEL_H264_HV_XMM(put_, ssse3)
+QPEL_H264_HV_XMM(avg_, ssse3)
+
+H264_MC(H264_MC_V_H_HV, 4, mmxext, 8, mmxext)
+H264_MC_816(H264_MC_V, sse2, sse2)
+H264_MC_816(H264_MC_HV, sse2, sse2)
+H264_MC_816(H264_MC_H, ssse3, sse2)
+H264_MC_816(H264_MC_HV, ssse3, sse2)
//10bit
commit 3049694e9f51d883c54c2c5500c8fd9d388536f9
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Oct 4 15:07:18 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Oct 7 18:06:40 2025 +0200
avcodec/x86/h264_qpel: Split hv2_lowpass_sse2 into size 8,16 funcs
This is beneficial size-wise: the 384B of new asm functions are more
than outweighed by 416B of savings from simpler calls here (for size 16,
the size-8 function had been called twice).
It also makes the code more readable, as it allowed several wrappers
in h264_qpel.c to be removed.
It is also beneficial performance-wise. Old benchmarks:
avg_h264_qpel_16_mc12_8_c: 1757.7 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 197.7 ( 8.89x)
avg_h264_qpel_16_mc12_8_ssse3: 204.6 ( 8.59x)
avg_h264_qpel_16_mc21_8_c: 1631.6 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 276.4 ( 5.90x)
avg_h264_qpel_16_mc21_8_ssse3: 290.7 ( 5.61x)
avg_h264_qpel_16_mc22_8_c: 1122.7 ( 1.00x)
avg_h264_qpel_16_mc22_8_sse2: 179.5 ( 6.25x)
avg_h264_qpel_16_mc22_8_ssse3: 181.8 ( 6.17x)
avg_h264_qpel_16_mc23_8_c: 1626.7 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 276.8 ( 5.88x)
avg_h264_qpel_16_mc23_8_ssse3: 290.9 ( 5.59x)
avg_h264_qpel_16_mc32_8_c: 1754.1 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 193.8 ( 9.05x)
avg_h264_qpel_16_mc32_8_ssse3: 203.6 ( 8.62x)
put_h264_qpel_16_mc12_8_c: 1733.6 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 189.6 ( 9.14x)
put_h264_qpel_16_mc12_8_ssse3: 199.6 ( 8.69x)
put_h264_qpel_16_mc21_8_c: 1616.0 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 284.3 ( 5.69x)
put_h264_qpel_16_mc21_8_ssse3: 296.5 ( 5.45x)
put_h264_qpel_16_mc22_8_c: 963.7 ( 1.00x)
put_h264_qpel_16_mc22_8_sse2: 169.9 ( 5.67x)
put_h264_qpel_16_mc22_8_ssse3: 186.1 ( 5.18x)
put_h264_qpel_16_mc23_8_c: 1607.2 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 275.0 ( 5.84x)
put_h264_qpel_16_mc23_8_ssse3: 297.8 ( 5.40x)
put_h264_qpel_16_mc32_8_c: 1734.7 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 189.4 ( 9.16x)
put_h264_qpel_16_mc32_8_ssse3: 199.4 ( 8.70x)
New benchmarks:
avg_h264_qpel_16_mc12_8_c: 1743.7 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 189.7 ( 9.19x)
avg_h264_qpel_16_mc12_8_ssse3: 204.4 ( 8.53x)
avg_h264_qpel_16_mc21_8_c: 1637.7 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 267.7 ( 6.12x)
avg_h264_qpel_16_mc21_8_ssse3: 291.5 ( 5.62x)
avg_h264_qpel_16_mc22_8_c: 1150.3 ( 1.00x)
avg_h264_qpel_16_mc22_8_sse2: 164.6 ( 6.99x)
avg_h264_qpel_16_mc22_8_ssse3: 182.1 ( 6.32x)
avg_h264_qpel_16_mc23_8_c: 1635.3 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 268.5 ( 6.09x)
avg_h264_qpel_16_mc23_8_ssse3: 298.5 ( 5.48x)
avg_h264_qpel_16_mc32_8_c: 1740.6 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 182.6 ( 9.53x)
avg_h264_qpel_16_mc32_8_ssse3: 201.9 ( 8.62x)
put_h264_qpel_16_mc12_8_c: 1727.4 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 188.1 ( 9.18x)
put_h264_qpel_16_mc12_8_ssse3: 199.6 ( 8.65x)
put_h264_qpel_16_mc21_8_c: 1623.5 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 265.9 ( 6.11x)
put_h264_qpel_16_mc21_8_ssse3: 299.4 ( 5.42x)
put_h264_qpel_16_mc22_8_c: 954.0 ( 1.00x)
put_h264_qpel_16_mc22_8_sse2: 161.8 ( 5.89x)
put_h264_qpel_16_mc22_8_ssse3: 180.4 ( 5.29x)
put_h264_qpel_16_mc23_8_c: 1611.2 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 265.8 ( 6.06x)
put_h264_qpel_16_mc23_8_ssse3: 300.3 ( 5.37x)
put_h264_qpel_16_mc32_8_c: 1734.5 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 180.0 ( 9.63x)
put_h264_qpel_16_mc32_8_ssse3: 199.7 ( 8.69x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
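For readers who prefer C to asm, the shape of this refactor can be sketched
roughly as follows. The 8-pixel helper is a trivial stand-in for the real
6-tap filter, and every name is illustrative only, not code from the tree.

    #include <stdint.h>

    static void hv2_block8(uint8_t *dst, const int16_t *tmp)
    {
        for (int x = 0; x < 8; x++)      /* stand-in for the real filter row */
            dst[x] = (uint8_t)(tmp[x] >> 6);
    }

    /* Old shape: one function, block width decided at run time. */
    static void hv2_lowpass_8or16(uint8_t *dst, const int16_t *tmp, int size)
    {
        int w = size >> 4;               /* 0: one 8-wide block, 1: two */
        do {
            hv2_block8(dst, tmp);
            dst += 8;
            tmp += 8;
        } while (w--);
    }

    /* New shape: two specialized entry points, no run-time size dispatch. */
    static void hv2_lowpass_8(uint8_t *dst, const int16_t *tmp)
    {
        hv2_block8(dst, tmp);
    }

    static void hv2_lowpass_16(uint8_t *dst, const int16_t *tmp)
    {
        hv2_block8(dst,     tmp);
        hv2_block8(dst + 8, tmp + 8);
    }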
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index f4082e2242..9b22c74286 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -65,7 +65,8 @@ void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t
*dst, const uint8_t *
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const
uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t
*dst, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src,
int16_t *tmp, ptrdiff_t srcStride, int size);\
-void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_sse2(uint8_t *dst,
int16_t *tmp, ptrdiff_t dstStride, int h);\
+void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_sse2(uint8_t *dst, int16_t *tmp,
ptrdiff_t dstStride);\
+void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_sse2(uint8_t *dst, int16_t
*tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t
*tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t
*tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t
*src16, const uint8_t *src8, ptrdiff_t dstStride);\
@@ -84,16 +85,6 @@ static av_always_inline void OPNAME ##
h264_qpel4_hv_lowpass_ ## MMX(uint8_t *ds
ff_put_h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
-\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ##
MMX(uint8_t *dst, int16_t *tmp, ptrdiff_t dstStride, int size)\
-{\
- int w = size>>4;\
- do{\
- ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_sse2(dst, tmp, dstStride,
size);\
- tmp += 8;\
- dst += 8;\
- }while(w--);\
-}\
#define QPEL_H264_H16(OPNAME, EXT) \
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ##
EXT(uint8_t *dst, const uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride,
ptrdiff_t src2Stride)\
@@ -156,40 +147,22 @@ static av_always_inline void
put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ##
MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride,
ptrdiff_t srcStride, int size)\
-{\
- put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, size);\
- ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride,
size);\
-}\
static av_always_inline void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t
*dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t
srcStride)\
{\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride,
srcStride, 8);\
+ put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, 8);\
+ ff_ ## OPNAME ## h264_qpel8_hv2_lowpass_ ## MMX(dst, tmp, dstStride);\
}\
static av_always_inline void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t
*dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t
srcStride)\
{\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride,
srcStride, 16);\
+ put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, srcStride, 16);\
+ ff_ ## OPNAME ## h264_qpel16_hv2_lowpass_ ## MMX(dst, tmp, dstStride);\
}\
-#define SSSE3_HV2_LOWPASS_WRAPPER(OPNAME) \
-static av_always_inline void \
-ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp,
ptrdiff_t dstStride, int size) \
-{\
- if (size == 8)\
- ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(dst, tmp, dstStride);\
- else\
- ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(dst, tmp, dstStride);\
-}
-SSSE3_HV2_LOWPASS_WRAPPER(avg)
-SSSE3_HV2_LOWPASS_WRAPPER(put)
-
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
-#define ff_put_h264_qpel8or16_hv2_lowpass_sse2
ff_put_h264_qpel8or16_hv2_lowpass_mmxext
-#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2
ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
-
#define ff_put_pixels4_l2_shift5_sse2 ff_put_pixels4_l2_shift5_mmxext
#define ff_avg_pixels4_l2_shift5_sse2 ff_avg_pixels4_l2_shift5_mmxext
diff --git a/libavcodec/x86/h264_qpel_8bit.asm
b/libavcodec/x86/h264_qpel_8bit.asm
index 39a387b4bb..6e082819ac 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -611,28 +611,45 @@ cglobal put_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src,
tmp, srcStride, size
.end:
RET
+%macro HV2_LOWPASS 2
+ mova %1, [r1+%2]
+ movu m1, [r1+2+%2]
+ movu m3, [r1+10+%2]
+ movu m4, [r1+8+%2]
+ movu m2, [r1+4+%2]
+ paddw %1, m3
+ movu m3, [r1+6+%2]
+ paddw m1, m4
+ psubw %1, m1
+ psraw %1, 2
+ paddw m2, m3
+ psubw %1, m1
+ paddsw %1, m2
+ psraw %1, 2
+ paddw %1, m2
+ psraw %1, 6
+%endmacro
-%macro QPEL8OR16_HV2_LOWPASS_OP 1
-cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4,6 ; dst, tmp, dstStride, h
+%macro QPEL8AND16_HV2_LOWPASS_OP 1
+cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
+ mov r3d, 8
.loop:
- mova m0, [r1]
- movu m1, [r1+2]
- movu m3, [r1+10]
- movu m4, [r1+8]
- movu m2, [r1+4]
- movu m5, [r1+6]
- paddw m0, m3
- paddw m1, m4
- psubw m0, m1
- psraw m0, 2
- paddw m2, m5
- psubw m0, m1
- paddsw m0, m2
- psraw m0, 2
- paddw m0, m2
- psraw m0, 6
+ HV2_LOWPASS m0, 0
packuswb m0, m0
- op_%1h m0, [r0], m5
+ op_%1h m0, [r0], m3
+ add r1, 48
+ add r0, r2
+ dec r3d
+ jne .loop
+ RET
+
+cglobal %1_h264_qpel16_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
+ mov r3d, 16
+.loop:
+ HV2_LOWPASS m0, 0
+ HV2_LOWPASS m5, 16
+ packuswb m0, m5
+ op_%1 m0, [r0], m3
add r1, 48
add r0, r2
dec r3d
@@ -641,8 +658,8 @@ cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4,6 ; dst, tmp,
dstStride, h
%endmacro
INIT_XMM sse2
-QPEL8OR16_HV2_LOWPASS_OP put
-QPEL8OR16_HV2_LOWPASS_OP avg
+QPEL8AND16_HV2_LOWPASS_OP put
+QPEL8AND16_HV2_LOWPASS_OP avg
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
commit 3ed590c7b9fb0663903a2927f713bec8ad0232b6
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Oct 4 12:38:32 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Oct 7 18:06:40 2025 +0200
avcodec/x86/h264_qpel: Port qpel8or16_hv2_lowpass_op_mmxext to SSE2
This means that only blocksize 4 still uses mmx(ext).
Old benchmarks:
avg_h264_qpel_8_mc12_8_c: 428.4 ( 1.00x)
avg_h264_qpel_8_mc12_8_sse2: 74.3 ( 5.77x)
avg_h264_qpel_8_mc12_8_ssse3: 69.3 ( 6.18x)
avg_h264_qpel_8_mc21_8_c: 401.4 ( 1.00x)
avg_h264_qpel_8_mc21_8_sse2: 97.8 ( 4.10x)
avg_h264_qpel_8_mc21_8_ssse3: 93.7 ( 4.28x)
avg_h264_qpel_8_mc22_8_c: 281.8 ( 1.00x)
avg_h264_qpel_8_mc22_8_sse2: 66.7 ( 4.23x)
avg_h264_qpel_8_mc22_8_ssse3: 62.6 ( 4.50x)
avg_h264_qpel_8_mc23_8_c: 397.2 ( 1.00x)
avg_h264_qpel_8_mc23_8_sse2: 97.9 ( 4.06x)
avg_h264_qpel_8_mc23_8_ssse3: 93.7 ( 4.24x)
avg_h264_qpel_8_mc32_8_c: 432.4 ( 1.00x)
avg_h264_qpel_8_mc32_8_sse2: 73.9 ( 5.85x)
avg_h264_qpel_8_mc32_8_ssse3: 69.5 ( 6.22x)
avg_h264_qpel_16_mc12_8_c: 1756.4 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 240.0 ( 7.32x)
avg_h264_qpel_16_mc12_8_ssse3: 204.5 ( 8.59x)
avg_h264_qpel_16_mc21_8_c: 1635.3 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 321.2 ( 5.09x)
avg_h264_qpel_16_mc21_8_ssse3: 288.5 ( 5.67x)
avg_h264_qpel_16_mc22_8_c: 1130.8 ( 1.00x)
avg_h264_qpel_16_mc22_8_sse2: 219.4 ( 5.15x)
avg_h264_qpel_16_mc22_8_ssse3: 182.2 ( 6.21x)
avg_h264_qpel_16_mc23_8_c: 1622.5 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 321.3 ( 5.05x)
avg_h264_qpel_16_mc23_8_ssse3: 289.5 ( 5.60x)
avg_h264_qpel_16_mc32_8_c: 1762.5 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 236.1 ( 7.46x)
avg_h264_qpel_16_mc32_8_ssse3: 205.2 ( 8.59x)
put_h264_qpel_8_mc12_8_c: 427.2 ( 1.00x)
put_h264_qpel_8_mc12_8_sse2: 72.1 ( 5.93x)
put_h264_qpel_8_mc12_8_ssse3: 67.0 ( 6.38x)
put_h264_qpel_8_mc21_8_c: 402.9 ( 1.00x)
put_h264_qpel_8_mc21_8_sse2: 95.9 ( 4.20x)
put_h264_qpel_8_mc21_8_ssse3: 91.9 ( 4.38x)
put_h264_qpel_8_mc22_8_c: 235.0 ( 1.00x)
put_h264_qpel_8_mc22_8_sse2: 64.6 ( 3.64x)
put_h264_qpel_8_mc22_8_ssse3: 60.0 ( 3.92x)
put_h264_qpel_8_mc23_8_c: 403.6 ( 1.00x)
put_h264_qpel_8_mc23_8_sse2: 95.9 ( 4.21x)
put_h264_qpel_8_mc23_8_ssse3: 91.7 ( 4.40x)
put_h264_qpel_8_mc32_8_c: 430.7 ( 1.00x)
put_h264_qpel_8_mc32_8_sse2: 72.1 ( 5.97x)
put_h264_qpel_8_mc32_8_ssse3: 67.0 ( 6.43x)
put_h264_qpel_16_mc12_8_c: 1724.2 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 230.7 ( 7.47x)
put_h264_qpel_16_mc12_8_ssse3: 199.8 ( 8.63x)
put_h264_qpel_16_mc21_8_c: 1613.3 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 327.5 ( 4.93x)
put_h264_qpel_16_mc21_8_ssse3: 297.2 ( 5.43x)
put_h264_qpel_16_mc22_8_c: 959.2 ( 1.00x)
put_h264_qpel_16_mc22_8_sse2: 211.9 ( 4.53x)
put_h264_qpel_16_mc22_8_ssse3: 186.1 ( 5.15x)
put_h264_qpel_16_mc23_8_c: 1619.0 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 319.7 ( 5.06x)
put_h264_qpel_16_mc23_8_ssse3: 299.2 ( 5.41x)
put_h264_qpel_16_mc32_8_c: 1741.7 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 230.9 ( 7.54x)
put_h264_qpel_16_mc32_8_ssse3: 199.4 ( 8.74x)
New benchmarks:
avg_h264_qpel_8_mc12_8_c: 427.2 ( 1.00x)
avg_h264_qpel_8_mc12_8_sse2: 63.9 ( 6.69x)
avg_h264_qpel_8_mc12_8_ssse3: 69.2 ( 6.18x)
avg_h264_qpel_8_mc21_8_c: 399.2 ( 1.00x)
avg_h264_qpel_8_mc21_8_sse2: 87.7 ( 4.55x)
avg_h264_qpel_8_mc21_8_ssse3: 93.9 ( 4.25x)
avg_h264_qpel_8_mc22_8_c: 285.7 ( 1.00x)
avg_h264_qpel_8_mc22_8_sse2: 56.4 ( 5.07x)
avg_h264_qpel_8_mc22_8_ssse3: 62.6 ( 4.56x)
avg_h264_qpel_8_mc23_8_c: 398.6 ( 1.00x)
avg_h264_qpel_8_mc23_8_sse2: 87.6 ( 4.55x)
avg_h264_qpel_8_mc23_8_ssse3: 93.8 ( 4.25x)
avg_h264_qpel_8_mc32_8_c: 425.8 ( 1.00x)
avg_h264_qpel_8_mc32_8_sse2: 63.8 ( 6.67x)
avg_h264_qpel_8_mc32_8_ssse3: 69.0 ( 6.17x)
avg_h264_qpel_16_mc12_8_c: 1748.2 ( 1.00x)
avg_h264_qpel_16_mc12_8_sse2: 198.5 ( 8.81x)
avg_h264_qpel_16_mc12_8_ssse3: 203.2 ( 8.60x)
avg_h264_qpel_16_mc21_8_c: 1638.1 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2: 277.4 ( 5.91x)
avg_h264_qpel_16_mc21_8_ssse3: 291.1 ( 5.63x)
avg_h264_qpel_16_mc22_8_c: 1140.7 ( 1.00x)
avg_h264_qpel_16_mc22_8_sse2: 180.3 ( 6.33x)
avg_h264_qpel_16_mc22_8_ssse3: 181.9 ( 6.27x)
avg_h264_qpel_16_mc23_8_c: 1629.9 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2: 278.0 ( 5.86x)
avg_h264_qpel_16_mc23_8_ssse3: 291.0 ( 5.60x)
avg_h264_qpel_16_mc32_8_c: 1752.1 ( 1.00x)
avg_h264_qpel_16_mc32_8_sse2: 193.7 ( 9.05x)
avg_h264_qpel_16_mc32_8_ssse3: 203.4 ( 8.61x)
put_h264_qpel_8_mc12_8_c: 421.8 ( 1.00x)
put_h264_qpel_8_mc12_8_sse2: 61.7 ( 6.83x)
put_h264_qpel_8_mc12_8_ssse3: 67.2 ( 6.28x)
put_h264_qpel_8_mc21_8_c: 396.8 ( 1.00x)
put_h264_qpel_8_mc21_8_sse2: 85.4 ( 4.65x)
put_h264_qpel_8_mc21_8_ssse3: 91.6 ( 4.33x)
put_h264_qpel_8_mc22_8_c: 234.1 ( 1.00x)
put_h264_qpel_8_mc22_8_sse2: 54.4 ( 4.30x)
put_h264_qpel_8_mc22_8_ssse3: 60.2 ( 3.89x)
put_h264_qpel_8_mc23_8_c: 399.2 ( 1.00x)
put_h264_qpel_8_mc23_8_sse2: 85.5 ( 4.67x)
put_h264_qpel_8_mc23_8_ssse3: 91.8 ( 4.35x)
put_h264_qpel_8_mc32_8_c: 422.2 ( 1.00x)
put_h264_qpel_8_mc32_8_sse2: 61.8 ( 6.83x)
put_h264_qpel_8_mc32_8_ssse3: 67.0 ( 6.30x)
put_h264_qpel_16_mc12_8_c: 1720.3 ( 1.00x)
put_h264_qpel_16_mc12_8_sse2: 189.9 ( 9.06x)
put_h264_qpel_16_mc12_8_ssse3: 199.9 ( 8.61x)
put_h264_qpel_16_mc21_8_c: 1624.5 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2: 285.4 ( 5.69x)
put_h264_qpel_16_mc21_8_ssse3: 296.4 ( 5.48x)
put_h264_qpel_16_mc22_8_c: 963.9 ( 1.00x)
put_h264_qpel_16_mc22_8_sse2: 170.1 ( 5.67x)
put_h264_qpel_16_mc22_8_ssse3: 186.4 ( 5.17x)
put_h264_qpel_16_mc23_8_c: 1613.5 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2: 274.6 ( 5.88x)
put_h264_qpel_16_mc23_8_ssse3: 300.4 ( 5.37x)
put_h264_qpel_16_mc32_8_c: 1735.9 ( 1.00x)
put_h264_qpel_16_mc32_8_sse2: 189.6 ( 9.15x)
put_h264_qpel_16_mc32_8_ssse3: 199.5 ( 8.70x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index af031fe2e9..f4082e2242 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -65,7 +65,7 @@ void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t
*dst, const uint8_t *
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const
uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t
*dst, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src,
int16_t *tmp, ptrdiff_t srcStride, int size);\
-void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst,
int16_t *tmp, ptrdiff_t dstStride, int h);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_sse2(uint8_t *dst,
int16_t *tmp, ptrdiff_t dstStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t
*tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t
*tmp, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t
*src16, const uint8_t *src8, ptrdiff_t dstStride);\
@@ -89,7 +89,7 @@ static av_always_inline void ff_ ## OPNAME ##
h264_qpel8or16_hv2_lowpass_ ## MMX
{\
int w = size>>4;\
do{\
- ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride,
size);\
+ ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_sse2(dst, tmp, dstStride,
size);\
tmp += 8;\
dst += 8;\
}while(w--);\
diff --git a/libavcodec/x86/h264_qpel_8bit.asm
b/libavcodec/x86/h264_qpel_8bit.asm
index c66a9bda40..39a387b4bb 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -613,36 +613,26 @@ cglobal put_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src,
tmp, srcStride, size
%macro QPEL8OR16_HV2_LOWPASS_OP 1
-cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4 ; dst, tmp, dstStride, h
+cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4,6 ; dst, tmp, dstStride, h
.loop:
mova m0, [r1]
- mova m3, [r1+8]
- mova m1, [r1+2]
- mova m4, [r1+10]
- paddw m0, m4
- paddw m1, m3
- paddw m3, [r1+18]
- paddw m4, [r1+16]
- mova m2, [r1+4]
- mova m5, [r1+12]
- paddw m2, [r1+6]
- paddw m5, [r1+14]
+ movu m1, [r1+2]
+ movu m3, [r1+10]
+ movu m4, [r1+8]
+ movu m2, [r1+4]
+ movu m5, [r1+6]
+ paddw m0, m3
+ paddw m1, m4
psubw m0, m1
- psubw m3, m4
psraw m0, 2
- psraw m3, 2
+ paddw m2, m5
psubw m0, m1
- psubw m3, m4
paddsw m0, m2
- paddsw m3, m5
psraw m0, 2
- psraw m3, 2
paddw m0, m2
- paddw m3, m5
psraw m0, 6
- psraw m3, 6
- packuswb m0, m3
- op_%1 m0, [r0], m7
+ packuswb m0, m0
+ op_%1h m0, [r0], m5
add r1, 48
add r0, r2
dec r3d
@@ -650,7 +640,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4 ; dst, tmp,
dstStride, h
RET
%endmacro
-INIT_MMX mmxext
+INIT_XMM sse2
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
commit 617c042093129758a53b01e6fc400356d8f4e566
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Oct 4 10:21:38 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Oct 7 18:06:40 2025 +0200
avcodec/x86/h264_qpel_8bit: Avoid doing unnecessary work
Signed-off-by: Andreas Rheinhardt <[email protected]>
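The diff below gives the FILT_V and FILT_HV macros an optional "last"
argument so that their final invocation skips the register copy and pointer
advance whose results no later step consumes. A rough, hypothetical C
analogue of specializing the last unrolled iteration in this way:

    #include <stddef.h>
    #include <stdint.h>

    static void filt_row(uint8_t *dst, const uint8_t *src)
    {
        dst[0] = src[0];                 /* stand-in for the real filter step */
    }

    static void filt_4rows(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
    {
        filt_row(dst, src); dst += stride; src += stride;
        filt_row(dst, src); dst += stride; src += stride;
        filt_row(dst, src); dst += stride; src += stride;
        filt_row(dst, src);              /* last row: no dead pointer updates */
    }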
diff --git a/libavcodec/x86/h264_qpel_8bit.asm
b/libavcodec/x86/h264_qpel_8bit.asm
index 64c91ba63a..c66a9bda40 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -387,8 +387,12 @@ QPEL8_H_LOWPASS_L2_OP_XMM avg
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
-%macro FILT_V 1
+%macro FILT_V 1-2
+%ifnidn %2, last
mova m6, m2
+%else
+ SWAP 6, 2
+%endif
movh m5, [r1]
paddw m6, m3
psllw m6, 2
@@ -403,7 +407,9 @@ QPEL8_H_LOWPASS_L2_OP_XMM avg
psraw m6, 5
packuswb m6, m6
op_%1h m6, [r0], m0 ; 1
+%ifnidn %2, last
add r0, r2
+%endif
SWAP 0, 1, 2, 3, 4, 5
%endmacro
@@ -428,7 +434,7 @@ cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride,
srcStride
FILT_V %1
FILT_V %1
FILT_V %1
- FILT_V %1
+ FILT_V %1, last
RET
%endmacro
@@ -473,7 +479,7 @@ cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src,
dstStride, srcStride, h
FILT_V %1
FILT_V %1
FILT_V %1
- FILT_V %1
+ FILT_V %1, last
.end:
RET
%endmacro
@@ -485,8 +491,12 @@ QPEL8OR16_V_LOWPASS_OP avg
; All functions that use this are required to have args:
; src, tmp, srcSize
-%macro FILT_HV 1 ; offset
+%macro FILT_HV 1-2 ; offset, last
+%ifnidn %2, last
mova m6, m2
+%else
+ SWAP 2, 6
+%endif
movh m5, [r0]
paddw m6, m3
psllw m6, 2
@@ -496,7 +506,9 @@ QPEL8OR16_V_LOWPASS_OP avg
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, m5
+%ifnidn %2, last
add r0, r2
+%endif
paddw m6, m0
mova [r1+%1], m6
SWAP 0, 1, 2, 3, 4, 5
@@ -524,7 +536,7 @@ cglobal put_h264_qpel4_hv_lowpass_v, 3,5 ; src, tmp,
srcStride
FILT_HV 0*24
FILT_HV 1*24
FILT_HV 2*24
- FILT_HV 3*24
+ FILT_HV 3*24, last
add r3, 4
add r1, 8
mov r0, r3
@@ -595,7 +607,7 @@ cglobal put_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src,
tmp, srcStride, size
FILT_HV 12*48
FILT_HV 13*48
FILT_HV 14*48
- FILT_HV 15*48
+ FILT_HV 15*48, last
.end:
RET
commit 29f439077a1b7a0a93832fdaefddcc0b7577c5e7
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Oct 4 10:15:09 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Oct 7 18:06:40 2025 +0200
avcodec/h264_qpel: Move loop into qpel4_hv_lowpass_v_mmxext()
Every caller calls it three times in a loop, with slightly
modified arguments. So it makes sense to move the loop
into the callee.
Signed-off-by: Andreas Rheinhardt <[email protected]>
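In C terms the change looks roughly like the sketch below (hypothetical
names, trivial stand-in body): each caller used to iterate over the three
4-pixel-wide columns itself; now a single call suffices because the helper
loops internally, as the asm hunk further down shows.

    #include <stddef.h>
    #include <stdint.h>

    /* Stand-in for the vertical pass over one 4-wide column. */
    static void lowpass_v_column(const uint8_t *src, int16_t *tmp,
                                 ptrdiff_t srcStride)
    {
        for (int y = 0; y < 4; y++)
            tmp[y * 4] = src[y * srcStride];
    }

    /* Old caller: loops over the columns itself. */
    static void hv_lowpass_old(const uint8_t *src, int16_t *tmp,
                               ptrdiff_t srcStride)
    {
        for (int w = 0; w < 3; w++, src += 4, tmp += 4)
            lowpass_v_column(src, tmp, srcStride);
    }

    /* New helper: the column loop lives in the callee ... */
    static void lowpass_v_all(const uint8_t *src, int16_t *tmp,
                              ptrdiff_t srcStride)
    {
        for (int w = 0; w < 3; w++, src += 4, tmp += 4)
            lowpass_v_column(src, tmp, srcStride);
    }

    /* ... so the caller collapses to a single call. */
    static void hv_lowpass_new(const uint8_t *src, int16_t *tmp,
                               ptrdiff_t srcStride)
    {
        lowpass_v_all(src, tmp, srcStride);
    }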
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 45e0878e57..af031fe2e9 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -80,14 +80,8 @@ DEF_QPEL(put)
#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t
*dst, int16_t *tmp, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t
srcStride)\
{\
- int w=3;\
src -= 2*srcStride+2;\
- while(w--){\
- ff_put_h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
- tmp += 4;\
- src += 4;\
- }\
- tmp -= 3*4;\
+ ff_put_h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
diff --git a/libavcodec/x86/h264_qpel_8bit.asm
b/libavcodec/x86/h264_qpel_8bit.asm
index a610a831db..64c91ba63a 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -503,8 +503,11 @@ QPEL8OR16_V_LOWPASS_OP avg
%endmacro
INIT_MMX mmxext
-cglobal put_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
+cglobal put_h264_qpel4_hv_lowpass_v, 3,5 ; src, tmp, srcStride
+ mov r4d, 3
+ mov r3, r0
pxor m7, m7
+.loop:
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
@@ -522,6 +525,11 @@ cglobal put_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp,
srcStride
FILT_HV 1*24
FILT_HV 2*24
FILT_HV 3*24
+ add r3, 4
+ add r1, 8
+ mov r0, r3
+ dec r4d
+ jnz .loop
RET
%macro QPEL4_HV1_LOWPASS_OP 1
commit 4539f7e4d4953fbf2086cac0e54575627f7df1e8
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Oct 4 09:19:23 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Oct 7 18:06:40 2025 +0200
avcodec/x86/h264_qpel_8bit: Don't duplicate qpel4_hv_lowpass_v_mmxext
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 636be54530..45e0878e57 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -63,7 +63,6 @@ void ff_ ## OPNAME ## _h264_qpel16_h_lowpass_l2_sse2(uint8_t
*dst, const uint8_t
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const
uint8_t *src, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t
*src, ptrdiff_t dstStride, ptrdiff_t srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const
uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h);\
-void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src,
int16_t *tmp, ptrdiff_t srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t
*dst, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src,
int16_t *tmp, ptrdiff_t srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst,
int16_t *tmp, ptrdiff_t dstStride, int h);\
@@ -73,6 +72,8 @@ void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst,
const int16_t *src
void ff_ ## OPNAME ## _pixels8_l2_shift5_sse2(uint8_t *dst, const int16_t
*src16, const uint8_t *src8, ptrdiff_t dstStride);\
void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t
*src16, const uint8_t *src8, ptrdiff_t dstStride);\
+void ff_put_h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp,
ptrdiff_t srcStride);
+
DEF_QPEL(avg)
DEF_QPEL(put)
@@ -82,7 +83,7 @@ static av_always_inline void OPNAME ## h264_qpel4_hv_lowpass_
## MMX(uint8_t *ds
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
- ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
+ ff_put_h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
tmp += 4;\
src += 4;\
}\
diff --git a/libavcodec/x86/h264_qpel_8bit.asm
b/libavcodec/x86/h264_qpel_8bit.asm
index bbf591664a..a610a831db 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -502,8 +502,8 @@ QPEL8OR16_V_LOWPASS_OP avg
SWAP 0, 1, 2, 3, 4, 5
%endmacro
-%macro QPEL4_HV1_LOWPASS_OP 1
-cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
+INIT_MMX mmxext
+cglobal put_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
pxor m7, m7
movh m0, [r0]
movh m1, [r0+r2]
@@ -524,6 +524,7 @@ cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp,
srcStride
FILT_HV 3*24
RET
+%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
mov r3d, 4
.loop:
commit 3e2d9b73c13292a324d8846dc49d807b59224612
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Oct 4 08:43:21 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Oct 7 18:06:40 2025 +0200
avcodec/h264qpel: Move Snow-only code to snow.c
Blocksize 2 is Snow-only, so move all the code pertaining
to it to snow.c. Also make the put array in H264QpelContext
smaller -- it only needs three sets of 16 function pointers.
This continues 6eb8bc42176f73c1d7c2e9f4bc1ab988f7149de5
and b0c91c2fba82f98dfe7a70f2591ec7a2126820c0.
Signed-off-by: Andreas Rheinhardt <[email protected]>
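One detail of the move, visible in the snow.h hunk below, is how snow.c
keeps the H.264 sizes 16/8/4 while adding its own size-2 functions: an
anonymous union lets a four-row Snow table alias the shrunken
H264QpelContext, with a static_assert guarding the layout assumption. A
stripped-down, hypothetical sketch of that pattern (not the real structs):

    #include <assert.h>
    #include <stddef.h>

    typedef void (*mc_func)(unsigned char *dst, const unsigned char *src);

    typedef struct SmallQpelCtx {
        mc_func put_tab[3][16];          /* sizes 16, 8, 4 */
    } SmallQpelCtx;

    typedef struct SnowLikeCtx {
        union {
            mc_func put_tab[4][16];      /* sizes 16, 8, 4, 2 */
            SmallQpelCtx qpel;           /* rows 0-2 alias the small table */
        };
    } SnowLikeCtx;

    /* The aliasing only works if the table is the first member. */
    static_assert(offsetof(SmallQpelCtx, put_tab) == 0,
                  "put_tab must sit at the start of SmallQpelCtx");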
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index be80203c4b..0bc715c638 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -26,65 +26,6 @@
#define pixeltmp int16_t
#define BIT_DEPTH 8
#include "h264qpel_template.c"
-
-static void put_h264_qpel2_h_lowpass_8(uint8_t *dst, const uint8_t *restrict
src, int dstStride, int srcStride)
-{
- const int h = 2;
- for (int i = 0; i < h; ++i) {
- dst[0] = av_clip_uint8(((src[0]+src[1])*20 - (src[-1]+src[2])*5 +
(src[-2]+src[3]) + 16) >> 5);
- dst[1] = av_clip_uint8(((src[1]+src[2])*20 - (src[0 ]+src[3])*5 +
(src[-1]+src[4]) + 16) >> 5);
- dst += dstStride;
- src += srcStride;
- }
-}
-
-static void put_h264_qpel2_v_lowpass_8(uint8_t *dst, const uint8_t *restrict
src, int dstStride, int srcStride)
-{
- const int w = 2;
- for (int i = 0; i < w; ++i) {
- const int srcB = src[-2*srcStride];
- const int srcA = src[-1*srcStride];
- const int src0 = src[0 *srcStride];
- const int src1 = src[1 *srcStride];
- const int src2 = src[2 *srcStride];
- const int src3 = src[3 *srcStride];
- const int src4 = src[4 *srcStride];
- dst[0*dstStride] = av_clip_uint8(((src0+src1)*20 - (srcA+src2)*5 +
(srcB+src3) + 16) >> 5);
- dst[1*dstStride] = av_clip_uint8(((src1+src2)*20 - (src0+src3)*5 +
(srcA+src4) + 16) >> 5);
- dst++;
- src++;
- }
-}
-
-static void put_h264_qpel2_hv_lowpass_8(uint8_t *dst, pixeltmp *tmp, const
uint8_t *restrict src, int dstStride, int tmpStride, int srcStride)
-{
- const int h = 2;
- const int w = 2;
- src -= 2*srcStride;
- for (int i = 0; i < h + 5; ++i) {
- tmp[0] = (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);
- tmp[1] = (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);
- tmp += tmpStride;
- src += srcStride;
- }
- tmp -= tmpStride*(h+5-2);
- for (int i = 0; i < w; ++i) {
- const int tmpB = tmp[-2*tmpStride];
- const int tmpA = tmp[-1*tmpStride];
- const int tmp0 = tmp[0 *tmpStride];
- const int tmp1 = tmp[1 *tmpStride];
- const int tmp2 = tmp[2 *tmpStride];
- const int tmp3 = tmp[3 *tmpStride];
- const int tmp4 = tmp[4 *tmpStride];
- dst[0*dstStride] = av_clip_uint8(((tmp0+tmp1)*20 - (tmpA+tmp2)*5 +
(tmpB+tmp3) + 512) >> 10);
- dst[1*dstStride] = av_clip_uint8(((tmp1+tmp2)*20 - (tmp0+tmp3)*5 +
(tmpA+tmp4) + 512) >> 10);
- dst++;
- tmp++;
- }
-}
-
-H264_MC(put_, 2)
-
#undef BIT_DEPTH
#define BIT_DEPTH 9
@@ -140,7 +81,6 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int
bit_depth)
switch (bit_depth) {
default:
SET_QPEL(8);
- dspfunc2(put_h264_qpel, 3, 2, 8); // only used by Snow
break;
case 9:
SET_QPEL(9);
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index 6ae5ba1724..f198a9483c 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -25,7 +25,7 @@
#include "qpeldsp.h"
typedef struct H264QpelContext {
- qpel_mc_func put_h264_qpel_pixels_tab[4][16];
+ qpel_mc_func put_h264_qpel_pixels_tab[3][16];
qpel_mc_func avg_h264_qpel_pixels_tab[3][16];
} H264QpelContext;
diff --git a/libavcodec/h264qpel_template.c b/libavcodec/h264qpel_template.c
index a55b45e824..875ac86d15 100644
--- a/libavcodec/h264qpel_template.c
+++ b/libavcodec/h264qpel_template.c
@@ -304,134 +304,134 @@ static void FUNC(OPNAME ##
h264_qpel16_hv_lowpass)(uint8_t *dst, pixeltmp *tmp,
FUNC(OPNAME ## h264_qpel8_hv_lowpass)(dst+8*sizeof(pixel), tmp+8,
src+8*sizeof(pixel), dstStride, tmpStride, srcStride);\
}\
-#define H264_MC(OPNAME, SIZE) \
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc00)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+#define H264_MC(OPNAME, NAME, SIZE) \
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc00)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
FUNCC(OPNAME ## pixels ## SIZE)(dst, src, stride, SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc10)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc10)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t half[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(half, src, SIZE*sizeof(pixel),
stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(half, src,
SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, src, half, stride, stride,
SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc20)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc20)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
- FUNC(OPNAME ## h264_qpel ## SIZE ## _h_lowpass)(dst, src, stride, stride);\
+ FUNC(OPNAME ## NAME ## _qpel ## SIZE ## _h_lowpass)(dst, src, stride,
stride);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc30)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc30)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t half[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(half, src, SIZE*sizeof(pixel),
stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(half, src,
SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, src+sizeof(pixel), half,
stride, stride, SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc01)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc01)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t half[SIZE*SIZE*sizeof(pixel)];\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel),
stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(half, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(half, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, full_mid, half, stride,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc02)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc02)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel),
stride, SIZE + 5);\
- FUNC(OPNAME ## h264_qpel ## SIZE ## _v_lowpass)(dst, full_mid, stride,
SIZE*sizeof(pixel));\
+ FUNC(OPNAME ## NAME ## _qpel ## SIZE ## _v_lowpass)(dst, full_mid, stride,
SIZE*sizeof(pixel));\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc03)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc03)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t half[SIZE*SIZE*sizeof(pixel)];\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel),
stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(half, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(half, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, full_mid+SIZE*sizeof(pixel),
half, stride, SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc11)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc11)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel),
stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src,
SIZE*sizeof(pixel), stride);\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel),
stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfV, stride,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc31)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc31)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel),
stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src,
SIZE*sizeof(pixel), stride);\
FUNC(copy_block ## SIZE )(full, src - stride*2 + sizeof(pixel),
SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfV, stride,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc13)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc13)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src + stride,
SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src + stride,
SIZE*sizeof(pixel), stride);\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel),
stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfV, stride,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc33)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc33)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src + stride,
SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src + stride,
SIZE*sizeof(pixel), stride);\
FUNC(copy_block ## SIZE )(full, src - stride*2 + sizeof(pixel),
SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfV, stride,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc22)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc22)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
- FUNC(OPNAME ## h264_qpel ## SIZE ## _hv_lowpass)(dst, tmp, src, stride,
SIZE*sizeof(pixel), stride);\
+ FUNC(OPNAME ## NAME ## _qpel ## SIZE ## _hv_lowpass)(dst, tmp, src,
stride, SIZE*sizeof(pixel), stride);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc21)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc21)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel),
stride);\
- FUNC(put_h264_qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src,
SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfHV, stride,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc23)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc23)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
- FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src + stride,
SIZE*sizeof(pixel), stride);\
- FUNC(put_h264_qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _h_lowpass)(halfH, src + stride,
SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfH, halfHV, stride,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc12)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc12)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
@@ -439,12 +439,12 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ##
_mc12)(uint8_t *dst, const uint
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel),
stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
- FUNC(put_h264_qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfV, halfHV, stride,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
\
-static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc32)(uint8_t *dst, const
uint8_t *restrict src, ptrdiff_t stride)\
+static void FUNCC(OPNAME ## NAME ## _qpel ## SIZE ## _mc32)(uint8_t *dst,
const uint8_t *restrict src, ptrdiff_t stride)\
{\
uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
@@ -452,8 +452,8 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ##
_mc32)(uint8_t *dst, const uint
uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
FUNC(copy_block ## SIZE )(full, src - stride*2 + sizeof(pixel),
SIZE*sizeof(pixel), stride, SIZE + 5);\
- FUNC(put_h264_qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
- FUNC(put_h264_qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _v_lowpass)(halfV, full_mid,
SIZE*sizeof(pixel), SIZE*sizeof(pixel));\
+ FUNC(put_ ## NAME ## _qpel ## SIZE ## _hv_lowpass)(halfHV, tmp, src,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), stride);\
FUNC(OPNAME ## pixels ## SIZE ## _l2)(dst, halfV, halfHV, stride,
SIZE*sizeof(pixel), SIZE*sizeof(pixel), SIZE);\
}\
@@ -463,14 +463,16 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ##
_mc32)(uint8_t *dst, const uint
#define op2_avg(a, b) a = (((a)+CLIP(((b) + 512)>>10)+1)>>1)
#define op2_put(a, b) a = CLIP(((b) + 512)>>10)
+#ifndef SNOW
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
-H264_MC(put_, 4)
-H264_MC(put_, 8)
-H264_MC(put_, 16)
-H264_MC(avg_, 4)
-H264_MC(avg_, 8)
-H264_MC(avg_, 16)
+H264_MC(put_, h264, 4)
+H264_MC(put_, h264, 8)
+H264_MC(put_, h264, 16)
+H264_MC(avg_, h264, 4)
+H264_MC(avg_, h264, 8)
+H264_MC(avg_, h264, 16)
+#endif
#undef op_avg
#undef op_put
diff --git a/libavcodec/pel_template.c b/libavcodec/pel_template.c
index 6da7a56b2d..7de3db72f1 100644
--- a/libavcodec/pel_template.c
+++ b/libavcodec/pel_template.c
@@ -66,7 +66,7 @@ static inline void FUNCC(OPNAME ## _pixels8)(uint8_t *block,
\
block += line_size; \
} \
} \
- \
+av_unused \
CALL_2X_PIXELS(FUNCC(OPNAME ## _pixels16), \
FUNCC(OPNAME ## _pixels8), \
8 * sizeof(pixel))
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
index e0ce83eb9c..006d84d8ce 100644
--- a/libavcodec/snow.c
+++ b/libavcodec/snow.c
@@ -18,6 +18,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <assert.h>
+
#include "libavutil/log.h"
#include "libavutil/mem.h"
#include "libavutil/thread.h"
@@ -26,6 +28,91 @@
#include "snow.h"
#include "snowdata.h"
+#define pixeltmp int16_t
+#define BIT_DEPTH 8
+#define SNOW
+#include "h264qpel_template.c"
+
+static void put_snow_qpel2_h_lowpass_8(uint8_t *dst, const uint8_t *restrict
src, int dstStride, int srcStride)
+{
+ const int h = 2;
+ for (int i = 0; i < h; ++i) {
+ dst[0] = av_clip_uint8(((src[0]+src[1])*20 - (src[-1]+src[2])*5 +
(src[-2]+src[3]) + 16) >> 5);
+ dst[1] = av_clip_uint8(((src[1]+src[2])*20 - (src[0 ]+src[3])*5 +
(src[-1]+src[4]) + 16) >> 5);
+ dst += dstStride;
+ src += srcStride;
+ }
+}
+
+static void put_snow_qpel2_v_lowpass_8(uint8_t *dst, const uint8_t *restrict
src, int dstStride, int srcStride)
+{
+ const int w = 2;
+ for (int i = 0; i < w; ++i) {
+ const int srcB = src[-2*srcStride];
+ const int srcA = src[-1*srcStride];
+ const int src0 = src[0 *srcStride];
+ const int src1 = src[1 *srcStride];
+ const int src2 = src[2 *srcStride];
+ const int src3 = src[3 *srcStride];
+ const int src4 = src[4 *srcStride];
+ dst[0*dstStride] = av_clip_uint8(((src0+src1)*20 - (srcA+src2)*5 +
(srcB+src3) + 16) >> 5);
+ dst[1*dstStride] = av_clip_uint8(((src1+src2)*20 - (src0+src3)*5 +
(srcA+src4) + 16) >> 5);
+ dst++;
+ src++;
+ }
+}
+
+static void put_snow_qpel2_hv_lowpass_8(uint8_t *dst, pixeltmp *tmp, const
uint8_t *restrict src, int dstStride, int tmpStride, int srcStride)
+{
+ const int h = 2;
+ const int w = 2;
+ src -= 2*srcStride;
+ for (int i = 0; i < h + 5; ++i) {
+ tmp[0] = (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);
+ tmp[1] = (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);
+ tmp += tmpStride;
+ src += srcStride;
+ }
+ tmp -= tmpStride*(h+5-2);
+ for (int i = 0; i < w; ++i) {
+ const int tmpB = tmp[-2*tmpStride];
+ const int tmpA = tmp[-1*tmpStride];
+ const int tmp0 = tmp[0 *tmpStride];
+ const int tmp1 = tmp[1 *tmpStride];
+ const int tmp2 = tmp[2 *tmpStride];
+ const int tmp3 = tmp[3 *tmpStride];
+ const int tmp4 = tmp[4 *tmpStride];
+ dst[0*dstStride] = av_clip_uint8(((tmp0+tmp1)*20 - (tmpA+tmp2)*5 +
(tmpB+tmp3) + 512) >> 10);
+ dst[1*dstStride] = av_clip_uint8(((tmp1+tmp2)*20 - (tmp0+tmp3)*5 +
(tmpA+tmp4) + 512) >> 10);
+ dst++;
+ tmp++;
+ }
+}
+
+H264_MC(put_, snow, 2)
+
+static av_cold void init_qpel(SnowContext *const s)
+{
+ static_assert(offsetof(H264QpelContext, put_h264_qpel_pixels_tab) == 0,
+ "put_h264_qpel_pixels_tab not at start of H264QpelContext");
+ ff_h264qpel_init(&s->h264qpel, 8);
+ s->put_snow_qpel_pixels_tab[3][0] = put_snow_qpel2_mc00_8_c;
+ s->put_snow_qpel_pixels_tab[3][1] = put_snow_qpel2_mc10_8_c;
+ s->put_snow_qpel_pixels_tab[3][2] = put_snow_qpel2_mc20_8_c;
+ s->put_snow_qpel_pixels_tab[3][3] = put_snow_qpel2_mc30_8_c;
+ s->put_snow_qpel_pixels_tab[3][4] = put_snow_qpel2_mc01_8_c;
+ s->put_snow_qpel_pixels_tab[3][5] = put_snow_qpel2_mc11_8_c;
+ s->put_snow_qpel_pixels_tab[3][6] = put_snow_qpel2_mc21_8_c;
+ s->put_snow_qpel_pixels_tab[3][7] = put_snow_qpel2_mc31_8_c;
+ s->put_snow_qpel_pixels_tab[3][8] = put_snow_qpel2_mc02_8_c;
+ s->put_snow_qpel_pixels_tab[3][9] = put_snow_qpel2_mc12_8_c;
+ s->put_snow_qpel_pixels_tab[3][10] = put_snow_qpel2_mc22_8_c;
+ s->put_snow_qpel_pixels_tab[3][11] = put_snow_qpel2_mc32_8_c;
+ s->put_snow_qpel_pixels_tab[3][12] = put_snow_qpel2_mc03_8_c;
+ s->put_snow_qpel_pixels_tab[3][13] = put_snow_qpel2_mc13_8_c;
+ s->put_snow_qpel_pixels_tab[3][14] = put_snow_qpel2_mc23_8_c;
+ s->put_snow_qpel_pixels_tab[3][15] = put_snow_qpel2_mc33_8_c;
+}
void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride,
uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride,
slice_buffer * sb, int add, uint8_t * dst8){
@@ -354,18 +441,18 @@ void ff_snow_pred_block(SnowContext *s, uint8_t *dst,
uint8_t *tmp, ptrdiff_t st
else if(b_w==32){
int y;
for(y=0; y<b_h; y+=16){
- s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst +
y*stride, src + 3 + (y+3)*stride,stride);
- s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 +
y*stride, src + 19 + (y+3)*stride,stride);
+ s->put_snow_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src
+ 3 + (y+3)*stride,stride);
+ s->put_snow_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 +
y*stride, src + 19 + (y+3)*stride,stride);
}
}else if(b_w==b_h)
- s->h264qpel.put_h264_qpel_pixels_tab[tab_index
][dy+(dx>>2)](dst,src + 3 + 3*stride,stride);
+ s->put_snow_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst,src + 3 +
3*stride,stride);
else if(b_w==2*b_h){
- s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst
,src + 3 + 3*stride,stride);
-
s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 3 +
b_h + 3*stride,stride);
+ s->put_snow_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst ,src +
3 + 3*stride,stride);
+ s->put_snow_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src +
3 + b_h + 3*stride,stride);
}else{
av_assert2(2*b_w==b_h);
- s->h264qpel.put_h264_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst
,src + 3 + 3*stride ,stride);
- s->h264qpel.put_h264_qpel_pixels_tab[tab_index
][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
+ s->put_snow_qpel_pixels_tab[tab_index ][dy+(dx>>2)](dst
,src + 3 + 3*stride ,stride);
+ s->put_snow_qpel_pixels_tab[tab_index
][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
}
}
}
@@ -404,7 +491,8 @@ av_cold int ff_snow_common_init(AVCodecContext *avctx){
ff_videodsp_init(&s->vdsp, 8);
ff_dwt_init(&s->dwt);
- ff_h264qpel_init(&s->h264qpel, 8);
+
+ init_qpel(s);
#define mcfh(dx,dy)\
s->hdsp.put_pixels_tab [0][dy/4+dx/8]=\
diff --git a/libavcodec/snow.h b/libavcodec/snow.h
index 9b19e70bd5..83dc6c1256 100644
--- a/libavcodec/snow.h
+++ b/libavcodec/snow.h
@@ -116,7 +116,11 @@ typedef struct SnowContext{
RangeCoder c;
HpelDSPContext hdsp;
VideoDSPContext vdsp;
- H264QpelContext h264qpel;
+ union {
+ /// everything except size 2 are from H.264
+ qpel_mc_func put_snow_qpel_pixels_tab[4][16];
+ H264QpelContext h264qpel;
+ };
SnowDWTContext dwt;
AVFrame *input_picture; ///< new_picture with the internal
linesizes
AVFrame *current_picture;
diff --git a/tests/checkasm/h264qpel.c b/tests/checkasm/h264qpel.c
index 7387c2510a..6f4a021faf 100644
--- a/tests/checkasm/h264qpel.c
+++ b/tests/checkasm/h264qpel.c
@@ -64,7 +64,7 @@ void checkasm_check_h264qpel(void)
for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
ff_h264qpel_init(&h, bit_depth);
- for (i = 0; i < (op || bit_depth != 8 ? 3 : 4); i++) {
+ for (i = 0; i < 3; i++) {
int size = 16 >> i;
for (j = 0; j < 16; j++)
if (check_func(tab[i][j], "%s_h264_qpel_%d_mc%d%d_%d",
op_name, size, j & 3, j >> 2, bit_depth)) {
commit 15a4289b79d8c2e2453921544f7983248ef498bd
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Oct 4 07:29:35 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Oct 7 18:06:40 2025 +0200
avcodec/x86/h264_qpel_8bit: Improve register allocation
None of the other registers need to be preserved at this time,
so six XMM registers are always enough. Forgotten in
fa9ea5113b48904daef9df6a282bd9c04c32258d.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_qpel_8bit.asm
b/libavcodec/x86/h264_qpel_8bit.asm
index ede4f382e1..bbf591664a 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -634,11 +634,7 @@ QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
-%ifidn %1, avg
-cglobal %1_h264_qpel8_hv2_lowpass, 3,4,7 ; dst, tmp, dstStride
-%else
cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
-%endif
mov r3d, 8
.loop:
mova m1, [r1+16]
@@ -663,7 +659,7 @@ cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp,
dstStride
paddw m0, m2
psraw m0, 6
packuswb m0, m0
- op_%1h m0, [r0], m6
+ op_%1h m0, [r0], m5
add r1, 48
add r0, r2
dec r3d
-----------------------------------------------------------------------
Summary of changes:
libavcodec/h264qpel.c | 60 ---------------
libavcodec/h264qpel.h | 2 +-
libavcodec/h264qpel_template.c | 94 +++++++++++------------
libavcodec/pel_template.c | 2 +-
libavcodec/snow.c | 104 +++++++++++++++++++++++--
libavcodec/snow.h | 6 +-
libavcodec/x86/h264_qpel.c | 154 ++++++++++++++------------------------
libavcodec/x86/h264_qpel_8bit.asm | 116 ++++++++++++++++------------
tests/checkasm/h264qpel.c | 2 +-
9 files changed, 280 insertions(+), 260 deletions(-)
hooks/post-receive