+void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width); /* NEON qpel "h" variants writing int16_t *dst; installed into c->put_hevc_qpel[n][0][1] by ff_hevc_dsp_init_aarch64 */
+void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width); /* also installed for index 6 */
+void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width); /* also installed for indices 7, 8 and 9 */
+void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
+ int width); /* "uni" variants: write uint8_t *_dst using _dststride; installed into c->put_hevc_qpel_uni[n][0][1] by ff_hevc_dsp_init_aarch64 */
+void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
+ int width);
+void ff_hevc_put_hevc_qpel_uni_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
+ int width);
+void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
+ my, int width); /* also installed for index 6 */
+void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
+ my, int width); /* also installed for indices 7, 8 and 9 */
+void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width); /* "bi" variants: take an extra int16_t *src2 input; installed into c->put_hevc_qpel_bi[n][0][1] by ff_hevc_dsp_init_aarch64 */
+void ff_hevc_put_hevc_qpel_bi_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width); /* also installed for index 6 */
+void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width); /* also installed for indices 7, 8 and 9 */
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
@@ -95,6 +135,33 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->sao_edge_filter[2] =
c->sao_edge_filter[3] =
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_16x16_8_neon;
+ c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_neon;
+ c->put_hevc_qpel[2][0][1] = ff_hevc_put_hevc_qpel_h6_8_neon;
+ c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_neon;
+ c->put_hevc_qpel[4][0][1] =
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h12_8_neon;
+ c->put_hevc_qpel[5][0][1] =
+ c->put_hevc_qpel[7][0][1] =
+ c->put_hevc_qpel[8][0][1] =
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h16_8_neon;
+ c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
+ c->put_hevc_qpel_uni[2][0][1] = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
+ c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
+ c->put_hevc_qpel_uni[4][0][1] =
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
+ c->put_hevc_qpel_uni[5][0][1] =
+ c->put_hevc_qpel_uni[7][0][1] =
+ c->put_hevc_qpel_uni[8][0][1] =
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
+ c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
+ c->put_hevc_qpel_bi[2][0][1] = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
+ c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
+ c->put_hevc_qpel_bi[4][0][1] =
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
+ c->put_hevc_qpel_bi[5][0][1] =
+ c->put_hevc_qpel_bi[7][0][1] =
+ c->put_hevc_qpel_bi[8][0][1] =
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
}
if (bit_depth == 10) {
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..7974b8529e
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,484 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * Copyright (c) 2022 J. Dekker <j...@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+const qpel_filters, align=4
+ .byte 0, 0, 0, 0, 0, 0, 0, 0 // coefficient table: 4 rows of 8 signed byte taps, row selected by load_filter (index * 8 bytes); row 0 is all zero
+ .byte -1, 4,-10, 58, 17, -5, 1, 0 // row 1: taps sum to 64
+ .byte -1, 4,-11, 40, 40,-11, 4, -1 // row 2: symmetric, taps sum to 64
+ .byte 0, 1, -5, 17, 58,-10, 4, -1 // row 3: row 1 in reverse order
+endconst
+
+.macro load_filter m
+ movrel x15, qpel_filters // load one qpel_filters row into v0 as 16-bit taps; movrel is defined outside this file, presumably yielding the table address in x15
+ add x15, x15, \m, lsl #3 // step to row m: each row is 8 bytes, so offset = m * 8 (m must be an X register)
+ ld1 {v0.8b}, [x15] // read the 8 byte-sized taps of that row
+ sxtl v0.8h, v0.8b // sign-extend to 8 x int16 so taps are addressable as v0.h[0..7]; x15 and v0 are overwritten
+.endm
+
+.macro put_hevc type
+.ifc \type, qpel
+ // void put_hevc_qpel_h(int16_t *dst,
+ // uint8_t *_src, ptrdiff_t _srcstride,
+ // int height, intptr_t mx, intptr_t my, int width)
+ dst .req x0
+ dststride .req x7
+ src .req x1
+ srcstride .req x2
+ height .req x3
+ heightw .req w3
+ mx .req x4
+ width .req w6
+.endif
+.ifc \type, qpel_uni
+ // void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
+ // uint8_t *_src, ptrdiff_t _srcstride,
+ // int height, intptr_t mx, intptr_t my, int width)
+ dst .req x0
+ dststride .req x1
+ src .req x2
+ srcstride .req x3
+ height .req x4
+ heightw .req w4
+ mx .req x5
+ width .req w7
+.endif
+.ifc \type, qpel_bi
+ // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
+ // uint8_t *_src, ptrdiff_t _srcstride,
+ // int16_t *src2, int height, intptr_t mx,
+ // intptr_t my, int width)
+ dst .req x0
+ dststride .req x1
+ src .req x2
+ srcstride .req x3
+ height .req x5
+ heightw .req w5
+ mx .req x6
+ width .req w8
+.endif
+
+.ifc \type, qpel
+function ff_hevc_put_hevc_h4_8_neon, export=0
+ uxtl v16.8h, v16.8b // 8-tap filter helper: zero-extend the u8 samples in v16-v19 to 16 bits in place
+ uxtl v17.8h, v17.8b // NOTE(review): v16-v19 are presumably loaded by the caller; no loads happen in this function
+ uxtl v18.8h, v18.8b // v18:v19 are a second sample run, presumably another row - confirm against callers
+ uxtl v19.8h, v19.8b
+
+ mul v23.4h, v16.4h, v0.h[0] // seed both accumulators with tap 0; v0 presumably holds the taps prepared by load_filter
+ mul v24.4h, v18.4h, v0.h[0]
+
+.irpc i, 1234567
+ ext v20.16b, v16.16b, v17.16b, #(2*\i) // window over v16:v17 advanced by i 16-bit samples (2*i bytes)
+ ext v21.16b, v18.16b, v19.16b, #(2*\i) // same window position over v18:v19
+ mla v23.4h, v20.4h, v0.h[\i] // first run: accumulate sample window times tap i
+ mla v24.4h, v21.4h, v0.h[\i] // second run: accumulate sample window times tap i
+.endr
+ ret // outputs: 4 x int16 sums in v23.4h and v24.4h; v16-v21 have been modified
+endfunc
+.endif
+
+function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
+ load_filter mx
+.ifc \type, qpel_bi
+ mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
+ add x15, x4, #(MAX_PB_SIZE << 1) // src2b