--bench on AWS Graviton:

hevc_sao_band_8x8_8_c: 317.5
hevc_sao_band_8x8_8_neon: 97.5
hevc_sao_band_16x16_8_c: 1115.0
hevc_sao_band_16x16_8_neon: 322.7
hevc_sao_band_32x32_8_c: 4599.2
hevc_sao_band_32x32_8_neon: 1246.2
hevc_sao_band_48x48_8_c: 10021.7
hevc_sao_band_48x48_8_neon: 2740.5
hevc_sao_band_64x64_8_c: 17635.0
hevc_sao_band_64x64_8_neon: 4875.7

Signed-off-by: J. Dekker <j...@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 +++++-
 libavcodec/aarch64/hevcdsp_sao_neon.S     | 9 ++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b93cec9e44..2002530266 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -77,7 +77,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon;
-        c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon;
+        c->sao_band_filter[0] =
+        c->sao_band_filter[1] =
+        c->sao_band_filter[2] =
+        c->sao_band_filter[3] =
+        c->sao_band_filter[4] = ff_hevc_sao_band_filter_8x8_8_neon;
         c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8x8_8_neon;
         c->sao_edge_filter[1] =
         c->sao_edge_filter[2] =
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
index c4b931aab7..263747149f 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -35,6 +35,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
         stp             xzr, xzr, [sp, #32]
         stp             xzr, xzr, [sp, #48]
         mov             w8, #4
+        sxtw            x6, w6
 0:
         ldrsh           x9, [x4, x8, lsl #1] // x9 = sao_offset_val[k+1]
         subs            w8, w8, #1
@@ -44,8 +45,10 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
         bne             0b
         ld1             {v16.16b-v19.16b}, [sp], #64
         movi            v20.8h, #1
+        sub             x2, x2, x6 // stride_dst - width
+        sub             x3, x3, x6 // stride_src - width
 1:      // beginning of line
-        mov             w8, w6
+        mov             x8, x6
 2:
         // Simple layout for accessing 16bit values
         // with 8bit LUT.
@@ -56,7 +59,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
         //    +----------------------------------->
         //      i-0     i-1     i-2     i-3
         // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-        ld1             {v2.8b}, [x1]
+        ld1             {v2.8b}, [x1], #8
         // load src[x]
         uxtl            v0.8h, v2.8b
         // >> shift
@@ -74,7 +77,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
         // clip + narrow
         sqxtun          v4.8b, v1.8h
         // store
-        st1             {v4.8b}, [x0]
+        st1             {v4.8b}, [x0], #8
         // done 8 pixels
         subs            w8, w8, #8
         bne             2b
-- 
2.30.1 (Apple Git-130)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".