On Sun, 2 Jul 2023, Thomas Mundt wrote:

Am So., 2. Juli 2023 um 14:34 Uhr schrieb John Cox <j...@kynesim.co.uk>:
      Add an optional filter_line3 to the available optimisations.

      filter_line3 is equivalent to filter_line, memcpy, filter_line

      filter_line shares quite a number of loads and some calculations in
      common with its next iteration, and testing shows that, using aarch64
      NEON, filter_line3's performance is 30% better than two filter_lines
      and a memcpy.

      Signed-off-by: John Cox <j...@kynesim.co.uk>
      ---
       libavfilter/bwdif.h    |  7 +++++++
       libavfilter/vf_bwdif.c | 31 +++++++++++++++++++++++++++++++
       2 files changed, 38 insertions(+)

      diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
      index cce99953f3..496cec72ef 100644
      --- a/libavfilter/bwdif.h
      +++ b/libavfilter/bwdif.h
      @@ -35,6 +35,9 @@ typedef struct BWDIFContext {
           void (*filter_edge)(void *dst, void *prev, void *cur, void
      *next,
                               int w, int prefs, int mrefs, int
      prefs2, int mrefs2,
                               int parity, int clip_max, int spat);
      +    void (*filter_line3)(void *dst, int dstride,
      +                         const void *prev, const void *cur,
      const void *next, int prefs,
      +                         int w, int parity, int clip_max);
       } BWDIFContext;

       void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int
      bit_depth);
      @@ -53,4 +56,8 @@ void ff_bwdif_filter_line_c(void *dst1, void
      *prev1, void *cur1, void *next1,
                                   int prefs3, int mrefs3, int prefs4,
      int mrefs4,
                                   int parity, int clip_max);

      +void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
      +                             const void * prev1, const void *
      cur1, const void * next1, int s_stride,
      +                             int w, int parity, int clip_max);
      +
       #endif /* AVFILTER_BWDIF_H */
      diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
      index 26349da1fd..52bc676cf8 100644
      --- a/libavfilter/vf_bwdif.c
      +++ b/libavfilter/vf_bwdif.c
      @@ -150,6 +150,31 @@ void ff_bwdif_filter_line_c(void *dst1,
      void *prev1, void *cur1, void *next1,
           FILTER2()
       }

      +#define NEXT_LINE()\
      +    dst += d_stride; \
      +    prev += prefs; \
      +    cur  += prefs; \
      +    next += prefs;
      +
      +void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
      +                             const void * prev1, const void *
      cur1, const void * next1, int s_stride,
      +                             int w, int parity, int clip_max)
      +{
      +    const int prefs = s_stride;
      +    uint8_t * dst  = dst1;
      +    const uint8_t * prev = prev1;
      +    const uint8_t * cur  = cur1;
      +    const uint8_t * next = next1;
      +
      +    ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur,
      (void*)next, w,
      +                           prefs, -prefs, prefs * 2, - prefs *
      2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity,
      clip_max);
      +    NEXT_LINE();
      +    memcpy(dst, cur, w);
      +    NEXT_LINE();
      +    ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur,
      (void*)next, w,
      +                           prefs, -prefs, prefs * 2, - prefs *
      2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity,
      clip_max);
      +}
      +
       void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void
      *cur1, void *next1,
                                   int w, int prefs, int mrefs, int
      prefs2, int mrefs2,
                                   int parity, int clip_max, int spat)
      @@ -244,6 +269,11 @@ static int filter_slice(AVFilterContext
      *ctx, void *arg, int jobnr, int nb_jobs)
                                      refs << 1, -(refs << 1),
                                      td->parity ^ td->tff, clip_max,
                                      (y < 2) || ((y + 3) > td->h) ? 0
      : 1);
      +            } else if (s->filter_line3 && y + 2 < slice_end &&
      y + 6 < td->h) {
      +                s->filter_line3(dst,
      td->frame->linesize[td->plane],
      +                                prev, cur, next, linesize,
      td->w,
      +                                td->parity ^ td->tff,
      clip_max);
      +                y += 2;
                   } else {
                       s->filter_line(dst, prev, cur, next, td->w,
                                      refs, -refs, refs << 1, -(refs
      << 1),


Maybe I'm missing something, but doesn't this kick out most of the x86 SIMD
optimization because there is no filter_line3?

It looks to me like it doesn't; it adds a new optional function pointer, but the reference C function isn't assigned to it. So by default (and on x86) filter_line3 is NULL, and the existing assembly-optimized code paths are used. But if an architecture does implement filter_line3, that is used instead of filter_line.

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to