On 5/12/2024 2:07 PM, Rémi Denis-Courmont wrote:
T-Head C908:
flac_wasted_32_c:       949.0
flac_wasted_32_rvv_i32: 278.7
---
  libavcodec/riscv/flacdsp_init.c |  7 ++++++-
  libavcodec/riscv/flacdsp_rvv.S  | 15 +++++++++++++++
  2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
index 6cfb50ead8..4043715a3b 100644
--- a/libavcodec/riscv/flacdsp_init.c
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -31,6 +31,7 @@ void ff_flac_lpc32_rvv(int32_t *decoded, const int coeffs[32],
                         int pred_order, int qlevel, int len);
  void ff_flac_lpc32_rvv_simple(int32_t *decoded, const int coeffs[32],
                                int pred_order, int qlevel, int len);
+void ff_flac_wasted32_rvv(int32_t *, int shift, int len);
  void ff_flac_decorrelate_indep2_16_rvv(uint8_t **out, int32_t **in,
                                         int channels, int len, int shift);
  void ff_flac_decorrelate_indep4_16_rvv(uint8_t **out, int32_t **in,
@@ -76,8 +77,12 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
                  c->lpc32 = ff_flac_lpc32_rvv_simple;
              else
                  c->lpc32 = ff_flac_lpc32_rvv;
+# endif
          }
+ c->wasted32 = ff_flac_wasted32_rvv;
+
+# if (__riscv_xlen >= 64)
          switch (fmt) {
          case AV_SAMPLE_FMT_S16:
              switch (channels) {
@@ -117,8 +122,8 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
              c->decorrelate[2] = ff_flac_decorrelate_rs_32_rvv;
              c->decorrelate[3] = ff_flac_decorrelate_ms_32_rvv;
              break;
-# endif
          }
+# endif
      }
  #endif
  }
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index 2a0b50f7a9..d576a0cc21 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -100,7 +100,22 @@ func ff_flac_lpc32_rvv_simple, zve32x
ret
  endfunc
+#endif
+
+func ff_flac_wasted32_rvv, zve32x
+1:
+        vsetvli t0, a2, e32, m8, ta, ma
+        vle32.v v8, (a0)
+        sub     a2, a2, t0
+        vsll.vx v8, v8, a1
+        vse32.v v8, (a0)
+        sh2add  a0, t0, a0
+        bnez    a2, 1b

Not sure if you're taking this into account, but the minimum block size is 16 and the buffer is always allocated for max_blocksize plus padding, so you should be able to process more samples per loop iteration than this. The same applies to wasted33.

+ ret
+endfunc
+
+#if (__riscv_xlen == 64)
  func ff_flac_decorrelate_indep2_16_rvv, zve32x
          ld      a0,  (a0)
          ld      a2, 8(a1)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to