>From 5cb1b3a901c94475247f07773e636fa25bf0d513 Mon Sep 17 00:00:00 2001
From: raduct <radu.taraibuta@gmail.com>
Date: Wed, 8 May 2024 08:24:46 +0300
Subject: [PATCH] avfilter/vf_scdet: add delayed and cube-root scene detection modes

Improve scene detection accuracy by comparing each frame with both the previous and the next frame (this introduces a one-frame delay).
Add a new mode parameter and a new method that computes the frame difference using the cube root, which increases the weight of small changes, together with a new mean formula. This improves accuracy significantly. Slightly improve performance by not cloning the frame.
Add a legacy mode for backward compatibility.

Signed-off-by: raduct <radu.taraibuta@gmail.com>
---
 doc/filters.texi                      |  16 +++
 libavfilter/scene_sad.c               | 157 ++++++++++++++++++++++++
 libavfilter/scene_sad.h               |  20 ++++
 libavfilter/vf_scdet.c                | 164 ++++++++++++++++++--------
 tests/fate/filter-video.mak           |   3 +
 tests/ref/fate/filter-metadata-scdet1 |  13 ++
 6 files changed, 325 insertions(+), 48 deletions(-)
 create mode 100644 tests/ref/fate/filter-metadata-scdet1

diff --git a/doc/filters.texi b/doc/filters.texi
index bfa8ccec8b..53814e003b 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -21797,6 +21797,22 @@ Default value is @code{10.}.
 @item sc_pass, s
 Set the flag to pass scene change frames to the next filter. Default value is @code{0}
 You can enable it if you want to get snapshot of scene change frames only.
+
+@item mode
+Set the scene change detection method. Default value is @code{-1}.
+Available values are:
+
+@table @samp
+@item -1
+Legacy mode using the sum of absolute linear differences. Compares each frame with the previous one only and adds no delay.
+
+@item 0
+Sum of absolute linear differences. Compares each frame with both the previous and the next one, which introduces a one-frame delay.
+
+@item 1
+Sum of mean of cube root differences. Compares each frame with both the previous and the next one, which introduces a one-frame delay.
+
+@end table
 @end table
 
 @anchor{selectivecolor}
diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
index caf911eb5d..1585d0a522 100644
--- a/libavfilter/scene_sad.c
+++ b/libavfilter/scene_sad.c
@@ -21,6 +21,8 @@
  * Scene SAD functions
  */
 
+#include "libavutil/mem.h"
+#include "libavutil/thread.h"
 #include "scene_sad.h"
 
 void ff_scene_sad16_c(SCENE_SAD_PARAMS)
@@ -71,3 +73,158 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
     return sad;
 }
 
+static AVMutex cbrt_mutex = AV_MUTEX_INITIALIZER; /* guards both arrays below */
+static uint8_t *cbrt_table[17] = { NULL };        /* indexed by bit depth; 16 is a valid index */
+static int cbrt_table_ref[17] = { 0 };            /* number of users of each table */
+
+/* Reference the shared cube-root table for bitdepth (built on first use); 0 or AVERROR. */
+int ff_init_cbrt(int bitdepth)
+{
+    uint8_t *table;
+    double factor;
+    int size, ret = 0;
+
+    if (bitdepth < 4 || bitdepth > 16)
+        return AVERROR(EINVAL);
+    size = 1 << bitdepth;
+
+    ff_mutex_lock(&cbrt_mutex);
+    table = cbrt_table[bitdepth];
+    if (table) {
+        cbrt_table_ref[bitdepth]++;
+        goto end;
+    }
+    table = av_malloc(size * (bitdepth > 8 ? 2 : 1));
+    if (!table) {
+        ret = AVERROR(ENOMEM); /* callers test for ret < 0 */
+        goto end;
+    }
+    factor = pow(size - 1, 2. / 3.); /* makes the largest difference map onto itself */
+    if (bitdepth <= 8) {
+        for (int i = 0; i < size; i++)
+            table[i] = round(factor * pow(i, 1. / 3.));
+    } else {
+        uint16_t *tablew = (uint16_t*)table;
+        for (int i = 0; i < size; i++)
+            tablew[i] = round(factor * pow(i, 1. / 3.));
+    }
+    cbrt_table[bitdepth] = table; /* publish only once fully populated */
+    cbrt_table_ref[bitdepth] = 1;
+end:
+    ff_mutex_unlock(&cbrt_mutex);
+    return ret;
+}
+
+/* Drop one reference to the table for bitdepth; the last release frees it. */
+void ff_uninit_cbrt(int bitdepth)
+{
+    if (bitdepth >= 4 && bitdepth <= 16) {
+        ff_mutex_lock(&cbrt_mutex);
+        cbrt_table_ref[bitdepth]--;
+        if (cbrt_table_ref[bitdepth] == 0)
+            av_freep(&cbrt_table[bitdepth]);
+        ff_mutex_unlock(&cbrt_mutex);
+    }
+}
+
+/* 8-bit kernel: cube-root-weighted difference of two planes, written to *sum. */
+void ff_scene_scrd_c(SCENE_SAD_PARAMS)
+{
+    const uint8_t *lut = cbrt_table[8];
+    uint64_t acc_up = 0, acc_down = 0; /* src2 >= src1 | src1 > src2 */
+    double half;
+
+    if (!lut) {
+        *sum = 0;
+        return;
+    }
+    for (int row = 0; row < height; row++) {
+        for (int col = 0; col < width; col++) {
+            const int delta = src2[col] - src1[col];
+            if (delta < 0)
+                acc_down += lut[-delta];
+            else
+                acc_up += lut[delta];
+        }
+        src1 += stride1;
+        src2 += stride2;
+    }
+
+    half = (sqrt(acc_up) + sqrt(acc_down)) / 2.0;
+    *sum = 2.0 * half * half;
+}
+
+/* Kernel for samples read as uint16_t; bitdepth picks the cube-root table. */
+void ff_scene_scrd2B_c(SCENE_SAD_PARAMS, int bitdepth)
+{
+    const uint16_t *lut = (const uint16_t*)cbrt_table[bitdepth];
+    const uint16_t *row1 = (const uint16_t*)src1;
+    const uint16_t *row2 = (const uint16_t*)src2;
+    uint64_t acc_up = 0, acc_down = 0; /* src2 >= src1 | src1 > src2 */
+    double half;
+
+    if (!lut) {
+        *sum = 0;
+        return;
+    }
+    stride1 /= 2; /* halved so the row pointers advance in 16-bit samples */
+    stride2 /= 2;
+
+    for (int row = 0; row < height; row++) {
+        for (int col = 0; col < width; col++) {
+            const int delta = row2[col] - row1[col];
+            if (delta < 0)
+                acc_down += lut[-delta];
+            else
+                acc_up += lut[delta];
+        }
+        row1 += stride1;
+        row2 += stride2;
+    }
+
+    half = (sqrt(acc_up) + sqrt(acc_down)) / 2.0;
+    *sum = 2.0 * half * half;
+}
+
+void ff_scene_scrd9_c(SCENE_SAD_PARAMS)
+{
+    ff_scene_scrd2B_c(src1, stride1, src2, stride2, width, height, sum, 9); /* 9-bit table */
+}
+
+void ff_scene_scrd10_c(SCENE_SAD_PARAMS)
+{
+    ff_scene_scrd2B_c(src1, stride1, src2, stride2, width, height, sum, 10); /* 10-bit table */
+}
+
+void ff_scene_scrd12_c(SCENE_SAD_PARAMS)
+{
+    ff_scene_scrd2B_c(src1, stride1, src2, stride2, width, height, sum, 12); /* 12-bit table */
+}
+
+void ff_scene_scrd14_c(SCENE_SAD_PARAMS)
+{
+    ff_scene_scrd2B_c(src1, stride1, src2, stride2, width, height, sum, 14); /* 14-bit table */
+}
+
+void ff_scene_scrd16_c(SCENE_SAD_PARAMS)
+{
+    ff_scene_scrd2B_c(src1, stride1, src2, stride2, width, height, sum, 16); /* 16-bit table */
+}
+
+/**
+ * Select the cube-root difference kernel matching a sample bit depth.
+ * @param depth bits per sample
+ * @return the matching kernel, or NULL when the depth has none
+ */
+ff_scene_sad_fn ff_scene_scrd_get_fn(int depth)
+{
+    switch (depth) {
+    case  8: return ff_scene_scrd_c;
+    case  9: return ff_scene_scrd9_c;
+    case 10: return ff_scene_scrd10_c;
+    case 12: return ff_scene_scrd12_c;
+    case 14: return ff_scene_scrd14_c;
+    case 16: return ff_scene_scrd16_c;
+    default: return NULL;
+    }
+}
diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
index 173a051f2b..dd20e1a259 100644
--- a/libavfilter/scene_sad.h
+++ b/libavfilter/scene_sad.h
@@ -41,4 +41,24 @@ ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth);
 
 ff_scene_sad_fn ff_scene_sad_get_fn(int depth);
 
+void ff_scene_scrd_c(SCENE_SAD_PARAMS); /* 8-bit samples, uses the table for depth 8 */
+/* Samples are read as uint16_t; bitdepth selects which cube-root table is used. */
+void ff_scene_scrd2B_c(SCENE_SAD_PARAMS, int bitdepth);
+/* Fixed-depth wrappers that call ff_scene_scrd2B_c() with 9, 10, 12, 14 and 16. */
+void ff_scene_scrd9_c(SCENE_SAD_PARAMS);
+
+void ff_scene_scrd10_c(SCENE_SAD_PARAMS);
+
+void ff_scene_scrd12_c(SCENE_SAD_PARAMS);
+
+void ff_scene_scrd14_c(SCENE_SAD_PARAMS);
+
+void ff_scene_scrd16_c(SCENE_SAD_PARAMS);
+/* Returns the kernel for depth 8, 9, 10, 12, 14 or 16, and NULL otherwise. */
+ff_scene_sad_fn ff_scene_scrd_get_fn(int depth);
+/* Takes a reference on the cube-root table for bitdepth, creating it on first use. */
+int ff_init_cbrt(int bitdepth);
+/* Drops one reference; the table is freed when its count reaches zero. */
+void ff_uninit_cbrt(int bitdepth);
+
 #endif /* AVFILTER_SCENE_SAD_H */
diff --git a/libavfilter/vf_scdet.c b/libavfilter/vf_scdet.c
index 15399cfebf..2bb1f51965 100644
--- a/libavfilter/vf_scdet.c
+++ b/libavfilter/vf_scdet.c
@@ -31,6 +31,18 @@
 #include "scene_sad.h"
 #include "video.h"
 
+enum SCDETMode {
+    MODE_LEGACY = -1,  /* SAD kernel; each frame is scored and forwarded on arrival */
+    MODE_LINEAR = 0,   /* SAD kernel; a frame is forwarded once its successor (or EOF) arrives */
+    MODE_MEAN_CBRT = 1 /* cube-root kernel; same one-frame hold as MODE_LINEAR */
+};
+
+typedef struct SCDETFrameInfo {
+    AVFrame *picref; /* frame these measurements belong to */
+    double mafd;     /* difference to the preceding frame, scaled to 0..100 of full range */
+    double diff;     /* mafd minus the preceding frame's mafd, as set by compute_diff() */
+} SCDETFrameInfo;
+
 typedef struct SCDetContext {
     const AVClass *class;
 
@@ -39,11 +51,12 @@ typedef struct SCDetContext {
     int nb_planes;
     int bitdepth;
     ff_scene_sad_fn sad;
-    double prev_mafd;
-    double scene_score;
-    AVFrame *prev_picref;
+    SCDETFrameInfo curr_frame;
+    SCDETFrameInfo prev_frame;
+
     double threshold;
     int sc_pass;
+    enum SCDETMode mode;
 } SCDetContext;
 
 #define OFFSET(x) offsetof(SCDetContext, x)
@@ -55,6 +68,7 @@ static const AVOption scdet_options[] = {
     { "t",           "set scene change detect threshold",        OFFSET(threshold),  AV_OPT_TYPE_DOUBLE,   {.dbl = 10.},     0,  100., V|F },
     { "sc_pass",     "Set the flag to pass scene change frames", OFFSET(sc_pass),    AV_OPT_TYPE_BOOL,     {.dbl =  0  },    0,    1,  V|F },
     { "s",           "Set the flag to pass scene change frames", OFFSET(sc_pass),    AV_OPT_TYPE_BOOL,     {.dbl =  0  },    0,    1,  V|F },
+    { "mode",        "scene change detection method",            OFFSET(mode),       AV_OPT_TYPE_INT,      {.i64 = MODE_LEGACY}, MODE_LEGACY, MODE_MEAN_CBRT, V|F },
     {NULL}
 };
 
@@ -91,7 +105,14 @@ static int config_input(AVFilterLink *inlink)
         s->height[plane] = inlink->h >> ((plane == 1 || plane == 2) ? desc->log2_chroma_h : 0);
     }
 
-    s->sad = ff_scene_sad_get_fn(s->bitdepth == 8 ? 8 : 16);
+    if (s->mode == MODE_LINEAR || s->mode == MODE_LEGACY)
+        s->sad = ff_scene_sad_get_fn(s->bitdepth == 8 ? 8 : 16);
+    else if (s->mode == MODE_MEAN_CBRT) {
+        int ret = ff_init_cbrt(s->bitdepth);
+        if (ret < 0)
+            return ret;
+        s->sad = ff_scene_scrd_get_fn(s->bitdepth);
+    }
     if (!s->sad)
         return AVERROR(EINVAL);
 
@@ -101,46 +122,102 @@ static int config_input(AVFilterLink *inlink)
 static av_cold void uninit(AVFilterContext *ctx)
 {
     SCDetContext *s = ctx->priv;
-
-    av_frame_free(&s->prev_picref);
+    av_frame_free(&s->curr_frame.picref); /* frame still held by the filter; NULL once EOF was processed */
+    if (s->mode == MODE_LEGACY) /* in the other modes prev_frame was already handed downstream */
+        av_frame_free(&s->prev_frame.picref);
+    if (s->mode == MODE_MEAN_CBRT) ff_uninit_cbrt(s->bitdepth);
 }
 
-static double get_scene_score(AVFilterContext *ctx, AVFrame *frame)
+static void compute_diff(AVFilterContext *ctx) /* fills curr_frame.mafd and curr_frame.diff */
 {
-    double ret = 0;
     SCDetContext *s = ctx->priv;
-    AVFrame *prev_picref = s->prev_picref;
+    AVFrame *prev_picref = s->prev_frame.picref;
+    AVFrame *curr_picref = s->curr_frame.picref;
 
-    if (prev_picref && frame->height == prev_picref->height
-                    && frame->width  == prev_picref->width) {
-        uint64_t sad = 0;
-        double mafd, diff;
-        uint64_t count = 0;
+    if (prev_picref && curr_picref
+            && curr_picref->height == prev_picref->height
+            && curr_picref->width  == prev_picref->width) {
 
+        uint64_t sum = 0;   /* kernel output accumulated over all planes */
+        uint64_t count = 0; /* number of samples compared */
         for (int plane = 0; plane < s->nb_planes; plane++) {
-            uint64_t plane_sad;
+            uint64_t plane_sum;
             s->sad(prev_picref->data[plane], prev_picref->linesize[plane],
-                    frame->data[plane], frame->linesize[plane],
-                    s->width[plane], s->height[plane], &plane_sad);
-            sad += plane_sad;
+                    curr_picref->data[plane], curr_picref->linesize[plane],
+                    s->width[plane], s->height[plane], &plane_sum);
+            sum += plane_sum;
             count += s->width[plane] * s->height[plane];
         }
 
-        mafd = (double)sad * 100. / count / (1ULL << s->bitdepth);
-        diff = fabs(mafd - s->prev_mafd);
-        ret  = av_clipf(FFMIN(mafd, diff), 0, 100.);
-        s->prev_mafd = mafd;
-        av_frame_free(&prev_picref);
+        s->curr_frame.mafd = (double)sum * 100. / count / (1ULL << s->bitdepth);
+        s->curr_frame.diff = s->curr_frame.mafd - s->prev_frame.mafd;
+        if (s->mode == MODE_LEGACY) /* only legacy mode discards the sign */
+            s->curr_frame.diff = fabs(s->curr_frame.diff);
+    } else { /* a frame is missing or the dimensions differ */
+        s->curr_frame.mafd = 0;
+        s->curr_frame.diff = 0;
     }
-    s->prev_picref = av_frame_clone(frame);
-    return ret;
 }
 
-static int set_meta(SCDetContext *s, AVFrame *frame, const char *key, const char *value)
+static int set_meta(AVFrame *frame, const char *key, const char *value) /* returns av_dict_set()'s result */
 {
     return av_dict_set(&frame->metadata, key, value, 0);
 }
 
+static int filter_frame(AVFilterContext *ctx, AVFrame *frame)
+{
+    AVFilterLink *inlink = ctx->inputs[0];
+    AVFilterLink *outlink = ctx->outputs[0];
+    SCDetContext *s = ctx->priv;
+    /* Shift the window: what was current on the last call becomes previous. */
+    s->prev_frame = s->curr_frame;
+    s->curr_frame.picref = frame;
+    /* Legacy mode handles every non-NULL frame; other modes handle the held frame (frame is NULL at EOF). */
+    if ((s->mode != MODE_LEGACY && s->prev_frame.picref) || (s->mode == MODE_LEGACY && frame != NULL)) {
+        SCDETFrameInfo fwd_frame; /* the frame that gets tagged and sent on */
+        double scene_score;
+        char buf[64];
+
+        compute_diff(ctx);
+
+        if (s->mode == MODE_LEGACY) {
+            av_frame_free(&s->prev_frame.picref); /* comparison copy from the previous call */
+            fwd_frame = s->curr_frame;
+            s->curr_frame.picref = av_frame_clone(s->curr_frame.picref); /* kept for the next comparison */
+        } else { /* use the larger of: rise in mafd into the held frame, drop in mafd after it */
+            if (s->prev_frame.diff < -s->curr_frame.diff) {
+                s->prev_frame.diff = -s->curr_frame.diff;
+                s->prev_frame.mafd = s->curr_frame.mafd;
+            }
+            fwd_frame = s->prev_frame;
+        }
+        scene_score = av_clipf(s->mode == MODE_LEGACY ? FFMIN(fwd_frame.mafd, fwd_frame.diff) : FFMAX(fwd_frame.diff, 0), 0, 100.);
+
+        snprintf(buf, sizeof(buf), "%0.3f", fwd_frame.mafd);
+        set_meta(fwd_frame.picref, "lavfi.scd.mafd", buf);
+        snprintf(buf, sizeof(buf), "%0.3f", scene_score);
+        set_meta(fwd_frame.picref, "lavfi.scd.score", buf);
+
+        if (scene_score >= s->threshold) {
+            av_log(s, AV_LOG_INFO, "lavfi.scd.score: %.3f, lavfi.scd.time: %s\n",
+                scene_score, av_ts2timestr(fwd_frame.picref->pts, &inlink->time_base));
+            set_meta(fwd_frame.picref, "lavfi.scd.time",
+                av_ts2timestr(fwd_frame.picref->pts, &inlink->time_base));
+        }
+
+        if (s->sc_pass) { /* only frames at or above the threshold are passed on */
+            if (scene_score >= s->threshold)
+                return ff_filter_frame(outlink, fwd_frame.picref);
+            else
+                av_frame_free(&fwd_frame.picref);
+        }
+        else
+            return ff_filter_frame(outlink, fwd_frame.picref);
+    }
+
+    return 0;
+}
+
 static int activate(AVFilterContext *ctx)
 {
     int ret;
@@ -148,6 +225,8 @@ static int activate(AVFilterContext *ctx)
     AVFilterLink *outlink = ctx->outputs[0];
     SCDetContext *s = ctx->priv;
     AVFrame *frame;
+    int64_t pts;
+    int status;
 
     FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
 
@@ -155,31 +234,20 @@ static int activate(AVFilterContext *ctx)
     if (ret < 0)
         return ret;
 
-    if (frame) {
-        char buf[64];
-        s->scene_score = get_scene_score(ctx, frame);
-        snprintf(buf, sizeof(buf), "%0.3f", s->prev_mafd);
-        set_meta(s, frame, "lavfi.scd.mafd", buf);
-        snprintf(buf, sizeof(buf), "%0.3f", s->scene_score);
-        set_meta(s, frame, "lavfi.scd.score", buf);
+    if (ret > 0) {
+        ret = filter_frame(ctx, frame);
+        if (ret < 0)
+            return ret;
+    }
 
-        if (s->scene_score >= s->threshold) {
-            av_log(s, AV_LOG_INFO, "lavfi.scd.score: %.3f, lavfi.scd.time: %s\n",
-                    s->scene_score, av_ts2timestr(frame->pts, &inlink->time_base));
-            set_meta(s, frame, "lavfi.scd.time",
-                    av_ts2timestr(frame->pts, &inlink->time_base));
-        }
-        if (s->sc_pass) {
-            if (s->scene_score >= s->threshold)
-                return ff_filter_frame(outlink, frame);
-            else {
-                av_frame_free(&frame);
-            }
-        } else
-            return ff_filter_frame(outlink, frame);
+    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
+        if (status == AVERROR_EOF)
+            ret = filter_frame(ctx, NULL);
+
+        ff_outlink_set_status(outlink, status, pts);
+        return ret;
     }
 
-    FF_FILTER_FORWARD_STATUS(inlink, outlink);
     FF_FILTER_FORWARD_WANTED(outlink, inlink);
 
     return FFERROR_NOT_READY;
diff --git a/tests/fate/filter-video.mak b/tests/fate/filter-video.mak
index ee9f0f5e40..0c5794685f 100644
--- a/tests/fate/filter-video.mak
+++ b/tests/fate/filter-video.mak
@@ -672,6 +672,9 @@ SCDET_DEPS = LAVFI_INDEV FILE_PROTOCOL MOVIE_FILTER SCDET_FILTER SCALE_FILTER \
 FATE_METADATA_FILTER-$(call ALLYES, $(SCDET_DEPS)) += fate-filter-metadata-scdet
 fate-filter-metadata-scdet: SRC = $(TARGET_SAMPLES)/svq3/Vertical400kbit.sorenson3.mov
 fate-filter-metadata-scdet: CMD = run $(FILTER_METADATA_COMMAND) "sws_flags=+accurate_rnd+bitexact;movie='$(SRC)',scdet=s=1"
+FATE_METADATA_FILTER-$(call ALLYES, $(SCDET_DEPS)) += fate-filter-metadata-scdet1
+fate-filter-metadata-scdet1: SRC = $(TARGET_SAMPLES)/svq3/Vertical400kbit.sorenson3.mov
+fate-filter-metadata-scdet1: CMD = run $(FILTER_METADATA_COMMAND) "sws_flags=+accurate_rnd+bitexact;movie='$(SRC)',scdet=s=1:t=30:mode=1"
 
 CROPDETECT_DEPS = LAVFI_INDEV FILE_PROTOCOL MOVIE_FILTER MOVIE_FILTER MESTIMATE_FILTER CROPDETECT_FILTER \
                   SCALE_FILTER MOV_DEMUXER H264_DECODER
diff --git a/tests/ref/fate/filter-metadata-scdet1 b/tests/ref/fate/filter-metadata-scdet1
new file mode 100644
index 0000000000..0c60b3fc80
--- /dev/null
+++ b/tests/ref/fate/filter-metadata-scdet1
@@ -0,0 +1,13 @@
+pts=1620|tag:lavfi.scd.score=41.567|tag:lavfi.scd.mafd=0.279|tag:lavfi.scd.time=2.7
+pts=4020|tag:lavfi.scd.score=31.824|tag:lavfi.scd.mafd=5.039|tag:lavfi.scd.time=6.7
+pts=4060|tag:lavfi.scd.score=32.793|tag:lavfi.scd.mafd=37.833|tag:lavfi.scd.time=6.766667
+pts=5800|tag:lavfi.scd.score=40.633|tag:lavfi.scd.mafd=40.633|tag:lavfi.scd.time=9.666667
+pts=6720|tag:lavfi.scd.score=57.019|tag:lavfi.scd.mafd=0.313|tag:lavfi.scd.time=11.2
+pts=8160|tag:lavfi.scd.score=34.500|tag:lavfi.scd.mafd=38.651|tag:lavfi.scd.time=13.6
+pts=9760|tag:lavfi.scd.score=31.848|tag:lavfi.scd.mafd=39.432|tag:lavfi.scd.time=16.266667
+pts=13740|tag:lavfi.scd.score=30.266|tag:lavfi.scd.mafd=2.976|tag:lavfi.scd.time=22.9
+pts=13780|tag:lavfi.scd.score=31.131|tag:lavfi.scd.mafd=34.106|tag:lavfi.scd.time=22.966667
+pts=14080|tag:lavfi.scd.score=31.244|tag:lavfi.scd.mafd=34.654|tag:lavfi.scd.time=23.466667
+pts=15700|tag:lavfi.scd.score=41.191|tag:lavfi.scd.mafd=41.292|tag:lavfi.scd.time=26.166667
+pts=18500|tag:lavfi.scd.score=48.512|tag:lavfi.scd.mafd=5.036|tag:lavfi.scd.time=30.833333
+pts=21760|tag:lavfi.scd.score=41.083|tag:lavfi.scd.mafd=42.842|tag:lavfi.scd.time=36.266667
-- 
2.44.1.windows.1

