From: chen <chenm...@163.com>

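Replace the existing C implementation of filter_column() with chen's version,
which filters up to 16 adjacent columns per call. Modify filter_slice() to
step through the columns in blocks of 16 so that it is compatible with this
change.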

Tested using the command:
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark

after patch:
frame= 4317 fps=271 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.8x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing 
overhead: unknown
bench: utime=76.097s stime=1.676s rtime=15.929s
bench: maxrss=15160kB

before patch:
frame= 4317 fps=192 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed= 7.7x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing 
overhead: unknown
bench: utime=104.253s stime=2.668s rtime=22.426s
bench: maxrss=15216kB

Signed-off-by: Xu Jun <xuju...@sjtu.edu.cn>
---
 libavfilter/vf_convolution.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
index 5909feaad1..bc816b58bb 100644
--- a/libavfilter/vf_convolution.c
+++ b/libavfilter/vf_convolution.c
@@ -24,6 +24,7 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
 #include "avfilter.h"
 #include "convolution.h"
 #include "formats.h"
@@ -389,19 +390,29 @@ static void filter_row(uint8_t *dst, int width,
 
 static void filter_column(uint8_t *dst, int height,
                           float rdiv, float bias, const int *const matrix,
-                          const uint8_t *c[], int peak, int radius,
+                          const uint8_t *c[], int length, int radius,
                           int dstride, int stride)
 {
-    int y;
+    int y, off16;
+
+    av_assert2(length <=  16);
 
+    // NOTE: aligned to 64 bytes so that the 16 ints fit in one 64-byte cache line
+    DECLARE_ALIGNED(64, int, sum)[16];
     for (y = 0; y < height; y++) {
-        int i, sum = 0;
+        int i;
+        memset(sum, 0, sizeof(sum));
 
-        for (i = 0; i < 2 * radius + 1; i++)
-            sum += c[i][0 + y * stride] * matrix[i];
+        for (i = 0; i < 2 * radius + 1; i++) {
+            for (off16 = 0; off16 < length; off16++) {
+                sum[off16] += c[i][0 + y * stride + off16] * matrix[i];
+            }
+        }
 
-        sum = (int)(sum * rdiv + bias + 0.5f);
-        dst[0] = av_clip_uint8(sum);
+        for (off16 = 0; off16 < length; off16++) {
+            sum[off16] = (int)(sum[off16] * rdiv + bias + 0.5f);
+            dst[off16] = av_clip_uint8(sum[off16]);
+        }
         dst += dstride;
     }
 }
@@ -521,7 +532,10 @@ static int filter_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs)
             continue;
         }
 
-        for (y = slice_start; y < slice_end; y++) {
+        const int step = mode == MATRIX_COLUMN ? 16 : 1;
+        int smax = mode == MATRIX_COLUMN ?  16: s->max;
+        for (y = slice_start; y < slice_end; y += step) {
+            if (mode == MATRIX_COLUMN && slice_end - y < 16) smax = slice_end 
- y;
             const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc : 
radius * bpc;
             const int yoff = mode == MATRIX_COLUMN ? radius * stride : 0;
 
@@ -531,12 +545,12 @@ static int filter_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs)
 
                 s->setup[plane](radius, c, src, stride, x, width, y, height, 
bpc);
                 s->filter[plane](dst + yoff + xoff, 1, rdiv,
-                                 bias, matrix, c, s->max, radius,
+                                 bias, matrix, c, smax, radius,
                                  dstride, stride);
             }
             s->setup[plane](radius, c, src, stride, radius, width, y, height, 
bpc);
             s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
-                             rdiv, bias, matrix, c, s->max, radius,
+                             rdiv, bias, matrix, c, smax, radius,
                              dstride, stride);
             for (x = sizew - radius; x < sizew; x++) {
                 const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * 
bpc : x * bpc;
@@ -544,7 +558,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs)
 
                 s->setup[plane](radius, c, src, stride, x, width, y, height, 
bpc);
                 s->filter[plane](dst + yoff + xoff, 1, rdiv,
-                                 bias, matrix, c, s->max, radius,
+                                 bias, matrix, c, smax, radius,
                                  dstride, stride);
             }
             if (mode != MATRIX_COLUMN)
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to