ff_shuffle_filter_coefficients shuffles the tail as required. --- libswscale/utils.c | 17 +++++++++++++++-- libswscale/x86/swscale.c | 4 ++-- 2 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/libswscale/utils.c b/libswscale/utils.c index 52f07e1661..7e1e9c3834 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -285,7 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i = 0, j = 0, k = 0; int cpu_flags = av_get_cpu_flags(); - if (!filter || dstW % 16 != 0) return 0; + if (!filter || (dstW % 4 != 0)) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { int16_t *filterCopy = NULL; @@ -296,9 +296,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -312,6 +314,17 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i + 4 <= dstW; i += 4) { + // 4 filter coeffs are processed at a time. 
+ for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < 4; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } if (filterCopy) av_free(filterCopy); diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index fdc93866a6..1d8f19aa5a 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -580,9 +580,9 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { - if (c->chrDstW % 16 == 0) + if (c->chrDstW % 4 == 0) ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); - if (c->dstW % 16 == 0) + if (c->dstW % 4 == 0) ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } -- 2.34.1.575.g55b058a8bb-goog _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".