Re: [FFmpeg-devel] [PATCH 3/4] libswscale/x86/rgb2rgb: add uyvytoyuv422 avx2

2021-09-27 Thread chen
The current algorithm could be improved; could you combine these optimizations with your
patches? The extra VPERM makes the code a little slower.



On Haswell
Current algorithm:
RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
pand m6, m1 ; YxYx YxYx...
RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
packuswb m6, m7 ; YYYY YYYY...


Latency:
1 + 1 + 1 + 1 + 1 = 5


Proposed:
pshufb m6, m2, mX ; UYVY UYVY ->  
pshufb m7, m3, mX
punpcklqdq m6, m7 ;  


Latency:
1 + 1 + 1 = 3


I guess the current algorithm was written to stay compatible with SSE2, because PSHUFB
has only been available since SSSE3.
Now that we are optimizing for AVX, AVX2 and AVX512, I suggest we use the proposed
algorithm to get more performance.


Regards,
Min Chen




At 2021-09-28 13:34:03, "Wu Jianhua"  wrote:
>With the accelerating by means of AVX2, the uyvytoyuv422 can be faster
>
>Performance data(Less is better):
>uyvytoyuv422_sse20.49381
>uyvytoyuv422_avx 0.42981
>uyvytoyuv422_avx20.27915
>
>Signed-off-by: Wu Jianhua 
>---
> libswscale/x86/rgb2rgb.c |  6 +
> libswscale/x86/rgb_2_rgb.asm | 48 +++-
> 2 files changed, 42 insertions(+), 12 deletions(-)
>
>diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>index c9ff33ab77..a965a1755c 100644
>--- a/libswscale/x86/rgb2rgb.c
>+++ b/libswscale/x86/rgb2rgb.c
>@@ -164,6 +164,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, 
>uint8_t *vdst,
> void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>  const uint8_t *src, int width, int height,
>  int lumStride, int chromStride, int srcStride);
>+void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>+  const uint8_t *src, int width, int height,
>+  int lumStride, int chromStride, int srcStride);
> #endif
> 
> av_cold void rgb2rgb_init_x86(void)
>@@ -216,5 +219,8 @@ av_cold void rgb2rgb_init_x86(void)
> if (EXTERNAL_AVX(cpu_flags)) {
> uyvytoyuv422 = ff_uyvytoyuv422_avx;
> }
>+if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>+uyvytoyuv422 = ff_uyvytoyuv422_avx2;
>+}
> #endif
> }
>diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>index 3380a1272c..683bd067a5 100644
>--- a/libswscale/x86/rgb_2_rgb.asm
>+++ b/libswscale/x86/rgb_2_rgb.asm
>@@ -31,9 +31,16 @@ pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 
>12, 15, 14, 13
> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
> pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
> pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
>+pd_permd256_uv: dd 0, 4, 1, 5, 2, 6, 3, 7
> 
> SECTION .text
> 
>+%macro VPERM 5
>+%if mmsize == %2
>+vperm%1 %3, %4, %5
>+%endif
>+%endmacro
>+
> %macro RSHIFT_COPY 3
> ; %1 dst ; %2 src ; %3 shift
> %if cpuflag(avx)
>@@ -198,11 +205,15 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, 
>w, h, lum_stride, chrom_s
> mov  whalfq, wq
> shr  whalfq, 1 ; whalf = width / 2
> 
>-lea srcq, [srcq + wq * 2]
>+leasrcq, [srcq + wq * 2]
> addydstq, wq
> addudstq, whalfq
> addvdstq, whalfq
> 
>+%if mmsize == 32
>+movum15, [pd_permd256_uv]
>+%endif
>+
> .loop_line:
> mov  xq, wq
> mov   wtwoq, wq
>@@ -251,8 +262,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, 
>h, lum_stride, chrom_s
> 
> RSHIFT_COPYm7, m3, 1 ; UYVY UYVY -> YVYU YVY...
> pand   m7, m1 ; YxYx YxYx...
>-
> packuswb   m6, m7 ;  ...
>+
>+VPERM   q, 32, m6, m6, 0xd8
>+
> movu [ydstq + wq], m6
> 
> ; extract y part 2
>@@ -261,8 +274,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, 
>h, lum_stride, chrom_s
> 
> RSHIFT_COPYm7, m5, 1 ; UYVY UYVY -> YVYU YVY...
> pand   m7, m1 ; YxYx YxYx...
>+packuswb   m6, m7 ;  ...
>+
>+VPERM   q, 32, m6, m6, 0xd8
> 
>-packuswbm6, m7 ;  ...
> movu [ydstq + wq + mmsize], m6
> 
> ; extract uv
>@@ -275,17 +290,21 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, 
>w, h, lum_stride, chrom_s
> packuswb   m4, m5   ; UVUV...
> 
> ; U
>-pand   m6, m2, m1 ; UxUx...
>-pand   m7, m4, m1 ; UxUx...
>+pand m6, m2, m1 ; UxUx...
>+pand m7, m4, m1 ; UxUx...
>+packuswb m6, m7 ; 
> 
>-packuswb m6

Re: [FFmpeg-devel] [PATCH] avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation

2021-09-28 Thread chen
Hello,


Excuse me, how about FMADD on AVX2 platform?


For example
+mulps m7, m7, m14
+addps m0, m0, m7

==>


fmadd231ps m0,m7,m14


Regards,
Min Chen


 2021-09-29 09:18:05,mindm...@gmail.com 
>From: Mark Reid 
>
>Only supports float and 16-bit planar formats at the moment.
>Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer some
>speed gains.
>
>f32 1920x1080 1 thread with prelut
>c impl
>1389936500 UNITS in lut3d->interp,   1 runs,  0 skips
>1425800240 UNITS in lut3d->interp,   2 runs,  0 skips
>1433312777 UNITS in lut3d->interp,   4 runs,  0 skips
>1443346798 UNITS in lut3d->interp,   8 runs,  0 skips
>
>sse2
>948662320 UNITS in lut3d->interp,   1 runs,  0 skips
>1101247540 UNITS in lut3d->interp,   2 runs,  0 skips
>1050645695 UNITS in lut3d->interp,   4 runs,  0 skips
>1041102937 UNITS in lut3d->interp,   8 runs,  0 skips
>
>avx
>633837000 UNITS in lut3d->interp,   1 runs,  0 skips
>669452850 UNITS in lut3d->interp,   2 runs,  0 skips
>650716580 UNITS in lut3d->interp,   4 runs,  0 skips
>644698550 UNITS in lut3d->interp,   8 runs,  0 skips
>
>avx2
>354940020 UNITS in lut3d->interp,   1 runs,  0 skips
>362384340 UNITS in lut3d->interp,   2 runs,  0 skips
>356799020 UNITS in lut3d->interp,   4 runs,  0 skips
>357276815 UNITS in lut3d->interp,   8 runs,  0 skips
>
>gbrap16 1920x1080 1 thread with prelut
>c impl
>1445071160 UNITS in lut3d->interp,   1 runs,  0 skips
>1477959120 UNITS in lut3d->interp,   2 runs,  0 skips
>1472102670 UNITS in lut3d->interp,   4 runs,  0 skips
>1462579330 UNITS in lut3d->interp,   8 runs,  0 skips
>
>sse2
>1035437580 UNITS in lut3d->interp,   1 runs,  0 skips
>1050139710 UNITS in lut3d->interp,   2 runs,  0 skips
>1070147205 UNITS in lut3d->interp,   4 runs,  0 skips
>1064583037 UNITS in lut3d->interp,   8 runs,  0 skips
>
>avx
>678089880 UNITS in lut3d->interp,   1 runs,  0 skips
>679112485 UNITS in lut3d->interp,   2 runs,  0 skips
>695527212 UNITS in lut3d->interp,   4 runs,  0 skips
>691300053 UNITS in lut3d->interp,   8 runs,  0 skips
>
>avx2
>372671340 UNITS in lut3d->interp,   1 runs,  0 skips
>373449870 UNITS in lut3d->interp,   2 runs,  0 skips
>383725625 UNITS in lut3d->interp,   4 runs,  0 skips
>382860848 UNITS in lut3d->interp,   8 runs,  0 skips
>
>---
> libavfilter/lut3d.h |  83 
> libavfilter/vf_lut3d.c  |  61 +--
> libavfilter/x86/Makefile|   2 +
> libavfilter/x86/vf_lut3d.asm| 757 
> libavfilter/x86/vf_lut3d_init.c |  88 
> 5 files changed, 935 insertions(+), 56 deletions(-)
> create mode 100644 libavfilter/lut3d.h
> create mode 100644 libavfilter/x86/vf_lut3d.asm
> create mode 100644 libavfilter/x86/vf_lut3d_init.c
>
>diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h
>new file mode 100644
>index 00..ded2a036a5
>--- /dev/null
>+++ b/libavfilter/lut3d.h
>@@ -0,0 +1,83 @@
>+/*
>+ * Copyright (c) 2013 Clément Bœsch
>+ * Copyright (c) 2018 Paul B Mahol
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
>USA
>+ */
>+#ifndef AVFILTER_LUT3D_H
>+#define AVFILTER_LUT3D_H
>+
>+#include "libavutil/pixdesc.h"
>+#include "framesync.h"
>+#include "avfilter.h"
>+
>+enum interp_mode {
>+INTERPOLATE_NEAREST,
>+INTERPOLATE_TRILINEAR,
>+INTERPOLATE_TETRAHEDRAL,
>+INTERPOLATE_PYRAMID,
>+INTERPOLATE_PRISM,
>+NB_INTERP_MODE
>+};
>+
>+struct rgbvec {
>+float r, g, b;
>+};
>+
>+/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT
>+ * of 512x512 (64x64x64) */
>+#

Re: [FFmpeg-devel] [PATCH v2 3/4] libswscale/x86/rgb2rgb: add uyvytoyuv422 avx2

2021-09-29 Thread chen
Hello,

>+pb_shuffle_low: times 4 db 1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, 
>-1, -1
Why do we need "times 4" here?
AVX2 provides the VPBROADCASTQ instruction to load this constant into a SIMD
register.

Moreover, the same algorithm can also be applied to the U/V planes to improve them.

Regards,
Min Chen

At 2021-09-30 09:56:11, "Wu Jianhua"  wrote:
>With the accelerating by means of AVX2, the uyvytoyuv422 can be faster
>
>Performance data(Less is better):
>uyvytoyuv422_sse20.50388
>uyvytoyuv422_avx 0.46132
>uyvytoyuv422_avx20.27309
>
>Signed-off-by: Wu Jianhua 
>---
> libswscale/x86/rgb2rgb.c |  6 
> libswscale/x86/rgb_2_rgb.asm | 60 
> 2 files changed, 53 insertions(+), 13 deletions(-)
>
>diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>index c9ff33ab77..a965a1755c 100644
>--- a/libswscale/x86/rgb2rgb.c
>+++ b/libswscale/x86/rgb2rgb.c
>@@ -164,6 +164,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, 
>uint8_t *vdst,
> void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>  const uint8_t *src, int width, int height,
>  int lumStride, int chromStride, int srcStride);
>+void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>+  const uint8_t *src, int width, int height,
>+  int lumStride, int chromStride, int srcStride);
> #endif
> 

>
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2 3/4] libswscale/x86/rgb2rgb: add uyvytoyuv422 avx2

2021-09-30 Thread chen




At 2021-09-30 15:23:08, "Wu, Jianhua"  wrote:
>Min Chen wrote:
>> Sent: Thursday, September 30, 2021 10:29 AM
>> To: FFmpeg development discussions and patches > de...@ffmpeg.org>
>> Subject: Re: [FFmpeg-devel] [PATCH v2 3/4] libswscale/x86/rgb2rgb: add
>> uyvytoyuv422 avx2
>> 
>> Hello,
>> 
>> >+pb_shuffle_low: times 4 db 1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1,
>> >+-1, -1, -1, -1
>> Why we times 4?
>> AVX2 provided instruction VPBROADCASTQ to load these constant into SIMD
>> register.
>> 
>> Moreover, the plane U/V also apply same algorithm to get improve.
>> 
>> Regards,
>> Min Chen
>> 
>Hi Min Chen,
>
>Much appreciated your helpful suggestions. 
>
>Correct! It's not necessary to use time 4 here.  It's funny that I did try to 
>avoid using it here
>when writing the codes and get no way because I ignored the VBROADCASTI128 
>instruction.
>
>About the UV extracting, I have estimated the new method before making a 
>decision to keep
>using the masterpiece of the previous author. The former is better, and pand 
>instruction has a better
>reciprocal throughput, or issue latency.
>
>Best regards,
>Jianhua



As for VBROADCASTI128: we don't care about the high part of the result, so we only need
the lowest 64 bits of the constant table. VPBROADCASTQ is enough.


Regards,
Min Chen
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] Support for loongson loongarch.

2021-11-02 Thread chen
Hi,


Glad to hear that there is some optimized code for loongarch.
In my view, a machine that can be debugged remotely would help more people focus on loongarch
assembly code. For generic C/C++ code, qemu is enough.


Regards,
Min Chen

At 2021-11-02 20:51:43,   wrote:
>Hello
>
>I am trying to add support for loongarch(a new architecture launched by 
>Loongson) in ffmpeg, 
>Should I donate a test machine or just offer a qemu before uploading patches?
>Also, who should I contact if I want to add loongarch state into 
>fate.ffmpeg.org <http://fate.ffmpeg.org/>.
>Any other suggestions or precautions will be appreciated.
>
>About loongarch:
>Manual can be found here: https://github.com/loongson/LoongArch-Documentation 
><https://github.com/loongson/LoongArch-Documentation>
>Applying cloud host  is available in here: http://www.loongcloud.com.cn/ 
><http://www.loongcloud.com.cn/>
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v3 2/2] lavc/aarch64: add pred16x16 10-bit functions

2021-04-14 Thread chen
I have inlined a few comments for ff_pred16x16_top_dc_neon_10; the others are similar.

At 2021-04-14 20:35:44, "Martin Storsjö"  wrote:
>On Tue, 13 Apr 2021, Mikhail Nitenko wrote:
>
>> Benchmarks:
>> pred16x16_dc_10_c: 124.0
>> pred16x16_dc_10_neon: 97.2
>> pred16x16_horizontal_10_c: 71.7
>> pred16x16_horizontal_10_neon: 66.2
>> pred16x16_top_dc_10_c: 90.7
>> pred16x16_top_dc_10_neon: 71.5
>> pred16x16_vertical_10_c: 64.7
>> pred16x16_vertical_10_neon: 61.7
>
>When posting benchmark numbers, it's nice if you'd mention what CPU it was 
>benchmarked on, as the numbers sometimes differ quite a bit between 
>various CPUs.
>
>>
>> Some functions work slower than C and are left commented out.
>> ---
>> libavcodec/aarch64/h264pred_init.c |  12 +++
>> libavcodec/aarch64/h264pred_neon.S | 117 +
>> 2 files changed, 129 insertions(+)
>>
>> diff --git a/libavcodec/aarch64/h264pred_init.c 
>> b/libavcodec/aarch64/h264pred_init.c
>> index fc8989ae0d..9a1f13910d 100644
>> --- a/libavcodec/aarch64/h264pred_init.c
>> +++ b/libavcodec/aarch64/h264pred_init.c
>> @@ -45,6 +45,11 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t 
>> stride);
>> void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
>> void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
>> 
>> +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
>> +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
>> +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
>> +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
>> +
>> static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
>> const int bit_depth,
>> const int chroma_format_idc)
>> @@ -78,6 +83,12 @@ static av_cold void h264_pred_init_neon(H264PredContext 
>> *h, int codec_id,
>> codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
>> h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
>> }
>> +if (bit_depth == 10) {
>> +h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10;
>> +h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon_10;
>> +h->pred16x16[HOR_PRED8x8] = ff_pred16x16_hor_neon_10;
>> +h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10;
>> +}
>> }
>> 
>> av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
>> @@ -88,3 +99,4 @@ av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, 
>> int codec_id,
>> if (have_neon(cpu_flags))
>> h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
>> }
>> +
>
>Stray newline added
>
>> diff --git a/libavcodec/aarch64/h264pred_neon.S 
>> b/libavcodec/aarch64/h264pred_neon.S
>> index 213b40b3e7..5ce50323f8 100644
>> --- a/libavcodec/aarch64/h264pred_neon.S
>> +++ b/libavcodec/aarch64/h264pred_neon.S
>> @@ -359,3 +359,120 @@ function ff_pred8x8_0l0_dc_neon, export=1
>> dup v1.8b,  v1.b[0]
>> b   .L_pred8x8_dc_end
>> endfunc
>> +
>> +.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
>> +.if \n >= 4 || \hi == 0
>> +ld1 {\rd\().h}[0],  [\rs], \rt
>> +ld1 {\rd\().h}[1],  [\rs], \rt
>> +.endif
>> +.if \n >= 4 || \hi == 1
>> +ld1 {\rd\().h}[2],  [\rs], \rt
>> +ld1 {\rd\().h}[3],  [\rs], \rt
>> +.endif
>> +.if \n == 8
>> +ld1 {\rd\().h}[4],  [\rs], \rt
>> +ld1 {\rd\().h}[5],  [\rs], \rt
>> +ld1 {\rd\().h}[6],  [\rs], \rt
>> +ld1 {\rd\().h}[7],  [\rs], \rt
>> +.endif
>> +.endm
>> +
>> +// slower than C
>> +/*
>> +function ff_pred16x16_128_dc_neon_10, export=1
>> +moviv0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
>> +
>> +b   .L_pred16x16_dc_10_end
>> +endfunc
>> +*/
>> +
>> +function ff_pred16x16_top_dc_neon_10, export=1
>> +sub x2,  x0,  x1
>> +
>> +ld1 {v0.8h, v1.8h},  [x2]
>> +
>> +addvh0,  v0.8h
>> +addvh1,  v1.8h
>> +

>> +add v0.4h, v0.4h, v1.4h
ld1+addv+addv+add  ==>  latency 5+5+5+2=17, throughput 1+1+1+1/2=3.5


Dynamic range analysis: the sum of 16 10-bit values needs at most 14 bits


The new instruction sequence could be:
ld1+add+addp+addp+addp ==> latency 5+2+2+2+2=13, throughput 1+1/2+1/2+1/2+1/2=3
or
ld1+add+addv ==> latency 5+2+5=12, throughput 1+1/2+1=2.5


btw: we could replace LD1 with LDP to get more bandwidth.


>> +
>> +rshrn   v0.4h,  v0.4s,  #4
>> +dup v0.8h, v0.h[0]
>> +b   .L_pred16x16_dc_10_end
>> +endfunc
>> +
>> +// slower than C
>> +/*
>> +function ff_pred16x16_left_dc_neon_10, export=1
>> +sub x2,  x0,  #2 // access to the "left" column
>> +ldcol.16v0,  x2,  x1,  8
>> +ldcol.16v1,  x2,  x1,  8 // load "left" column
>> +
>> +   

[FFmpeg-devel] Fwd: [PATCH] avfilter/vf_convolution: add 16-column operation for filter_column() to prepare for x86 SIMD.

2019-12-01 Thread chen


> 下面是被转发的邮件:
> 
> 发件人: chen 
> 主题: [FFmpeg-devel] [PATCH] avfilter/vf_convolution: add 16-column operation 
> for filter_column() to prepare for x86 SIMD.
> 日期: 2019年12月02日 GMT+8 11:36:50
> 收件人: xuju...@sjtu.edu.cn
> 
> In this case, modifying filter_slice(…) is unnecessary because of your generic 
> version of filter_column16(…).
> I suggest making a 16-aligned path in filter_column16(…) directly.
> For example
> 
> for(length / 16)
> {
> …
> }
> 
> for(length % 16)
> {
> …
> }
> 

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avfilter/vf_convolution: add 16-column operation for filter_column() to prepare for x86 SIMD.

2019-12-01 Thread chen
I have a small suggestion about filter_column16(..) [the function]


Firstly, the function's name is easily confused with filter16_column(..)


Secondly, the function's algorithm is based on the row direction, which means reduced
address-calculation operations and less cache performance; their cost may be higher
than the calculation cost itself.


To make this clearer, I give my toy patch here; I verified it with the command line below


 ./ffmpeg -s 1280*720 -pix_fmt yuv420p -i ~/git/sister_720x1280.yuv -vf 
convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 
7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 
2000 -benchmark -f null /dev/null


The result:
Origin version:   utime=7.359s stime=0.138s rtime=1.664s
Song version:utime=5.320s stime=0.133s rtime=1.250s
My version:   utime=2.930s stime=0.122s rtime=0.794s


ps: since the function processes up to 16 pixels each time, if we split the path
into 16 and non-16 cases, I get 1.934s here.




My patch is based on today's head; I have also resolved Song's merge conflict.


 Patch Start 
diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
index 5909fea..708732a 100644
--- a/libavfilter/vf_convolution.c
+++ b/libavfilter/vf_convolution.c
@@ -521,6 +521,61 @@ static int filter_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs)
 continue;
 }
 
+if (mode == MATRIX_COLUMN && s->filter[plane] != filter_column){
+for (y = slice_start; y < slice_end - 16; y+=16) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = radius * stride;
+for (x = 0; x < radius; x++) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = x * stride;
+
+s->setup[plane](radius, c, src, stride, x, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, 1, rdiv,
+bias, matrix, c, 16, radius,
+dstride, stride);
+}
+s->setup[plane](radius, c, src, stride, radius, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
+rdiv, bias, matrix, c, 16, radius,
+dstride, stride);
+for (x = sizew - radius; x < sizew; x++) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = x * stride;
+
+s->setup[plane](radius, c, src, stride, x, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, 1, rdiv,
+bias, matrix, c, 16, radius,
+dstride, stride);
+}
+}
+if (y < slice_end){
+const int xoff = (y - slice_start) * bpc;
+const int yoff = radius * stride;
+for (x = 0; x < radius; x++) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = x * stride;
+
+s->setup[plane](radius, c, src, stride, x, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, 1, rdiv,
+bias, matrix, c, slice_end - y, radius,
+dstride, stride);
+}
+s->setup[plane](radius, c, src, stride, radius, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
+rdiv, bias, matrix, c, slice_end - y, radius,
+dstride, stride);
+for (x = sizew - radius; x < sizew; x++) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = x * stride;
+
+s->setup[plane](radius, c, src, stride, x, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, 1, rdiv,
+bias, matrix, c, slice_end - y, radius,
+dstride, stride);
+}
+}
+}
+else {
 for (y = slice_start; y < slice_end; y++) {
 const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc : 
radius * bpc;
 const int yoff = mode == MATRIX_COLUMN ? radius * stride : 0;
@@ -551,6 +606,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs)
 dst += dstride;
 }
 }
+}
 
 return 0;
 }
diff --git a/libavfilter/x86/vf_convolution_init.c 
b/libavfilter/x86/vf_convolution_init.c
index 5143240..fcc9ae8 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -29,6 +29,56 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width,
 

Re: [FFmpeg-devel] [PATCH] avfilter/vf_convolution: add 16-column operation for filter_column() to prepare for x86 SIMD.

2019-12-01 Thread chen
I have a small suggestion about filter_column16(..) [the function]


Firstly, the function's name is easily confused with filter16_column(..)


Secondly, the function's algorithm is based on the row direction, which means reduced
address-calculation operations and less cache performance; their cost may be higher
than the calculation cost itself.


To make this clearer, I give my toy patch here; I verified it with the command line below


 ./ffmpeg -s 1280*720 -pix_fmt yuv420p -i ~/git/sister_720x1280.yuv -vf 
convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 
7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 
2000 -benchmark -f null /dev/null


The result:
Origin version:   utime=7.359s stime=0.138s rtime=1.664s
Song version:utime=5.320s stime=0.133s rtime=1.250s
My version:   utime=2.930s stime=0.122s rtime=0.794s




My patch is based on today's head; I have also resolved Song's merge conflict.


 Patch Start 
diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
index 5909fea..708732a 100644
--- a/libavfilter/vf_convolution.c
+++ b/libavfilter/vf_convolution.c
@@ -521,6 +521,61 @@ static int filter_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs)
 continue;
 }
 
+if (mode == MATRIX_COLUMN && s->filter[plane] != filter_column){
+for (y = slice_start; y < slice_end - 16; y+=16) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = radius * stride;
+for (x = 0; x < radius; x++) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = x * stride;
+
+s->setup[plane](radius, c, src, stride, x, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, 1, rdiv,
+bias, matrix, c, 16, radius,
+dstride, stride);
+}
+s->setup[plane](radius, c, src, stride, radius, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
+rdiv, bias, matrix, c, 16, radius,
+dstride, stride);
+for (x = sizew - radius; x < sizew; x++) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = x * stride;
+
+s->setup[plane](radius, c, src, stride, x, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, 1, rdiv,
+bias, matrix, c, 16, radius,
+dstride, stride);
+}
+}
+if (y < slice_end){
+const int xoff = (y - slice_start) * bpc;
+const int yoff = radius * stride;
+for (x = 0; x < radius; x++) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = x * stride;
+
+s->setup[plane](radius, c, src, stride, x, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, 1, rdiv,
+bias, matrix, c, slice_end - y, radius,
+dstride, stride);
+}
+s->setup[plane](radius, c, src, stride, radius, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
+rdiv, bias, matrix, c, slice_end - y, radius,
+dstride, stride);
+for (x = sizew - radius; x < sizew; x++) {
+const int xoff = (y - slice_start) * bpc;
+const int yoff = x * stride;
+
+s->setup[plane](radius, c, src, stride, x, width, y, 
height, bpc);
+s->filter[plane](dst + yoff + xoff, 1, rdiv,
+bias, matrix, c, slice_end - y, radius,
+dstride, stride);
+}
+}
+}
+else {
 for (y = slice_start; y < slice_end; y++) {
 const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc : 
radius * bpc;
 const int yoff = mode == MATRIX_COLUMN ? radius * stride : 0;
@@ -551,6 +606,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs)
 dst += dstride;
 }
 }
+}
 
 return 0;
 }
diff --git a/libavfilter/x86/vf_convolution_init.c 
b/libavfilter/x86/vf_convolution_init.c
index 5143240..fcc9ae8 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -29,6 +29,56 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width,
 const uint8_t *c[], int peak, int radius,
 int dstride, int stride);
 
+static void filter_colum

Re: [FFmpeg-devel] [PATCH] avfilter/vf_convolution: add 16-column operation for filter_column() to prepare for x86 SIMD.

2019-12-02 Thread chen
This is only a toy; it depends on the compiler.
On my PC, it helps my old compiler version generate movaps rather than movups.


At 2019-12-02 17:21:58, "Carl Eugen Hoyos"  wrote:
>Am Mo., 2. Dez. 2019 um 08:33 Uhr schrieb chen :
>
>> +#define __assume(cond) do { if (!(cond)) __builtin_unreachable(); } 
>> while (0)
>
>We currently don't do that.
>
>If you have a testcase where it makes a big difference,
>adding it could be discussed but has to be checked in
>configure and added to a libavutil header.
>
>Carl Eugen
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86 SIMD for filter_column()

2019-12-03 Thread chen
comments inline in code


At 2019-12-03 15:52:07, xuju...@sjtu.edu.cn wrote:
>From: Xu Jun 
>
>+; void filter_column(uint8_t *dst, int height,
>+; float rdiv, float bias, const int *const matrix,
>+; const uint8_t *c[], int length, int radius,
>+; int dstride, int stride);
>+
>+%if ARCH_X86_64
>+INIT_XMM sse4
>+%if UNIX64
>+cglobal filter_column, 8, 15, 7, dst, height, matrix, ptr, width, rad, 
>dstride, stride, i, ci, dst_off, off16, c_off, sum, r
>+%else
>+cglobal filter_column, 8, 15, 7, dst, height, rdiv, bias, matrix, ptr, width, 
>rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r

>+%endif
no idea, these are difficult to read and understand




>+
>+%if WIN64
>+SWAP m0, m2
>+SWAP m1, m3
>+mov r2q, matrixmp
>+mov r3q, ptrmp
>+mov r4q, widthmp
>+mov r5q, radmp
>+mov r6q, dstridemp
>+mov r7q, stridemp
>+DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, 
>dst_off, off16, c_off, sum, r
>+%endif
>+
>+movsxdifnidn widthq, widthd
>+movsxdifnidn radq, radd
>+movsxdifnidn dstrideq, dstrided
>+movsxdifnidn strideq, strided
>+sal radq, 1

>+add radq, 1 ;2*radius+1
I don't know how this compares to "LEA x,[y*2+1]".
And I don't want to get into a SAL vs. SHL discussion.


>+movsxdifnidn heightq, heightd
>+VBROADCASTSS m0, m0
>+VBROADCASTSS m1, m1
>+pxor m6, m6
>+movss m5, [half]
>+VBROADCASTSS m5, m5
>+
>+xor dst_offq, dst_offq
>+xor c_offq, c_offq
>+
>+.loopy:
>+xor off16q, off16q
>+cmp widthq, mmsize/4
>+jl .loopr
>+
>+mov rq, widthq
>+and rq, mmsize/4-1
>+sub widthq, rq
>+

>+.loop16: ;parallel process 16 elements in a row
Processing only 4 columns per loop: do you mean that we want to leave lots of registers
unused?
We target X64, so we have 16 XMM registers.


>+pxor m4, m4
>+xor iq, iq
>+.loopi:

>+movss m2, [matrixq + 4*iq]
I don't see why you are working on the float data path here; we are lucky that Intel CPUs
do not seem to have a penalty for this.


>+VBROADCASTSS m2, m2
>+mov ciq, [ptrq + iq * gprsize]
>+movss m3, [ciq + c_offq] ;c[i][y*stride + off16]
>+punpcklbw m3, m6

>+punpcklwd m3, m6
Since you claim SSE4, the PMOVZXBD instruction is available; moreover, an SSE4
register can hold a full 16 uint8 values, but only 4 of them are loaded here.


>+pmulld m2, m3
>+paddd m4, m2
>+
>+add iq, 1

>+cmp iq, radq
If you initialize iq to radq and decrement it on each iteration, you can save one
instruction.
I know iq is used as an index in the loop, but we can find some trick for that.
>+jl .loopi
>+
>+cvtdq2ps m4, m4
>+mulps m4, m0 ; sum *= rdiv
>+addps m4, m1 ; sum += bias

>+addps m4, m5 ; sum += 0.5
I don't know whether there would be a precision mismatch if we pre-compute (bias+0.5)


>+cvttps2dq m4, m4
>+packssdw m4, m4
>+packuswb m4, m4
>+movss [dstq + dst_offq], m4
>+add c_offq, mmsize/4
>+add dst_offq, mmsize/4
>+
>+add off16q, mmsize/4
>+cmp off16q, widthq
>+jl .loop16
>+
>+add widthq, rq
>+cmp off16q, widthq
>+jge .paraend
>+

>+.loopr:
I am not sure about this loop; if we can read beyond the end, we can reuse the SIMD code above


>+xor sumd, sumd
>+xor iq, iq
>+.loopr_i:
>+mov ciq, [ptrq + iq * gprsize]
>+movzx rd, byte [ciq + c_offq]
>+imul rd, [matrixq + 4*iq]
>+add sumd, rd
>+
>+add iq, 1
>+cmp iq, radq
>+jl .loopr_i
>+
>+pxor m4, m4
>+cvtsi2ss m4, sumd
>+mulss m4, m0 ; sum *= rdiv
>+addss m4, m1 ; sum += bias
>+addss m4, m5 ; sum += 0.5
>+cvttps2dq m4, m4
>+packssdw m4, m4
>+packuswb m4, m4
>+movd sumd, m4
>+mov [dstq + dst_offq], sumb
>+add c_offq, 1
>+add dst_offq, 1
>+add off16q, 1
>+cmp off16q, widthq
>+jl .loopr
>+
>+.paraend:
>+sub c_offq, widthq
>+sub dst_offq, widthq
>+add c_offq, strideq
>+add dst_offq, dstrideq
>+
>+sub heightq, 1
>+cmp heightq, 0
>+jg .loopy
>+
>+.end:
>+RET

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86 SIMD for filter_column()

2019-12-03 Thread chen


At 2019-12-04 08:59:08, "Song, Ruiling"  wrote:
>> -Original Message-
>> From: ffmpeg-devel  On Behalf Of
>> chen
>> Sent: Tuesday, December 3, 2019 4:59 PM
>> To: FFmpeg development discussions and patches > de...@ffmpeg.org>
>> Subject: Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86
>> SIMD for filter_column()
>> 
>> comments inline in code
>> 
>> 
>> At 2019-12-03 15:52:07, xuju...@sjtu.edu.cn wrote:
>> >From: Xu Jun 
>[...]
>> >+
>> >+cvtdq2ps m4, m4
>> >+mulps m4, m0 ; sum *= rdiv
>> >+addps m4, m1 ; sum += bias
>> 
>> >+addps m4, m5 ; sum += 0.5
>> I don't know how about precision mismatch if we pre-compute (bias+0.5)

>I think it is hard to prove it is safe to do pre-compute.
Agreed, I am also worried about precision issues, since floating-point 
operations depend on execution order.
How about ROUNDPS?


>
>> 
>> 
>> >+cvttps2dq m4, m4
>> >+packssdw m4, m4
>> >+packuswb m4, m4
>> >+movss [dstq + dst_offq], m4
>> >+add c_offq, mmsize/4
>> >+add dst_offq, mmsize/4
>> >+
>> >+add off16q, mmsize/4
>> >+cmp off16q, widthq
>> >+jl .loop16
>> >+
>> >+add widthq, rq
>> >+cmp off16q, widthq
>> >+jge .paraend
>> >+
>> 
>> >+.loopr:
>> no idea about this loop, if we can read beyond, we can reuse above SIMD
>> code
>Reuse above SIMD code may write to the memory that does not belong to this 
>slice-thread.

>IMO, the code to handle remainder columns is still necessary.


Depends on algorithm & size,
For example width=23
Process #0 [0:15]
Process #1 [7:22]
Both of them cover a multiple of 16 columns

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86 SIMD for filter_column()

2019-12-04 Thread chen


At 2019-12-04 16:51:52, "Paul B Mahol"  wrote:
>On 12/4/19, Song, Ruiling  wrote:
>>> -Original Message-
>>> From: ffmpeg-devel  On Behalf Of
>>> chen

>>> >> At 2019-12-03 15:52:07, xuju...@sjtu.edu.cn wrote:
>>> >> >From: Xu Jun 
>>> >[...]
>>> >> >+
>>> >> >+cvtdq2ps m4, m4
>>> >> >+mulps m4, m0 ; sum *= rdiv
>>> >> >+addps m4, m1 ; sum += bias
>>> >>
>>> >> >+addps m4, m5 ; sum += 0.5
>>> >> I don't know how about precision mismatch if we pre-compute (bias+0.5)
>>>
>>> >I think it is hard to prove it is safe to do pre-compute.
>>> Agree, I also worried precision issue since float operator is execute
>>> order
>>> dependent.
>>> How about ROUNDPS?

>> Seems no exactly match.
Funny, I guess it is a different issue, such as a mistake in the instruction's imm field.


>>> >> >+cvttps2dq m4, m4
>>> >> >+packssdw m4, m4
>>> >> >+packuswb m4, m4
>>> >> >+movss [dstq + dst_offq], m4
>>> >> >+add c_offq, mmsize/4
>>> >> >+add dst_offq, mmsize/4
>>> >> >+
>>> >> >+add off16q, mmsize/4
>>> >> >+cmp off16q, widthq
>>> >> >+jl .loop16
>>> >> >+
>>> >> >+add widthq, rq
>>> >> >+cmp off16q, widthq
>>> >> >+jge .paraend
>>> >> >+
>>> >>
>>> >> >+.loopr:
>>> >> no idea about this loop, if we can read beyond, we can reuse above
>>> >> SIMD
>>> >> code
>>> >Reuse above SIMD code may write to the memory that does not belong to
>>> this slice-thread.
>>>
>>> >IMO, the code to handle remainder columns is still necessary.
>>>
>>>
>>> Depends on algorithm & size,
>>> For example width=23
>>> Process #0 [0:15]
>>> Process #1 [7:22]
>>> Both of them is multiple of 16
>> Sounds interesting. But FFmpeg does not do like this now.
>> One question is will this get a penalty for writing to same address of
>> memory (both are writing to 7-15) from different threads?
>
>Yes, and even bad results may happen.

>
That is my fault, I did not explain it clearly: each "Process #x" is one 
iteration of the loop. I guess the function must be atomic; we can place any 
threading that works on the same address area.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 3/3] avfilter/vf_convolution: Add X86 SIMD optimizations for filter_column()

2019-12-22 Thread chen
comments inlined
At 2019-12-22 16:37:03, xuju...@sjtu.edu.cn wrote:
>From: Xu Jun 
>
>Performance improves about 10% compared to v1.
>
>Tested using this command:
>./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 
>6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 
>9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 
>-f null /dev/null -benchmark
>
>after patch:
>frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=  24x
>video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
>muxing overhead: unknown
>bench: utime=21.540s stime=2.091s rtime=7.197s
>
>before patch:
>frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.5x
>video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
>muxing overhead: unknown
>bench: utime=74.377s stime=1.880s rtime=16.420s
>
>Signed-off-by: Xu Jun 
>---
> libavfilter/x86/vf_convolution.asm| 202 ++
> libavfilter/x86/vf_convolution_init.c |   9 ++
> 2 files changed, 211 insertions(+)
>
>diff --git a/libavfilter/x86/vf_convolution.asm 
>b/libavfilter/x86/vf_convolution.asm
>index 2a09374b00..4c700656d6 100755
>--- a/libavfilter/x86/vf_convolution.asm
>+++ b/libavfilter/x86/vf_convolution.asm
>@@ -22,6 +22,8 @@
> 
> SECTION_RODATA
> half:   dd 0.5

>+shuf_init:   ddq 0x80808003808080028080800180808000
TBD
PS: constants defined as bytes (db) or words (dw) are more readable; in this 
case you use it with pshufb, so bytes.


>+shuf_step: ddq 0x0004000400040004
> 
> SECTION .text
> 
>@@ -285,3 +287,203 @@ sub widthq, rq
> .end:
> RET
> %endif
>+
>+; void filter_column(uint8_t *dst, int height,
>+; float rdiv, float bias, const int *const matrix,
>+; const uint8_t *c[], int length, int radius,
>+; int dstride, int stride);
>+
>+%macro COMPUTE_4COL 1

>+pshufb m7, m6, m4; get 4 uint8s from the 16 uint8s
Unnecessary, see below comment


>+pmulld m7, m5

>+paddd m1%1, m7
Not an error, but in general this accumulation creates a new dependency chain 
that may stall the pipeline; I suggest accumulating into 4 registers in parallel.
In this case, I am not sure about the dynamic range of the matrix, so I am not 
sure whether it is fine or will overflow when summing (2 * radius + 1) elements.


>+%endmacro
>+
>+%macro CVT_PACK_COL 1
>+cvtdq2ps m1%1, m1%1
>+mulps m1%1, m0 ; sum *= rdiv
>+addps m1%1, m1 ; sum += bias
>+addps m1%1, m3 ; sum += 0.5
>+cvttps2dq m1%1, m1%1
>+packssdw m1%1, m1%1
>+packuswb m1%1, m1%1
>+%endmacro
>+
>+%if ARCH_X86_64
>+INIT_XMM sse4
>+%if UNIX64
>+cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, 
>dstride, stride, \
>+i, ci, ystride, sum, r, off16
>+%else
>+cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, 
>width, rad, dstride, stride, \
>+i, ci, ystride, sum, r, off16
>+%endif
>+
>+%if WIN64
>+SWAP m0, m2
>+SWAP m1, m3
>+mov r2q, matrixmp
>+mov r3q, ptrmp
>+mov r4q, widthmp
>+mov r5q, radmp
>+mov r6q, dstridemp
>+mov r7q, stridemp
>+DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \
>+i, ci, ystride, sum, r, off16
>+%endif
>+
>+movsxdifnidn widthq, widthd
>+movsxdifnidn radq, radd
>+lea radq, [radq * 2 + 1]
>+movsxdifnidn dstrideq, dstrided
>+movsxdifnidn strideq, strided
>+movsxdifnidn heightq, heightd
>+
>+VBROADCASTSS m0, m0; rdiv
>+VBROADCASTSS m1, m1; bias
>+pxor m2, m2; zero
>+movss m3, [half]
>+VBROADCASTSS m3, m3; 0.5

>+movdqu m8, [shuf_init]  ; shuffle initialization
TBD


>+movdqu m9, [shuf_step]; shuffle step
>+
>+xor ystrideq, ystrideq; y*stride
>+
>+cmp widthq, mmsize;if width<16 run loopr, width=16 run 16 parallel
>+jl .less16
>+
>+.equal16:
>+pxor m10, m10
>+pxor m11, m11
>+pxor m12, m12
>+pxor m13, m13

>+; m10-13 hold sums
Not an error; however, using m0-m7 saves a 1-byte instruction prefix, which 
gives a small performance improvement in the inner loop.


>+
>+lea iq, [radq - 1]
>+.loopi:
>+movd m5, [matrixq + 4*iq]; matrix[i]

>+VBROADCASTSS m5, m5
Since you target SSE4, PSHUFD may be better; however, it is not a problem if 
you want to upgrade to AVX and above


>+mov ciq, [ptrq + iq * gprsize]

>+movdqu m6, [ciq + ystrideq]; c[i][y*stride] 16 uint8s
SSE4 provides PMOVZXBD, which lets you drop the PSHUFB above and the series of constant loads


>+
>+;m4 controls shuffle
>+movdqa m4, m8
>+COMPUTE_4COL 0; process 0-3 cols, sum in m10
>+paddd m4, m9
>+COMPUTE_4COL 1; process 4-7 cols, sum in m11
>+paddd m4, m9
>+COMPUTE_4COL 2; process 8-11 cols, sum in m12
>+paddd m4, m9
>+COMPUTE_4COL 3; process 12-15 cols, sum in m13
>+
>+sub iq, 1
>+jns .loopi
>+
>+CVT_PACK_COL 0; process 0-3 cols, result in m10's l

Re: [FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add hevc epel/qpel assembly

2021-04-28 Thread chen
inline comment with prefix [MC]

At 2021-04-29 03:50:26, "Josh Dekker"  wrote:
>From: Rafal Dabrowa 
>
>Benchmarked on Apple M1:
>
>put_hevc_epel_bi_h4_8_c: 69.9
>put_hevc_epel_bi_h4_8_neon: 15.4
>put_hevc_epel_bi_h6_8_c: 137.1
>put_hevc_epel_bi_h6_8_neon: 31.9
>put_hevc_epel_bi_h8_8_c: 124.6
>put_hevc_epel_bi_h8_8_neon: 40.9
>put_hevc_epel_bi_h12_8_c: 331.9
>put_hevc_epel_bi_h12_8_neon: 72.4
>put_hevc_epel_bi_h16_8_c: 383.4
>put_hevc_epel_bi_h16_8_neon: 124.9
>put_hevc_epel_bi_h24_8_c: 771.6
>put_hevc_epel_bi_h24_8_neon: 209.6
>put_hevc_epel_bi_h32_8_c: 1324.4
>put_hevc_epel_bi_h32_8_neon: 389.4
>put_hevc_epel_bi_h48_8_c: 2869.6
>put_hevc_epel_bi_h48_8_neon: 730.1
>put_hevc_epel_bi_h64_8_c: 4992.6
>put_hevc_epel_bi_h64_8_neon: 1490.4
>put_hevc_epel_bi_hv4_8_c: 163.4
>put_hevc_epel_bi_hv4_8_neon: 38.4
>put_hevc_epel_bi_hv6_8_c: 292.4
>put_hevc_epel_bi_hv6_8_neon: 66.4
>put_hevc_epel_bi_hv8_8_c: 375.6
>put_hevc_epel_bi_hv8_8_neon: 62.4
>put_hevc_epel_bi_hv12_8_c: 831.6
>put_hevc_epel_bi_hv12_8_neon: 134.9
>put_hevc_epel_bi_hv16_8_c: 1257.9
>put_hevc_epel_bi_hv16_8_neon: 214.1
>put_hevc_epel_bi_hv24_8_c: 2666.6
>put_hevc_epel_bi_hv24_8_neon: 391.1
>put_hevc_epel_bi_hv32_8_c: 4722.4
>put_hevc_epel_bi_hv32_8_neon: 734.1
>put_hevc_epel_bi_hv48_8_c: 10100.4
>put_hevc_epel_bi_hv48_8_neon: 1570.4
>put_hevc_epel_bi_hv64_8_c: 17613.4
>put_hevc_epel_bi_hv64_8_neon: 2810.6
>put_hevc_epel_bi_v4_8_c: 77.4
>put_hevc_epel_bi_v4_8_neon: 18.6
>put_hevc_epel_bi_v6_8_c: 142.1
>put_hevc_epel_bi_v6_8_neon: 27.1
>put_hevc_epel_bi_v8_8_c: 192.9
>put_hevc_epel_bi_v8_8_neon: 9.1
>put_hevc_epel_bi_v12_8_c: 415.6
>put_hevc_epel_bi_v12_8_neon: 55.6
>put_hevc_epel_bi_v16_8_c: 487.6
>put_hevc_epel_bi_v16_8_neon: 61.9
>put_hevc_epel_bi_v24_8_c: 957.4
>put_hevc_epel_bi_v24_8_neon: 131.1
>put_hevc_epel_bi_v32_8_c: 1540.4
>put_hevc_epel_bi_v32_8_neon: 210.4
>put_hevc_epel_bi_v48_8_c: 3242.9
>put_hevc_epel_bi_v48_8_neon: 465.6
>put_hevc_epel_bi_v64_8_c: 5441.1
>put_hevc_epel_bi_v64_8_neon: 818.1
>put_hevc_epel_h4_8_c: 41.6
>put_hevc_epel_h4_8_neon: 8.4
>put_hevc_epel_h6_8_c: 110.1
>put_hevc_epel_h6_8_neon: 24.4
>put_hevc_epel_h8_8_c: 41.6
>put_hevc_epel_h8_8_neon: 17.6
>put_hevc_epel_h12_8_c: 183.1
>put_hevc_epel_h12_8_neon: 58.1
>put_hevc_epel_h16_8_c: 146.6
>put_hevc_epel_h16_8_neon: 83.4
>put_hevc_epel_h24_8_c: 240.4
>put_hevc_epel_h24_8_neon: 157.1
>put_hevc_epel_h32_8_c: 431.1
>put_hevc_epel_h32_8_neon: 292.1
>put_hevc_epel_h48_8_c: 858.6
>put_hevc_epel_h48_8_neon: 557.4
>put_hevc_epel_h64_8_c: 1536.6
>put_hevc_epel_h64_8_neon: 1116.6
>put_hevc_epel_hv4_8_c: 152.6
>put_hevc_epel_hv4_8_neon: 34.9
>put_hevc_epel_hv6_8_c: 269.6
>put_hevc_epel_hv6_8_neon: 61.6
>put_hevc_epel_hv8_8_c: 307.4
>put_hevc_epel_hv8_8_neon: 76.9
>put_hevc_epel_hv12_8_c: 702.6
>put_hevc_epel_hv12_8_neon: 113.1
>put_hevc_epel_hv16_8_c: 1081.4
>put_hevc_epel_hv16_8_neon: 190.6
>put_hevc_epel_hv24_8_c: 2276.1
>put_hevc_epel_hv24_8_neon: 345.1
>put_hevc_epel_hv32_8_c: 4068.6
>put_hevc_epel_hv32_8_neon: 780.4
>put_hevc_epel_hv48_8_c: 8754.1
>put_hevc_epel_hv48_8_neon: 1394.4
>put_hevc_epel_hv64_8_c: 15402.1
>put_hevc_epel_hv64_8_neon: 2616.6
>put_hevc_epel_uni_hv4_8_c: 142.1
>put_hevc_epel_uni_hv4_8_neon: 46.6
>put_hevc_epel_uni_hv6_8_c: 298.4
>put_hevc_epel_uni_hv6_8_neon: 72.4
>put_hevc_epel_uni_hv8_8_c: 352.9
>put_hevc_epel_uni_hv8_8_neon: 75.1
>put_hevc_epel_uni_hv12_8_c: 776.6
>put_hevc_epel_uni_hv12_8_neon: 125.9
>put_hevc_epel_uni_hv16_8_c: 1216.1
>put_hevc_epel_uni_hv16_8_neon: 199.1
>put_hevc_epel_uni_hv24_8_c: 2577.9
>put_hevc_epel_uni_hv24_8_neon: 386.6
>put_hevc_epel_uni_hv32_8_c: 4554.9
>put_hevc_epel_uni_hv32_8_neon: 710.9
>put_hevc_epel_uni_hv48_8_c: 9869.1
>put_hevc_epel_uni_hv48_8_neon: 1499.4
>put_hevc_epel_uni_hv64_8_c: 17307.1
>put_hevc_epel_uni_hv64_8_neon: 2750.6
>put_hevc_epel_uni_v4_8_c: 59.9
>put_hevc_epel_uni_v4_8_neon: 21.9
>put_hevc_epel_uni_v6_8_c: 136.1
>put_hevc_epel_uni_v6_8_neon: 19.6
>put_hevc_epel_uni_v8_8_c: 222.4
>put_hevc_epel_uni_v8_8_neon: 17.1
>put_hevc_epel_uni_v12_8_c: 481.6
>put_hevc_epel_uni_v12_8_neon: 42.4
>put_hevc_epel_uni_v16_8_c: 424.4
>put_hevc_epel_uni_v16_8_neon: 63.4
>put_hevc_epel_uni_v24_8_c: 1184.1
>put_hevc_epel_uni_v24_8_neon: 109.9
>put_hevc_epel_uni_v32_8_c: 1401.1
>put_hevc_epel_uni_v32_8_neon: 182.9
>put_hevc_epel_uni_v48_8_c: 2933.9
>put_hevc_epel_uni_v48_8_neon: 388.9
>put_hevc_epel_uni_v64_8_c: 5044.9
>put_hevc_epel_uni_v64_8_neon: 701.1
>put_hevc_epel_v4_8_c: 31.9
>put_hevc_epel_v4_8_neon: 13.4
>put_hevc_epel_v6_8_c: 95.1
>put_hevc_epel_v6_8_neon: 16.4
>put_hevc_epel_v8_8_c: 98.9
>put_hevc_epel_v8_8_neon: 26.1
>put_hevc_epel_v12_8_c: 283.9
>put_hevc_epel_v12_8_neon: 36.9
>put_hevc_epel_v16_8_c: 229.6
>put_hevc_epel_v16_8_neon: 41.9
>put_hevc_epel_v24_8_c: 376.4
>put_hevc_epel_v24_8_neon: 90.4
>put_hevc_epel_v32_8_c: 577.4
>put_hevc_epel_v32_8_neon: 188.4
>put_hevc_epel_v48_8_c: 1058.4
>put_hevc_epel_v48_8_neon: 350.6
>put_hevc_epel_v64_8_c: 1647.4
>put_hevc_epel_v64_8_neon: 647.9
>put_hevc_pel_bi_pixels4

Re: [FFmpeg-devel] [PATCH 6/6] avcodec: add vvdec H.266/VVC decoder

2020-12-21 Thread chen
A small update:
Only 163 sequences passed because they had not updated their CMakeLists.txt.
I have updated CMakeLists.txt and these patches in my GitHub tree as well, and 
I found that 81.93% (195/238) passed.


Moreover, there are two field-coded video clips; the decoder works, but outputs 
the top/bottom fields separately,
so the hash does not match VTM-11.0.


Overall, we can expect a total of 82.77% ((195+2)/238) matched.


At 2020-12-22 01:40:43, "Paul B Mahol"  wrote:
>On Mon, Dec 21, 2020 at 7:08 AM Nuo Mi  wrote:
>
>> you can download test clips here:
>>
>> https://www.itu.int/wftp3/av-arch/jvet-site/bitstream_exchange/VVC/under_test/VTM-11.0/
>>
>> 68.48% (163/238) clips are md5 matched with VTM 11:
>>

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 6/6] avcodec: add vvdec H.266/VVC decoder

2020-12-23 Thread chen



At 2020-12-23 23:38:18, "Nuo Mi"  wrote:
>On Wed, Dec 23, 2020 at 10:00 PM Lynne  wrote:
>
>> Dec 23, 2020, 14:07 by nuomi2...@gmail.com:
>>
>> > Hi Lynne & James,
>> > Do not worry about the dav1d things that happened on vvcdec. It just a
>> > reference code like libaom.
>> >
>>
>> libaom does encoding and decoding. And most people only use
>> it for encoding, as its not a very fast decoder. As-is, this patch
>> only does decoding.
>>
>how about we replace vvdec with vtm. I guess it's not faster than vvdec,
>but it has an encoder.

>
There is the vvenc project for the encoder; it is a subset of VTM
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Buying and hosting a HiFive RISC-V system

2021-01-10 Thread chen
In my evaluation, RISC-V code density is 60% compared to ARM; with the 
C extension, it rises to 80%.
That may be a big problem for running something as large as ffmpeg on real 
products, but it also leaves us more room to improve ffmpeg on it.


At 2021-01-11 04:21:07, "Kieran Kunhya"  wrote:
>Hello,
>
>Lynne has suggested on IRC that we purchase one or more of these:
>https://www.sifive.com/boards/hifive-unmatched
>
>I think this is an interesting idea as RISC-V is an important platform for
>the future (like M1).
>I'll likely have to buy from Mouser (as I'm not sure SPI will accept
>CrowdSupply) and there is a long lead-time for it:
>https://www.mouser.co.uk/ProductDetail/SiFive/HF105-000?qs=zW32dvEIR3vHEV%2FPYYkdMA==
>
>Also, I'll have to claim for a case and M.2 SSD.
>
>I am happy to host this like with the Apple M1.
>
>Regards,
>Kieran Kunhya
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] Patch for libx265 memory leak

2024-11-11 Thread chen
From 4067c58be8e719a55d89e68aaa9d3db19b88b32f Mon Sep 17 00:00:00 2001

From: Chen 

Date: Fri, 8 Nov 2024 22:21:19 -0800

Subject: [PATCH] Fix memory leak in the libx265




---

 libavcodec/libx265.c | 4 +++-

 1 file changed, 3 insertions(+), 1 deletion(-)




diff --git a/libavcodec/libx265.c b/libavcodec/libx265.c

index 63cc497..60e84d1 100644

--- a/libavcodec/libx265.c

+++ b/libavcodec/libx265.c

@@ -143,8 +143,10 @@ static av_cold int libx265_encode_close(AVCodecContext 
*avctx)

 rd_release(ctx, i);

 av_freep(&ctx->rd);

 

-if (ctx->encoder)

+if (ctx->encoder) {

+ctx->api->cleanup();

 ctx->api->encoder_close(ctx->encoder);

+}

 

 ff_dovi_ctx_unref(&ctx->dovi);

 

-- 

2.35.1.windows.2




0001-Fix-memory-leak-in-the-libx265.patch
Description: Binary data
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

2024-05-18 Thread Stone Chen
On Sat, May 18, 2024 at 9:04 AM Ronald S. Bultje  wrote:

> Hi,
>
> On Tue, May 14, 2024 at 4:40 PM Stone Chen 
> wrote:
>
>> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD
>> functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128.
>> To reduce complexity, SAD is only calculated on even rows. This is
>> calculated for all video bitdepths, but the values passed to the function
>> are always 16bit (even if the original video bitdepth is 8). The AVX2
>> implementation uses min/max/sub.
>>
>> Benchmarks ( AMD 7940HS )
>> Before:
>> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 80.7 |
>> Chimera_8bit_1080P_1000_frames.vvc | 158.0 |
>> NovosobornayaSquare_1920x1080.bin | 159.7 |
>> RitualDance_1920x1080_60_10_420_37_RA.266 | 146.3 |
>>
>> After:
>> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 82.7 |
>> Chimera_8bit_1080P_1000_frames.vvc | 167.0 |
>> NovosobornayaSquare_1920x1080.bin | 166.3 |
>> RitualDance_1920x1080_60_10_420_37_RA.266 | 154.0 |
>>
>
> I assume these are FPS benchmarks? Can you provide checkasm --bench output
> for these functions also?
>
> Ronald
>

Hi Ronald,

Correct, those are FPS benchmarks. There's a separate patch that has the
checkasm --bench output (Add check_vvc_sad to vvc_mc.c), in the commit message.
I've copied and pasted the benchmark snippet below:


> vvc_sad_8x8_c: 63.0
> vvc_sad_8x8_avx2: 3.0
> vvc_sad_16x16_c: 263.0
> vvc_sad_16x16_avx2: 23.0
> vvc_sad_32x32_c: 1003.0
> vvc_sad_32x32_avx2: 83.0
> vvc_sad_64x64_c: 3923.0
> vvc_sad_64x64_avx2: 373.0
> vvc_sad_128x128_c: 17533.0
> vvc_sad_128x128_avx2: 1683.0


Also your blogpost was very helpful for getting started with asm!

Cheers,
Stone
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

2024-05-19 Thread Stone Chen
On Sat, May 18, 2024 at 11:33 AM Ronald S. Bultje 
wrote:

> Hi,
>
> On Tue, May 14, 2024 at 4:40 PM Stone Chen 
> wrote:
>
>> +vvc_sad_8:
>> +.loop_height:
>> +movu  xm0, [src1q]
>> +movu  xm1, [src2q]
>> +MIN_MAX_SAD   xm2, xm0, xm1
>> +vpmovzxwd  m1, xm1
>> +vpaddd m3, m1
>>
> [..]
>
>> +vvc_sad_16_128:
>> +.loop_height:
>>
> [..]
>
>> +.loop_width:
>> +movu  xm0, [src1q]
>> +movu  xm1, [src2q]
>> +MIN_MAX_SAD   xm2, xm0, xm1
>> +vpmovzxwd  m1, xm1
>> +vpaddd m3, m1
>>
>
Hi Ronald,


> Wouldn't it be more efficient if the main loops did a full register worth
> at a time?
>
> vpbroadcastd m4, [pw_1]
> loop:
> movu m0, [src1q]
> movu m1, [src2q]
> MIN_MAX_SAD m2, m0, m1
> pmaddwd m1, m4
> paddd m3, m1
>
> (And then for w8, load 2 rows per iteration using movu xmN, [row0] and
> vinserti128 mN, [row1], 1.)
>
> Ronald
>

Thank you, I didn't know about the pmaddwd instruction, using it is
definitely more efficient!

Stone
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

2024-05-19 Thread Stone Chen
Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. 
DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce 
complexity, SAD is only calculated on even rows. This is calculated for all 
video bitdepths, but the values passed to the function are always 16bit (even 
if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.

Benchmarks ( AMD 7940HS )
Before:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
NovosobornayaSquare_1920x1080.bin | 197.3 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |

After:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
NovosobornayaSquare_1920x1080.bin | 204.0|
RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
---
 libavcodec/x86/vvc/Makefile  |   3 +-
 libavcodec/x86/vvc/vvc_sad.asm   | 138 +++
 libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/x86/vvc/vvc_sad.asm

diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index d6a66f860a..7b2438ce17 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -5,4 +5,5 @@ OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o 
\
   x86/h26x/h2656dsp.o
 X86ASM-OBJS-$(CONFIG_VVC_DECODER)  += x86/vvc/vvc_alf.o  \
   x86/vvc/vvc_mc.o   \
-  x86/h26x/h2656_inter.o
+  x86/vvc/vvc_sad.o  \
+  x86/h26x/h2656_inter.o 
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
new file mode 100644
index 00..58a24635d2
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -0,0 +1,138 @@
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+%define MAX_PB_SIZE 128
+%define ROWS 2
+
+SECTION_RODATA
+
+pw_1: dw 1
+
+; DMVR SAD is only calculated on even rows to reduce complexity
+SECTION .text
+
+%macro MIN_MAX_SAD 3 ; 
+pminuw   %3, %2, %1
+pmaxuw   %1, %2, %1
+psubusw  %1, %1, %3
+%endmacro
+
+%macro HORIZ_ADD 3  ; xm0, xm1, m1
+vextracti128  %1, %3, q0001  ;32  1  0
+paddd%1, %2 ; xm0 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
+pshufd   %2, %1, q0032  ; xm1-  - (7 + 3)   (6 + 2)
+paddd%1, %1, %2 ; xm0_  _ (5 1 7 3) (4 0 6 2)
+pshufd   %2, %1, q0001  ; xm1_  _ (5 1 7 3) (5 1 7 3)
+paddd%1, %1, %2 ;   (01234567)
+%endmacro
+
+%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2
+sub %3, 2
+sub %4, 2
+
+mov %5, 2
+mov %6, 2
+
+add %5, %4   
+sub %6, %4
+
+imul%5, 128
+imul%6, 128
+
+add %5, 2
+add %6, 2
+
+add %5, %3
+sub %6, %3
+
+lea %1, [%1 + %5 * 2]
+lea %2, [%2 + %6 * 2]
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+
+cglobal vvc_sad, 6, 11, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, 
row_idx, dx2, dy2
+movsxd   dx2q, dxd
+movsxd   dy2q, dyd
+INIT_OFFSET src1q, src2q, dx2q, dy2q, off1q, off2q
+pxor   m3, m3
+vpbroadcastw   m4, [pw_1]
+
+cmp  block_wd, 16
+jgevvc_sad_16_128
+
+vvc_sad_8:
+.loop_height:
+movu  xm0, [src1q]
+vinserti128m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
+movu  xm1, [src2q]
+vinserti128m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
+
+MIN_MAX_SADm1, m0, m2
+pmaddwdm1, m4
+paddd  m3, m1
+
+a

[FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c

2024-05-19 Thread Stone Chen
Adds checkasm for DMVR SAD AVX2 implementation.

Benchmarks ( AMD 7940HS )
vvc_sad_8x8_c: 70.0
vvc_sad_8x8_avx2: 10.0
vvc_sad_16x16_c: 280.0
vvc_sad_16x16_avx2: 20.0
vvc_sad_32x32_c: 1020.0
vvc_sad_32x32_avx2: 70.0
vvc_sad_64x64_c: 3560.0
vvc_sad_64x64_avx2: 270.0
vvc_sad_128x128_c: 13760.0
vvc_sad_128x128_avx2: 1070.0
---
 tests/checkasm/vvc_mc.c | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 97f57cb401..e251400bfc 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -322,8 +322,46 @@ static void check_avg(void)
 report("avg");
 }
 
+static void check_vvc_sad(void)
+{
+const int bit_depth = 10;
+VVCDSPContext c;
+LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+declare_func(int, const int16_t *src0, const int16_t *src1, int dx, int 
dy, int block_w, int block_h);
+
+ff_vvc_dsp_init(&c, bit_depth);
+memset(src0, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+memset(src1, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+
+randomize_pixels(src0, src1, MAX_CTU_SIZE * MAX_CTU_SIZE * 2);
+ for (int h = 8; h <= MAX_CTU_SIZE; h *= 2) {
+for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
+for(int offy = 0; offy <= 4; offy++) {
+for(int offx = 0; offx <= 4; offx++) {
+if(check_func(c.inter.sad, "vvc_sad_%dx%d", w, h)) {
+int result0;
+int result1;
+
+result0 =  call_ref(src0 + PIXEL_STRIDE * 2 + 2, src1 
+ PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+result1 =  call_new(src0 + PIXEL_STRIDE * 2 + 2, src1 
+ PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+
+if (result1 != result0)
+fail();
+if(w == h && offx == 0 && offy == 0)
+bench_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + 
PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+}
+}
+}
+}
+ }
+
+report("check_vvc_sad");
+}
+
 void checkasm_check_vvc_mc(void)
 {
+check_vvc_sad();
 check_put_vvc_luma();
 check_put_vvc_luma_uni();
 check_put_vvc_chroma();
-- 
2.45.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

2024-05-19 Thread Stone Chen
Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. 
DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce 
complexity, SAD is only calculated on even rows. This is calculated for all 
video bitdepths, but the values passed to the function are always 16bit (even 
if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.

Benchmarks ( AMD 7940HS )
Before:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
NovosobornayaSquare_1920x1080.bin | 197.3 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |

After:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
NovosobornayaSquare_1920x1080.bin | 204.0|
RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
---
 libavcodec/x86/vvc/Makefile  |   3 +-
 libavcodec/x86/vvc/vvc_sad.asm   | 138 +++
 libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/x86/vvc/vvc_sad.asm

diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index d6a66f860a..7b2438ce17 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -5,4 +5,5 @@ OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o 
\
   x86/h26x/h2656dsp.o
 X86ASM-OBJS-$(CONFIG_VVC_DECODER)  += x86/vvc/vvc_alf.o  \
   x86/vvc/vvc_mc.o   \
-  x86/h26x/h2656_inter.o
+  x86/vvc/vvc_sad.o  \
+  x86/h26x/h2656_inter.o 
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
new file mode 100644
index 00..58a24635d2
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -0,0 +1,138 @@
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+%define MAX_PB_SIZE 128
+%define ROWS 2
+
+SECTION_RODATA
+
+pw_1: dw 1
+
+; DMVR SAD is only calculated on even rows to reduce complexity
+SECTION .text
+
+%macro MIN_MAX_SAD 3 ; 
+pminuw   %3, %2, %1
+pmaxuw   %1, %2, %1
+psubusw  %1, %1, %3
+%endmacro
+
+%macro HORIZ_ADD 3  ; xm0, xm1, m1
+vextracti128  %1, %3, q0001  ;32  1  0
+paddd%1, %2 ; xm0 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
+pshufd   %2, %1, q0032  ; xm1-  - (7 + 3)   (6 + 2)
+paddd%1, %1, %2 ; xm0_  _ (5 1 7 3) (4 0 6 2)
+pshufd   %2, %1, q0001  ; xm1_  _ (5 1 7 3) (5 1 7 3)
+paddd%1, %1, %2 ;   (01234567)
+%endmacro
+
+%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2
+sub %3, 2
+sub %4, 2
+
+mov %5, 2
+mov %6, 2
+
+add %5, %4   
+sub %6, %4
+
+imul%5, 128
+imul%6, 128
+
+add %5, 2
+add %6, 2
+
+add %5, %3
+sub %6, %3
+
+lea %1, [%1 + %5 * 2]
+lea %2, [%2 + %6 * 2]
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+
+cglobal vvc_sad, 6, 11, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, 
row_idx, dx2, dy2
+movsxd   dx2q, dxd
+movsxd   dy2q, dyd
+INIT_OFFSET src1q, src2q, dx2q, dy2q, off1q, off2q
+pxor   m3, m3
+vpbroadcastw   m4, [pw_1]
+
+cmp  block_wd, 16
+jgevvc_sad_16_128
+
+vvc_sad_8:
+.loop_height:
+movu  xm0, [src1q]
+vinserti128m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
+movu  xm1, [src2q]
+vinserti128m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
+
+MIN_MAX_SADm1, m0, m2
+pmaddwdm1, m4
+paddd  m3, m1
+
+a

[FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c

2024-05-19 Thread Stone Chen
Adds checkasm for DMVR SAD AVX2 implementation.

Benchmarks ( AMD 7940HS )
vvc_sad_8x8_c: 70.0
vvc_sad_8x8_avx2: 10.0
vvc_sad_16x16_c: 280.0
vvc_sad_16x16_avx2: 20.0
vvc_sad_32x32_c: 1020.0
vvc_sad_32x32_avx2: 70.0
vvc_sad_64x64_c: 3560.0
vvc_sad_64x64_avx2: 270.0
vvc_sad_128x128_c: 13760.0
vvc_sad_128x128_avx2: 1070.0
---
 tests/checkasm/vvc_mc.c | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 97f57cb401..e251400bfc 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -322,8 +322,46 @@ static void check_avg(void)
 report("avg");
 }
 
+static void check_vvc_sad(void)
+{
+const int bit_depth = 10;
+VVCDSPContext c;
+LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+declare_func(int, const int16_t *src0, const int16_t *src1, int dx, int 
dy, int block_w, int block_h);
+
+ff_vvc_dsp_init(&c, bit_depth);
+memset(src0, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+memset(src1, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+
+randomize_pixels(src0, src1, MAX_CTU_SIZE * MAX_CTU_SIZE * 2);
+ for (int h = 8; h <= MAX_CTU_SIZE; h *= 2) {
+for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
+for(int offy = 0; offy <= 4; offy++) {
+for(int offx = 0; offx <= 4; offx++) {
+if(check_func(c.inter.sad, "vvc_sad_%dx%d", w, h)) {
+int result0;
+int result1;
+
+result0 =  call_ref(src0 + PIXEL_STRIDE * 2 + 2, src1 
+ PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+result1 =  call_new(src0 + PIXEL_STRIDE * 2 + 2, src1 
+ PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+
+if (result1 != result0)
+fail();
+if(w == h && offx == 0 && offy == 0)
+bench_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + 
PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+}
+}
+}
+}
+ }
+
+report("check_vvc_sad");
+}
+
 void checkasm_check_vvc_mc(void)
 {
+check_vvc_sad();
 check_put_vvc_luma();
 check_put_vvc_luma_uni();
 check_put_vvc_chroma();
-- 
2.45.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v5 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

2024-05-21 Thread Stone Chen
Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. 
DMVR SAD is only calculated if w >= 8, h >= 8, and w * h >= 128. To reduce 
complexity, SAD is only calculated on even rows. This is calculated for all 
video bitdepths, but the values passed to the function are always 16bit (even 
if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.

Additionally this changes parameters dx and dy from int to intptr_t. This 
allows dx & dy to be used as pointer offsets without needing to use movsxd.

Benchmarks ( AMD 7940HS )
Before:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
NovosobornayaSquare_1920x1080.bin | 197.3 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |

After:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
NovosobornayaSquare_1920x1080.bin | 204.0|
RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
---
 libavcodec/vvc/dsp.c |   2 +-
 libavcodec/vvc/dsp.h |   2 +-
 libavcodec/x86/vvc/Makefile  |   3 +-
 libavcodec/x86/vvc/vvc_sad.asm   | 130 +++
 libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
 5 files changed, 140 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/vvc/vvc_sad.asm

diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..aded1a2f9f 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -46,7 +46,7 @@ static void av_always_inline pad_int16(int16_t *_dst, const 
ptrdiff_t dst_stride
 memcpy(_dst, _dst - dst_stride, padded_width * sizeof(int16_t));
 }
 
-static int vvc_sad(const int16_t *src0, const int16_t *src1, int dx, int dy,
+static int vvc_sad(const int16_t *src0, const int16_t *src1, intptr_t dx, 
intptr_t dy,
 const int block_w, const int block_h)
 {
 int sad = 0;
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 9810ac314c..213337358b 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -86,7 +86,7 @@ typedef struct VVCInterDSPContext {
 
 void (*apply_bdof)(uint8_t *dst, ptrdiff_t dst_stride, int16_t *src0, 
int16_t *src1, int block_w, int block_h);
 
-int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int 
block_w, int block_h);
+int (*sad)(const int16_t *src0, const int16_t *src1, intptr_t dx, intptr_t 
dy, int block_w, int block_h);
 void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, 
int height,
 intptr_t mx, intptr_t my, int width);
 } VVCInterDSPContext;
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index d6a66f860a..7b2438ce17 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -5,4 +5,5 @@ OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o 
\
   x86/h26x/h2656dsp.o
 X86ASM-OBJS-$(CONFIG_VVC_DECODER)  += x86/vvc/vvc_alf.o  \
   x86/vvc/vvc_mc.o   \
-  x86/h26x/h2656_inter.o
+  x86/vvc/vvc_sad.o  \
+  x86/h26x/h2656_inter.o 
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
new file mode 100644
index 00..9766446b11
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -0,0 +1,130 @@
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+%define MAX_PB_SIZE 128
+%define ROWS 2
+
+SECTION_RODATA
+
+pw_1: times 2 dw 1
+
+; DMVR SAD is only calculated on even rows to reduce complexity
+SECTION .text
+
+%macro MIN_MAX_SAD 3 ; 
+pminuw   %3, %2, %1
+pmaxuw   %1, %2, %1
+psubusw  %1, %1, %3
+%endmacro
+
+%macro HORIZ_ADD 3  ; xm0, xm1, m1
+vextracti128 %1, %3, q0001  ;32  1  0
+paddd%1, %2 ; xm0 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
+pshufd   %2, %1, q0032  ; xm1-  - (7 + 3)   (6 + 2)
+paddd%1, %1, %2 ; xm0

[FFmpeg-devel] [PATCH v5 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c

2024-05-21 Thread Stone Chen
Adds checkasm for DMVR SAD AVX2 implementation.

Benchmarks ( AMD 7940HS )
vvc_sad_8x8_c: 50.3
vvc_sad_8x8_avx2: 0.3
vvc_sad_16x16_c: 250.3
vvc_sad_16x16_avx2: 10.3
vvc_sad_32x32_c: 1020.3
vvc_sad_32x32_avx2: 60.3
vvc_sad_64x64_c: 3850.3
vvc_sad_64x64_avx2: 220.3
vvc_sad_128x128_c: 14100.3
vvc_sad_128x128_avx2: 840.3
---
 tests/checkasm/vvc_mc.c | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 97f57cb401..f2d7a6d561 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -322,8 +322,46 @@ static void check_avg(void)
 report("avg");
 }
 
+static void check_vvc_sad(void)
+{
+const int bit_depth = 10;
+VVCDSPContext c;
+LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+declare_func(int, const int16_t *src0, const int16_t *src1, intptr_t dx, 
intptr_t dy, int block_w, int block_h);
+
+ff_vvc_dsp_init(&c, bit_depth);
+memset(src0, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+memset(src1, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+
+randomize_pixels(src0, src1, MAX_CTU_SIZE * MAX_CTU_SIZE * 2);
+ for (int h = 8; h <= MAX_CTU_SIZE; h *= 2) {
+for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
+for(int offy = 0; offy <= 4; offy++) {
+for(int offx = 0; offx <= 4; offx++) {
+if(check_func(c.inter.sad, "vvc_sad_%dx%d", w, h)) {
+int result0;
+int result1;
+
+result0 =  call_ref(src0 + PIXEL_STRIDE * 2 + 2, src1 
+ PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+result1 =  call_new(src0 + PIXEL_STRIDE * 2 + 2, src1 
+ PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+
+if (result1 != result0)
+fail();
+if(w == h && offx == 0 && offy == 0)
+bench_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + 
PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+}
+}
+}
+}
+ }
+
+report("check_vvc_sad");
+}
+
 void checkasm_check_vvc_mc(void)
 {
+check_vvc_sad();
 check_put_vvc_luma();
 check_put_vvc_luma_uni();
 check_put_vvc_chroma();
-- 
2.45.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

2024-05-21 Thread Stone Chen
On Mon, May 20, 2024 at 7:23 AM Ronald S. Bultje  wrote:

> Hi,
>
> This is mostly good, the following is tiny nitpicks.
>
> On Sun, May 19, 2024 at 8:46 PM Stone Chen 
> wrote:
>
>> +%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2
>>
>
> The macro is only used once, so you could inline it in the calling
> function.
>
>>
>> +imul%5, 128
>> +imul%6, 128
>>
>
> I believe shl is typically preferred over imul for powers of two.
>
>
>> +add %5, 2
>> +add %6, 2
>>
>
> And these can be integrated as a constant offset in the lea below (lea %1,
> [%1 + %5 * 2 + 2 * 2], same for %2).
>
>
>> +add %5, %3
>> +sub %6, %3
>> +
>> +lea %1, [%1 + %5 * 2]
>> +lea %2, [%2 + %6 * 2]
>
> [..]
>
>> +cglobal vvc_sad, 6, 11, 5, src1, src2, dx, dy, block_w, block_h, off1,
>> off2, row_idx, dx2, dy2
>> +movsxd   dx2q, dxd
>> +movsxd   dy2q, dyd
>>
>
> If you change the argument type from int to intptr_t, this is not
> necessary anymore.
>
>
>> +vvc_sad_16_128:
>> +.loop_height:
>> +mov off1q, src1q
>> +mov off2q, src2q
>> +mov  row_idxd, block_wd
>> +sar  row_idxd, 4
>>
>
> You could right-shift block_wd by 4 outside the loop (before .loop_height).
>
> Ronald
>

On Mon, May 20, 2024 at 11:53 AM Ronald S. Bultje 
wrote:

> Hi,
>
> one more, I forgot.
>
> On Sun, May 19, 2024 at 8:46 PM Stone Chen 
> wrote:
>
>> +pw_1: dw 1
>>
> [..]
>
>> +vpbroadcastw   m4, [pw_1]
>>
>
> We typically suggest to use vpbroadcastd, not w (and then pw_1: times 2 dw
> 1). agner shows that on e.g. Haswell, the former (d) is 1 uops with 5
> cycles latency, whereas the latter (w) is 3 uops with 7 cycles latency, or
> more generally d is faster than w.
>
> Ronald
>

Hi Ronald,

I've sent a v5 incorporating all the above, thank you for the feedback!

-Stone
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v5 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

2024-05-23 Thread Stone Chen
On Thu, May 23, 2024 at 9:18 AM Nuo Mi  wrote:

> On Thu, May 23, 2024 at 7:38 AM James Almer  wrote:
>
> > On 5/21/2024 10:01 PM, Ronald S. Bultje wrote:
> > > Hi,
> > >
> > > On Tue, May 21, 2024 at 8:01 PM Stone Chen 
> > wrote:
> > >
> > >> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD
> > >> functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h >
> > 128.
> > >> To reduce complexity, SAD is only calculated on even rows. This is
> > >> calculated for all video bitdepths, but the values passed to the
> > function
> > >> are always 16bit (even if the original video bitdepth is 8). The AVX2
> > >> implementation uses min/max/sub.
> > >>
> > >> Additionally this changes parameters dx and dy from int to intptr_t.
> > This
> > >> allows dx & dy to be used as pointer offsets without needing to use
> > movsxd.
> > >>
> > >> Benchmarks ( AMD 7940HS )
> > >> Before:
> > >> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
> > >> Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
> > >> NovosobornayaSquare_1920x1080.bin | 197.3 |
> > >> RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |
> > >>
> > >> After:
> > >> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
> > >> Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
> > >> NovosobornayaSquare_1920x1080.bin | 204.0|
> > >> RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
> > >> ---
> > >>   libavcodec/vvc/dsp.c |   2 +-
> > >>   libavcodec/vvc/dsp.h |   2 +-
> > >>   libavcodec/x86/vvc/Makefile  |   3 +-
> > >>   libavcodec/x86/vvc/vvc_sad.asm   | 130
> +++
> > >>   libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
> > >>   5 files changed, 140 insertions(+), 3 deletions(-)
> > >>   create mode 100644 libavcodec/x86/vvc/vvc_sad.asm
> > >>
> > >
> > > LGTM.
> > >
> > > Ronald
> >
> > Implemented my changes and applied.
> >
>

Hi all,


> Thank you, Ronald, Andreas, and James.
>
> Hi Stone,
> Congratulations on surviving your first crossfire!
>


Yes thank you Ronald, Andreas and James for the feedback! Also Nuo Mi and
Jian Hua as well.

-Stone


>
> ___
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> >
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v1 1/2][GSoC 2024] libavcodec/x86/vvc: change label to vvc_sad_16 to reflect block sizes

2024-05-28 Thread Stone Chen
According to the VVC specification (section 8.5.1), the maximum width/height of 
a subblock passed for DMVR SAD is 16. This, along with the previous constraint 
requiring width * height >= 128, means that 8x16, 16x8, and 16x16 are the only 
allowed sizes. This re-labels vvc_sad_16_128 to vvc_sad_16 to reflect this and 
adds a comment about the block size constraints. There is no functional 
change.
---
 libavcodec/x86/vvc/vvc_sad.asm | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index b468d89ac2..982951a370 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -29,6 +29,7 @@ SECTION_RODATA
 pw_1: times 2 dw 1
 
 ; DMVR SAD is only calculated on even rows to reduce complexity
+; Additionally the only valid sizes are 8x16, 16x8, and 16x16
 SECTION .text
 
 %macro MIN_MAX_SAD 3
@@ -77,7 +78,7 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, 
block_h, off1, off2, row_
 vpbroadcastd   m4, [pw_1]
 
 cmp  block_wd, 16
-jgevvc_sad_16_128
+je vvc_sad_16
 
 vvc_sad_8:
 .loop_height:
@@ -100,7 +101,7 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, 
block_h, off1, off2, row_
 movd  eax, xm0
 RET
 
-vvc_sad_16_128:
+vvc_sad_16:
 sar  block_wd, 4
 .loop_height:
 mov off1q, src1q
-- 
2.45.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v1 2/2][GSoC 2024] tests/checkasm/vvc_mc: for SAD, only test valid subblock sizes

2024-05-28 Thread Stone Chen
According to the VVC specification (section 8.5.1), the maximum width/height of 
a subblock passed for DMVR SAD is 16. This along with previous constraint 
requiring width * height >= 128 means that  8x16, 16x8, and 16x16 are the only 
allowed sizes.

This changes check_vvc_sad() to only test and benchmark those sizes.
---
 tests/checkasm/vvc_mc.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 1e889e2cff..09cac82edb 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -337,11 +337,12 @@ static void check_vvc_sad(void)
 memset(src1, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4 * sizeof(uint16_t));
 
 randomize_pixels(src0, src1, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
- for (int h = 8; h <= MAX_CTU_SIZE; h *= 2) {
-for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
+for (int h = 8; h <= 16; h *= 2) {
+for (int w = 8; w <= 16; w *= 2) {
 for(int offy = 0; offy <= 4; offy++) {
 for(int offx = 0; offx <= 4; offx++) {
-if(check_func(c.inter.sad, "sad_%dx%d", w, h)) {
+if(w * h >= 128) {
+if(check_func(c.inter.sad, "sad_%dx%d", w, h)) {
 int result0;
 int result1;
 
@@ -350,13 +351,14 @@ static void check_vvc_sad(void)
 
 if (result1 != result0)
 fail();
-if(w == h && offx == 0 && offy == 0)
+if(offx == 0 && offy == 0)
 bench_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + 
PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+}
 }
 }
 }
 }
- }
+}
 
 report("sad");
 }
-- 
2.45.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/3] libavcodec/vaapi_encode: Change the way to call async to increase performance

2021-10-27 Thread Wenbin Chen
Fix: #7706. After commit 5fdcf85bbffe7451c2, the vaapi encoder's performance
decreased. The reason is that vaRenderPicture() and vaSyncSurface() are
called at the same time (vaRenderPicture() is always followed by a
vaSyncSurface()). When we encode a stream with B frames, we need a buffer to
reorder frames, so we can send several frames to the HW at once to increase
performance. Now I have changed them to be called in an
asynchronous way, which makes better use of the hardware.
1080p transcoding gains about 17% fps in my environment.

Signed-off-by: Wenbin Chen 
---
 libavcodec/vaapi_encode.c | 41 ---
 libavcodec/vaapi_encode.h |  3 +++
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index ec054ae701..5927849233 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -951,8 +951,10 @@ static int vaapi_encode_pick_next(AVCodecContext *avctx,
 if (!pic && ctx->end_of_stream) {
 --b_counter;
 pic = ctx->pic_end;
-if (pic->encode_issued)
+if (pic->encode_complete)
 return AVERROR_EOF;
+else if (pic->encode_issued)
+return AVERROR(EAGAIN);
 }
 
 if (!pic) {
@@ -1177,20 +1179,31 @@ int ff_vaapi_encode_receive_packet(AVCodecContext 
*avctx, AVPacket *pkt)
 return AVERROR(EAGAIN);
 }
 
-pic = NULL;
-err = vaapi_encode_pick_next(avctx, &pic);
-if (err < 0)
-return err;
-av_assert0(pic);
+while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * 
sizeof(VAAPIEncodePicture *)) {
+pic = NULL;
+err = vaapi_encode_pick_next(avctx, &pic);
+if (err < 0)
+break;
+av_assert0(pic);
 
-pic->encode_order = ctx->encode_order++;
+pic->encode_order = ctx->encode_order +
+(av_fifo_size(ctx->encode_fifo) / 
sizeof(VAAPIEncodePicture *));
 
-err = vaapi_encode_issue(avctx, pic);
-if (err < 0) {
-av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
-return err;
+err = vaapi_encode_issue(avctx, pic);
+if (err < 0) {
+av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
+return err;
+}
+
+av_fifo_generic_write(ctx->encode_fifo, &pic, sizeof(pic), NULL);
 }
 
+if (!av_fifo_size(ctx->encode_fifo))
+return err;
+
+av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
+ctx->encode_order = pic->encode_order + 1;
+
 err = vaapi_encode_output(avctx, pic, pkt);
 if (err < 0) {
 av_log(avctx, AV_LOG_ERROR, "Output failed: %d.\n", err);
@@ -2520,6 +2533,11 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
 }
 }
 
+ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
+  sizeof(VAAPIEncodePicture *));
+if (!ctx->encode_fifo)
+return AVERROR(ENOMEM);
+
 return 0;
 
 fail:
@@ -2552,6 +2570,7 @@ av_cold int ff_vaapi_encode_close(AVCodecContext *avctx)
 
 av_freep(&ctx->codec_sequence_params);
 av_freep(&ctx->codec_picture_params);
+av_fifo_freep(&ctx->encode_fifo);
 
 av_buffer_unref(&ctx->recon_frames_ref);
 av_buffer_unref(&ctx->input_frames_ref);
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index b41604a883..89fe8de466 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -29,6 +29,7 @@
 
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_vaapi.h"
+#include "libavutil/fifo.h"
 
 #include "avcodec.h"
 #include "hwconfig.h"
@@ -345,6 +346,8 @@ typedef struct VAAPIEncodeContext {
 int roi_warned;
 
 AVFrame *frame;
+
+AVFifoBuffer *encode_fifo;
 } VAAPIEncodeContext;
 
 enum {
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode

2021-10-27 Thread Wenbin Chen
Add vaSyncBuffer to the VAAPI encoder. The old API, vaSyncSurface, waits for
the surface to complete. When a surface is used for multiple operations, it
waits for all of the operations to finish. vaSyncBuffer only waits for one
channel to finish.

Add a wait param to vaapi_encode_wait() to prepare for the async_depth
option. "wait=1" means wait until the operation is ready. "wait=0" means
query the operation's status: if it is ready, return 0; if it is still in
progress, return EAGAIN.

Signed-off-by: Wenbin Chen 
---
 libavcodec/vaapi_encode.c | 47 +--
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index 5927849233..db0ae136a1 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -134,7 +134,8 @@ static int 
vaapi_encode_make_misc_param_buffer(AVCodecContext *avctx,
 }
 
 static int vaapi_encode_wait(AVCodecContext *avctx,
- VAAPIEncodePicture *pic)
+ VAAPIEncodePicture *pic,
+ uint8_t wait)
 {
 VAAPIEncodeContext *ctx = avctx->priv_data;
 VAStatus vas;
@@ -150,11 +151,43 @@ static int vaapi_encode_wait(AVCodecContext *avctx,
"(input surface %#x).\n", pic->display_order,
pic->encode_order, pic->input_surface);
 
-vas = vaSyncSurface(ctx->hwctx->display, pic->input_surface);
-if (vas != VA_STATUS_SUCCESS) {
-av_log(avctx, AV_LOG_ERROR, "Failed to sync to picture completion: "
-   "%d (%s).\n", vas, vaErrorStr(vas));
+#if VA_CHECK_VERSION(1, 9, 0)
+// Try vaSyncBuffer.
+vas = vaSyncBuffer(ctx->hwctx->display,
+   pic->output_buffer,
+   wait ? VA_TIMEOUT_INFINITE : 0);
+if (vas == VA_STATUS_ERROR_TIMEDOUT) {
+return AVERROR(EAGAIN);
+} else if (vas != VA_STATUS_SUCCESS && vas != 
VA_STATUS_ERROR_UNIMPLEMENTED) {
+av_log(avctx, AV_LOG_ERROR, "Failed to sync to output buffer 
completion: "
+"%d (%s).\n", vas, vaErrorStr(vas));
 return AVERROR(EIO);
+} else if (vas == VA_STATUS_ERROR_UNIMPLEMENTED)
+// If vaSyncBuffer is not implemented, try old version API.
+#endif
+{
+if (!wait) {
+VASurfaceStatus surface_status;
+vas = vaQuerySurfaceStatus(ctx->hwctx->display,
+pic->input_surface,
+&surface_status);
+if (vas == VA_STATUS_SUCCESS &&
+surface_status != VASurfaceReady &&
+surface_status != VASurfaceSkipped) {
+return AVERROR(EAGAIN);
+} else if (vas != VA_STATUS_SUCCESS) {
+av_log(avctx, AV_LOG_ERROR, "Failed to query surface status: "
+"%d (%s).\n", vas, vaErrorStr(vas));
+return AVERROR(EIO);
+}
+} else {
+vas = vaSyncSurface(ctx->hwctx->display, pic->input_surface);
+if (vas != VA_STATUS_SUCCESS) {
+av_log(avctx, AV_LOG_ERROR, "Failed to sync to picture 
completion: "
+"%d (%s).\n", vas, vaErrorStr(vas));
+return AVERROR(EIO);
+}
+}
 }
 
 // Input is definitely finished with now.
@@ -633,7 +666,7 @@ static int vaapi_encode_output(AVCodecContext *avctx,
 uint8_t *ptr;
 int err;
 
-err = vaapi_encode_wait(avctx, pic);
+err = vaapi_encode_wait(avctx, pic, 1);
 if (err < 0)
 return err;
 
@@ -695,7 +728,7 @@ fail:
 static int vaapi_encode_discard(AVCodecContext *avctx,
 VAAPIEncodePicture *pic)
 {
-vaapi_encode_wait(avctx, pic);
+vaapi_encode_wait(avctx, pic, 1);
 
 if (pic->output_buffer_ref) {
 av_log(avctx, AV_LOG_DEBUG, "Discard output for pic "
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance

2021-10-27 Thread Wenbin Chen
Add async_depth to increase the encoder's performance. Reuse encode_fifo as
the async buffer. The encoder sends all reordered frames to the HW and then
checks the fifo size. If fifo < async_depth and the top frame is not ready, it
will return AVERROR(EAGAIN) to request more frames.

1080p transcoding (no B frames) with -async_depth=4 can increase performance
by 20% in my environment.
Async operation increases performance but also introduces frame delay.

Signed-off-by: Wenbin Chen 
---
 libavcodec/vaapi_encode.c | 20 +++-
 libavcodec/vaapi_encode.h | 12 ++--
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index db0ae136a1..616fb7c089 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -1158,7 +1158,8 @@ static int vaapi_encode_send_frame(AVCodecContext *avctx, 
AVFrame *frame)
 if (ctx->input_order == ctx->decode_delay)
 ctx->dts_pts_diff = pic->pts - ctx->first_pts;
 if (ctx->output_delay > 0)
-ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = 
pic->pts;
+ctx->ts_ring[ctx->input_order %
+(3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
 
 pic->display_order = ctx->input_order;
 ++ctx->input_order;
@@ -1212,7 +1213,8 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, 
AVPacket *pkt)
 return AVERROR(EAGAIN);
 }
 
-while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * 
sizeof(VAAPIEncodePicture *)) {
+while (av_fifo_size(ctx->encode_fifo) <
+MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) {
 pic = NULL;
 err = vaapi_encode_pick_next(avctx, &pic);
 if (err < 0)
@@ -1234,6 +1236,14 @@ int ff_vaapi_encode_receive_packet(AVCodecContext 
*avctx, AVPacket *pkt)
 if (!av_fifo_size(ctx->encode_fifo))
 return err;
 
+if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth * 
sizeof(VAAPIEncodePicture *) &&
+!ctx->end_of_stream) {
+av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic), NULL);
+err = vaapi_encode_wait(avctx, pic, 0);
+if (err < 0)
+return err;
+}
+
 av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
 ctx->encode_order = pic->encode_order + 1;
 
@@ -1252,7 +1262,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, 
AVPacket *pkt)
 pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
 } else {
 pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
-(3 * ctx->output_delay)];
+(3 * ctx->output_delay + ctx->async_depth)];
 }
 av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts 
%"PRId64".\n",
pkt->pts, pkt->dts);
@@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
 }
 }
 
-ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
-  sizeof(VAAPIEncodePicture *));
+ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH *
+ sizeof(VAAPIEncodePicture *));
 if (!ctx->encode_fifo)
 return AVERROR(ENOMEM);
 
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index 89fe8de466..1bf5d7c337 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -48,6 +48,7 @@ enum {
 MAX_TILE_ROWS  = 22,
 // A.4.1: table A.6 allows at most 20 tile columns for any level.
 MAX_TILE_COLS  = 20,
+MAX_ASYNC_DEPTH= 64,
 };
 
 extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[];
@@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
 // Timestamp handling.
 int64_t first_pts;
 int64_t dts_pts_diff;
-int64_t ts_ring[MAX_REORDER_DELAY * 3];
+int64_t ts_ring[MAX_REORDER_DELAY * 3 +
+MAX_ASYNC_DEPTH];
 
 // Slice structure.
 int slice_block_rows;
@@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext {
 AVFrame *frame;
 
 AVFifoBuffer *encode_fifo;
+
+int async_depth;
 } VAAPIEncodeContext;
 
 enum {
@@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
 { "b_depth", \
   "Maximum B-frame reference depth", \
   OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
-  { .i64 = 1 }, 1, INT_MAX, FLAGS }
+  { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
+{ "async_depth", "Maximum processing parallelism. " \
+  "Increase this to improve single channel performance", \
+  OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
+  {

Re: [FFmpeg-devel] [PATCH 1/3] libavcodec/vaapi_encode: Change the way to call async to increase performance

2021-10-31 Thread Chen, Wenbin
> Fix: #7706. After commit 5fdcf85bbffe7451c2, the vaapi encoder's performance
> decreased. The reason is that vaRenderPicture() and vaSyncSurface() are
> called at the same time (vaRenderPicture() is always followed by a
> vaSyncSurface()). When we encode a stream with B frames, we need a buffer to
> reorder frames, so we can send several frames to HW at once to increase
> performance. Now I have changed them to be called in an
> asynchronous way, which makes better use of the hardware.
> 1080p transcoding gains about 17% fps in my environment.
> 
> Signed-off-by: Wenbin Chen 
> ---
>  libavcodec/vaapi_encode.c | 41 ---
>  libavcodec/vaapi_encode.h |  3 +++
>  2 files changed, 33 insertions(+), 11 deletions(-)
> 
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index ec054ae701..5927849233 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -951,8 +951,10 @@ static int vaapi_encode_pick_next(AVCodecContext
> *avctx,
>  if (!pic && ctx->end_of_stream) {
>  --b_counter;
>  pic = ctx->pic_end;
> -if (pic->encode_issued)
> +if (pic->encode_complete)
>  return AVERROR_EOF;
> +else if (pic->encode_issued)
> +return AVERROR(EAGAIN);
>  }
> 
>  if (!pic) {
> @@ -1177,20 +1179,31 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>  return AVERROR(EAGAIN);
>  }
> 
> -pic = NULL;
> -err = vaapi_encode_pick_next(avctx, &pic);
> -if (err < 0)
> -return err;
> -av_assert0(pic);
> +while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES *
> sizeof(VAAPIEncodePicture *)) {
> +pic = NULL;
> +err = vaapi_encode_pick_next(avctx, &pic);
> +if (err < 0)
> +break;
> +av_assert0(pic);
> 
> -pic->encode_order = ctx->encode_order++;
> +pic->encode_order = ctx->encode_order +
> +(av_fifo_size(ctx->encode_fifo) / 
> sizeof(VAAPIEncodePicture
> *));
> 
> -err = vaapi_encode_issue(avctx, pic);
> -if (err < 0) {
> -av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
> -return err;
> +err = vaapi_encode_issue(avctx, pic);
> +if (err < 0) {
> +av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
> +return err;
> +}
> +
> +av_fifo_generic_write(ctx->encode_fifo, &pic, sizeof(pic), NULL);
>  }
> 
> +if (!av_fifo_size(ctx->encode_fifo))
> +return err;
> +
> +av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
> +ctx->encode_order = pic->encode_order + 1;
> +
>  err = vaapi_encode_output(avctx, pic, pkt);
>  if (err < 0) {
>  av_log(avctx, AV_LOG_ERROR, "Output failed: %d.\n", err);
> @@ -2520,6 +2533,11 @@ av_cold int
> ff_vaapi_encode_init(AVCodecContext *avctx)
>  }
>  }
> 
> +ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
> +  sizeof(VAAPIEncodePicture *));
> +if (!ctx->encode_fifo)
> +return AVERROR(ENOMEM);
> +
>  return 0;
> 
>  fail:
> @@ -2552,6 +2570,7 @@ av_cold int
> ff_vaapi_encode_close(AVCodecContext *avctx)
> 
>  av_freep(&ctx->codec_sequence_params);
>  av_freep(&ctx->codec_picture_params);
> +av_fifo_freep(&ctx->encode_fifo);
> 
>  av_buffer_unref(&ctx->recon_frames_ref);
>  av_buffer_unref(&ctx->input_frames_ref);
> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> index b41604a883..89fe8de466 100644
> --- a/libavcodec/vaapi_encode.h
> +++ b/libavcodec/vaapi_encode.h
> @@ -29,6 +29,7 @@
> 
>  #include "libavutil/hwcontext.h"
>  #include "libavutil/hwcontext_vaapi.h"
> +#include "libavutil/fifo.h"
> 
>  #include "avcodec.h"
>  #include "hwconfig.h"
> @@ -345,6 +346,8 @@ typedef struct VAAPIEncodeContext {
>  int roi_warned;
> 
>  AVFrame *frame;
> +
> +AVFifoBuffer *encode_fifo;
>  } VAAPIEncodeContext;
> 
>  enum {
> --
> 2.25.1

ping

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode

2021-10-31 Thread Chen, Wenbin
> Add vaSyncBuffer to the VAAPI encoder. The old API, vaSyncSurface, waits
> for the surface to complete. When a surface is used for multiple operations,
> it waits for all of them to finish. vaSyncBuffer only waits for one channel
> to finish.
> 
> Add a wait param to vaapi_encode_wait() to prepare for the async_depth
> option. "wait=1" means wait until the operation is ready. "wait=0" means
> query the operation's status: return 0 if it is ready, or EAGAIN if it is
> still in progress.
> 
> Signed-off-by: Wenbin Chen 
> ---
>  libavcodec/vaapi_encode.c | 47 +--
>  1 file changed, 40 insertions(+), 7 deletions(-)
> 
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index 5927849233..db0ae136a1 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -134,7 +134,8 @@ static int
> vaapi_encode_make_misc_param_buffer(AVCodecContext *avctx,
>  }
> 
>  static int vaapi_encode_wait(AVCodecContext *avctx,
> - VAAPIEncodePicture *pic)
> + VAAPIEncodePicture *pic,
> + uint8_t wait)
>  {
>  VAAPIEncodeContext *ctx = avctx->priv_data;
>  VAStatus vas;
> @@ -150,11 +151,43 @@ static int vaapi_encode_wait(AVCodecContext
> *avctx,
> "(input surface %#x).\n", pic->display_order,
> pic->encode_order, pic->input_surface);
> 
> -vas = vaSyncSurface(ctx->hwctx->display, pic->input_surface);
> -if (vas != VA_STATUS_SUCCESS) {
> -av_log(avctx, AV_LOG_ERROR, "Failed to sync to picture completion: "
> -   "%d (%s).\n", vas, vaErrorStr(vas));
> +#if VA_CHECK_VERSION(1, 9, 0)
> +// Try vaSyncBuffer.
> +vas = vaSyncBuffer(ctx->hwctx->display,
> +   pic->output_buffer,
> +   wait ? VA_TIMEOUT_INFINITE : 0);
> +if (vas == VA_STATUS_ERROR_TIMEDOUT) {
> +return AVERROR(EAGAIN);
> +} else if (vas != VA_STATUS_SUCCESS && vas !=
> VA_STATUS_ERROR_UNIMPLEMENTED) {
> +av_log(avctx, AV_LOG_ERROR, "Failed to sync to output buffer
> completion: "
> +"%d (%s).\n", vas, vaErrorStr(vas));
>  return AVERROR(EIO);
> +} else if (vas == VA_STATUS_ERROR_UNIMPLEMENTED)
> +// If vaSyncBuffer is not implemented, try old version API.
> +#endif
> +{
> +if (!wait) {
> +VASurfaceStatus surface_status;
> +vas = vaQuerySurfaceStatus(ctx->hwctx->display,
> +pic->input_surface,
> +&surface_status);
> +if (vas == VA_STATUS_SUCCESS &&
> +surface_status != VASurfaceReady &&
> +surface_status != VASurfaceSkipped) {
> +return AVERROR(EAGAIN);
> +} else if (vas != VA_STATUS_SUCCESS) {
> +av_log(avctx, AV_LOG_ERROR, "Failed to query surface status: 
> "
> +"%d (%s).\n", vas, vaErrorStr(vas));
> +return AVERROR(EIO);
> +}
> +} else {
> +vas = vaSyncSurface(ctx->hwctx->display, pic->input_surface);
> +if (vas != VA_STATUS_SUCCESS) {
> +av_log(avctx, AV_LOG_ERROR, "Failed to sync to picture
> completion: "
> +"%d (%s).\n", vas, vaErrorStr(vas));
> +return AVERROR(EIO);
> +}
> +}
>  }
> 
>  // Input is definitely finished with now.
> @@ -633,7 +666,7 @@ static int vaapi_encode_output(AVCodecContext
> *avctx,
>  uint8_t *ptr;
>  int err;
> 
> -err = vaapi_encode_wait(avctx, pic);
> +err = vaapi_encode_wait(avctx, pic, 1);
>  if (err < 0)
>  return err;
> 
> @@ -695,7 +728,7 @@ fail:
>  static int vaapi_encode_discard(AVCodecContext *avctx,
>  VAAPIEncodePicture *pic)
>  {
> -vaapi_encode_wait(avctx, pic);
> +vaapi_encode_wait(avctx, pic, 1);
> 
>  if (pic->output_buffer_ref) {
>  av_log(avctx, AV_LOG_DEBUG, "Discard output for pic "
> --
> 2.25.1

ping
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance

2021-10-31 Thread Chen, Wenbin
> Add async_depth to increase the encoder's performance. Reuse encode_fifo as
> the async buffer. The encoder sends all reordered frames to HW and then checks
> the fifo size. If fifo < async_depth and the top frame is not ready, it will
> return AVERROR(EAGAIN) to request more frames.
> 
> 1080p transcoding (no B frames) with -async_depth=4 can increase performance
> by 20% in my environment.
> Async operation increases performance but also introduces frame delay.
> 
> Signed-off-by: Wenbin Chen 
> ---
>  libavcodec/vaapi_encode.c | 20 +++-
>  libavcodec/vaapi_encode.h | 12 ++--
>  2 files changed, 25 insertions(+), 7 deletions(-)
> 
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index db0ae136a1..616fb7c089 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -1158,7 +1158,8 @@ static int
> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
>  if (ctx->input_order == ctx->decode_delay)
>  ctx->dts_pts_diff = pic->pts - ctx->first_pts;
>  if (ctx->output_delay > 0)
> -ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = 
> pic->pts;
> +ctx->ts_ring[ctx->input_order %
> +(3 * ctx->output_delay + ctx->async_depth)] = 
> pic->pts;
> 
>  pic->display_order = ctx->input_order;
>  ++ctx->input_order;
> @@ -1212,7 +1213,8 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>  return AVERROR(EAGAIN);
>  }
> 
> -while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES *
> sizeof(VAAPIEncodePicture *)) {
> +while (av_fifo_size(ctx->encode_fifo) <
> +MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) {
>  pic = NULL;
>  err = vaapi_encode_pick_next(avctx, &pic);
>  if (err < 0)
> @@ -1234,6 +1236,14 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>  if (!av_fifo_size(ctx->encode_fifo))
>  return err;
> 
> +if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth *
> sizeof(VAAPIEncodePicture *) &&
> +!ctx->end_of_stream) {
> +av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic), NULL);
> +err = vaapi_encode_wait(avctx, pic, 0);
> +if (err < 0)
> +return err;
> +}
> +
>  av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
>  ctx->encode_order = pic->encode_order + 1;
> 
> @@ -1252,7 +1262,7 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>  pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
>  } else {
>  pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
> -(3 * ctx->output_delay)];
> +(3 * ctx->output_delay + ctx->async_depth)];
>  }
>  av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64"
> dts %"PRId64".\n",
> pkt->pts, pkt->dts);
> @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext
> *avctx)
>  }
>  }
> 
> -ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
> -  sizeof(VAAPIEncodePicture *));
> +ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH *
> + sizeof(VAAPIEncodePicture *));
>  if (!ctx->encode_fifo)
>  return AVERROR(ENOMEM);
> 
> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> index 89fe8de466..1bf5d7c337 100644
> --- a/libavcodec/vaapi_encode.h
> +++ b/libavcodec/vaapi_encode.h
> @@ -48,6 +48,7 @@ enum {
>  MAX_TILE_ROWS  = 22,
>  // A.4.1: table A.6 allows at most 20 tile columns for any level.
>  MAX_TILE_COLS  = 20,
> +MAX_ASYNC_DEPTH= 64,
>  };
> 
>  extern const AVCodecHWConfigInternal *const
> ff_vaapi_encode_hw_configs[];
> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
>  // Timestamp handling.
>  int64_t first_pts;
>  int64_t dts_pts_diff;
> -int64_t ts_ring[MAX_REORDER_DELAY * 3];
> +int64_t ts_ring[MAX_REORDER_DELAY * 3 +
> +MAX_ASYNC_DEPTH];
> 
>  // Slice structure.
>  int slice_block_rows;
> @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext {
>  AVFrame *frame;
> 
>  AVFifoBuffer *encode_fifo;
> +
> +int a

[FFmpeg-devel] [PATCH 1/4] libavutil/hwcontext_d3d11va: Add nb_surfaces to AVD3D11VAFramesContext

2021-11-03 Thread Wenbin Chen
Add nb_surfaces at the end of the AVD3D11VAFramesContext structure
to support a flexible size for its arrays and to align with
AVDXVA2FramesContext and AVVAAPIFramesContext.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_d3d11va.c | 3 +--
 libavutil/hwcontext_d3d11va.h | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_d3d11va.c b/libavutil/hwcontext_d3d11va.c
index 8ab96bad25..086e7b9daa 100644
--- a/libavutil/hwcontext_d3d11va.c
+++ b/libavutil/hwcontext_d3d11va.c
@@ -72,7 +72,6 @@ static av_cold void load_functions(void)
 }
 
 typedef struct D3D11VAFramesContext {
-int nb_surfaces;
 int nb_surfaces_used;
 
 DXGI_FORMAT format;
@@ -287,7 +286,7 @@ static int d3d11va_frames_init(AVHWFramesContext *ctx)
 hwctx->texture_infos = av_calloc(ctx->initial_pool_size, 
sizeof(*hwctx->texture_infos));
 if (!hwctx->texture_infos)
 return AVERROR(ENOMEM);
-s->nb_surfaces = ctx->initial_pool_size;
+hwctx->nb_surfaces = ctx->initial_pool_size;
 
 ctx->internal->pool_internal = 
av_buffer_pool_init2(sizeof(AVD3D11FrameDescriptor),
 ctx, 
d3d11va_pool_alloc, NULL);
diff --git a/libavutil/hwcontext_d3d11va.h b/libavutil/hwcontext_d3d11va.h
index 77d2d72f1b..b0df470190 100644
--- a/libavutil/hwcontext_d3d11va.h
+++ b/libavutil/hwcontext_d3d11va.h
@@ -173,6 +173,8 @@ typedef struct AVD3D11VAFramesContext {
  * This field is ignored/invalid if a user-allocated texture is provided.
 */
 AVD3D11FrameDescriptor *texture_infos;
+
+int nb_surfaces;
 } AVD3D11VAFramesContext;
 
 #endif /* AVUTIL_HWCONTEXT_D3D11VA_H */
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/4] libavutil/hwcontext_qsv: fix a bug when malloc handle_pairs_internal

2021-11-03 Thread Wenbin Chen
This command line causes a core dump:
ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
-hwaccel_output_format vaapi -i input.264 \
-vf "hwmap=derive_device=qsv,format=qsv" \
-c:v h264_qsv output.264

Reason: we use nb_surfaces to assign surfaces to handle_pairs_internal,
but handle_pairs_internal is allocated with the size of initial_pool_size.
This leads to an access to an illegal address.

Now use nb_surfaces to allocate handle_pairs_internal, and the
core dump no longer occurs. Also change D3D11VA to use nb_surfaces
to align with VAAPI and DXVA2.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_qsv.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/libavutil/hwcontext_qsv.c b/libavutil/hwcontext_qsv.c
index c18747f7eb..5a285fd25b 100644
--- a/libavutil/hwcontext_qsv.c
+++ b/libavutil/hwcontext_qsv.c
@@ -1123,8 +1123,7 @@ static int qsv_frames_derive_to(AVHWFramesContext 
*dst_ctx,
 case AV_HWDEVICE_TYPE_VAAPI:
 {
 AVVAAPIFramesContext *src_hwctx = src_ctx->hwctx;
-s->handle_pairs_internal = av_calloc(src_ctx->initial_pool_size,
- 
sizeof(*s->handle_pairs_internal));
+s->handle_pairs_internal = av_calloc(src_hwctx->nb_surfaces, 
sizeof(*s->handle_pairs_internal));
 if (!s->handle_pairs_internal)
 return AVERROR(ENOMEM);
 s->surfaces_internal = av_calloc(src_hwctx->nb_surfaces,
@@ -1146,15 +1145,15 @@ static int qsv_frames_derive_to(AVHWFramesContext 
*dst_ctx,
 case AV_HWDEVICE_TYPE_D3D11VA:
 {
 AVD3D11VAFramesContext *src_hwctx = src_ctx->hwctx;
-s->handle_pairs_internal = av_calloc(src_ctx->initial_pool_size,
+s->handle_pairs_internal = av_calloc(src_ctx->nb_surfaces,
  
sizeof(*s->handle_pairs_internal));
 if (!s->handle_pairs_internal)
 return AVERROR(ENOMEM);
-s->surfaces_internal = av_calloc(src_ctx->initial_pool_size,
+s->surfaces_internal = av_calloc(src_ctx->nb_surfaces,
  sizeof(*s->surfaces_internal));
 if (!s->surfaces_internal)
 return AVERROR(ENOMEM);
-for (i = 0; i < src_ctx->initial_pool_size; i++) {
+for (i = 0; i < src_ctx->nb_surfaces; i++) {
 qsv_init_surface(dst_ctx, &s->surfaces_internal[i]);
 s->handle_pairs_internal[i].first = 
(mfxMemId)src_hwctx->texture_infos[i].texture;
 if (src_hwctx->BindFlags & D3D11_BIND_RENDER_TARGET) {
@@ -1164,7 +1163,7 @@ static int qsv_frames_derive_to(AVHWFramesContext 
*dst_ctx,
 }
 s->surfaces_internal[i].Data.MemId = 
(mfxMemId)&s->handle_pairs_internal[i];
 }
-dst_hwctx->nb_surfaces = src_ctx->initial_pool_size;
+dst_hwctx->nb_surfaces = src_ctx->nb_surfaces;
 if (src_hwctx->BindFlags & D3D11_BIND_RENDER_TARGET) {
 dst_hwctx->frame_type |= 
MFX_MEMTYPE_VIDEO_MEMORY_PROCESSOR_TARGET;
 } else {
@@ -1177,7 +1176,7 @@ static int qsv_frames_derive_to(AVHWFramesContext 
*dst_ctx,
 case AV_HWDEVICE_TYPE_DXVA2:
 {
 AVDXVA2FramesContext *src_hwctx = src_ctx->hwctx;
-s->handle_pairs_internal = av_calloc(src_ctx->initial_pool_size,
+s->handle_pairs_internal = av_calloc(src_ctx->nb_surfaces,
  
sizeof(*s->handle_pairs_internal));
 if (!s->handle_pairs_internal)
 return AVERROR(ENOMEM);
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 3/4] libavutil/hwcontext_qsv: fix a bug for mapping vaapi frame to qsv

2021-11-03 Thread Wenbin Chen
From: nyanmisaka 

The data stored in data[3] of a VAAPI AVFrame is a VASurfaceID, while
the data stored in pair->first is a pointer to a VASurfaceID, so
we need to cast and dereference it to make the following command line work:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
-hwaccel_output_format vaapi -i input.264 \
-vf "hwmap=derive_device=qsv,format=qsv" -c:v h264_qsv output.264

Signed-off-by: nyanmisaka 
Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_qsv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_qsv.c b/libavutil/hwcontext_qsv.c
index 5a285fd25b..8075c27862 100644
--- a/libavutil/hwcontext_qsv.c
+++ b/libavutil/hwcontext_qsv.c
@@ -1219,7 +1219,7 @@ static int qsv_map_to(AVHWFramesContext *dst_ctx,
 case AV_PIX_FMT_VAAPI:
 {
 mfxHDLPair *pair = (mfxHDLPair*)hwctx->surfaces[i].Data.MemId;
-if (pair->first == src->data[3]) {
+if (*(VASurfaceID*)pair->first == (VASurfaceID)src->data[3]) {
 index = i;
 break;
 }
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 4/4] libavutil/hwcontext_opencl: fix a bug for mapping qsv frame to opencl

2021-11-03 Thread Wenbin Chen
From: nyanmisaka 

mfxHDLPair was added to qsv, so modify the qsv->opencl map function as well.
Now the following command line works:

ffmpeg -v verbose -init_hw_device vaapi=va:/dev/dri/renderD128 \
-init_hw_device qsv=qs@va -init_hw_device opencl=ocl@va -filter_hw_device ocl \
-hwaccel qsv -hwaccel_output_format qsv -hwaccel_device qs -c:v h264_qsv \
-i input.264 -vf "hwmap=derive_device=opencl,format=opencl,avgblur_opencl, \
hwmap=derive_device=qsv:reverse=1:extra_hw_frames=32,format=qsv" \
-c:v h264_qsv output.264

Signed-off-by: nyanmisaka 
Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_opencl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_opencl.c b/libavutil/hwcontext_opencl.c
index 26a3a24593..4b6e74ff6f 100644
--- a/libavutil/hwcontext_opencl.c
+++ b/libavutil/hwcontext_opencl.c
@@ -2249,7 +2249,8 @@ static int opencl_map_from_qsv(AVHWFramesContext *dst_fc, 
AVFrame *dst,
 #if CONFIG_LIBMFX
 if (src->format == AV_PIX_FMT_QSV) {
 mfxFrameSurface1 *mfx_surface = (mfxFrameSurface1*)src->data[3];
-va_surface = *(VASurfaceID*)mfx_surface->Data.MemId;
+mfxHDLPair *pair = (mfxHDLPair*)mfx_surface->Data.MemId;
+va_surface = *(VASurfaceID*)pair->first;
 } else
 #endif
 if (src->format == AV_PIX_FMT_VAAPI) {
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v3] libavfilter: add a gblur_vulkan filter

2021-11-07 Thread Chen, Wenbin
ptorImageInfo tmp_images[3];
> > +VkDescriptorImageInfo output_images[3];
> > +VkDescriptorBufferInfo params_desc_hor;
> > +VkDescriptorBufferInfo params_desc_ver;
> > +
> > +int initialized;
> > +int size;
> > +int planes;
> > +int kernel_size;
> > +float sigma;
> > +float sigmaV;
> > +AVFrame *tmpframe;
> > +} GBlurVulkanContext;
> > +
> > +static const char gblur_horizontal[] = {
> > +C(0, void gblur(const ivec2 pos, const int index)  
> > )
> > +C(0, { 
> > )
> > +C(1, vec4 sum = texture(input_image[index], pos) *
> kernel[0];  )
> > +C(0,   
> > )
> > +C(1, for(int i = 1; i < kernel.length(); i++) {
> > )
> > +C(2, sum += texture(input_image[index], pos + vec2(i, 0.0)) *
> kernel[i];   )
> > +C(2, sum += texture(input_image[index], pos - vec2(i, 0.0)) *
> kernel[i];   )
> > +C(1, } 
> > )
> > +C(0,   
> > )
> > +C(1, imageStore(output_image[index], pos, sum);
> > )
> > +C(0, } 
> > )
> > +};
> > +
> > +static const char gblur_vertical[] = {
> > +C(0, void gblur(const ivec2 pos, const int index)  
> > )
> > +C(0, { 
> > )
> > +C(1, vec4 sum = texture(input_image[index], pos) *
> kernel[0];  )
> > +C(0,   
> > )
> > +C(1, for(int i = 1; i < kernel.length(); i++) {
> > )
> > +C(2, sum += texture(input_image[index], pos + vec2(0.0, i)) *
> kernel[i];   )
> > +C(2, sum += texture(input_image[index], pos - vec2(0.0, i)) *
> kernel[i];   )
> > +C(1, } 
> > )
> > +C(0,   
> > )
> > +C(1, imageStore(output_image[index], pos, sum);
> > )
> > +C(0, } 
> > )
> > +};
> >
> 
> The reason why avgblur_vulkan is split into horizontal and vertical
> was because you can change the blur radius in either direction.
> This is always going to be square, so to speed it up significantly,
> you can just do it all at once in both directions.
> 
> By the way, I've written a replacement for the synchronization
> system that uses timeline semaphores. The repo is at
> https://github.com/cyanreg/FFmpeg/tree/vulkan
> I'll be pushing it in a few days once someone reviews it.
> You can adapt your patches to that.
> 
> I've CC'd Wenbin Chen here too. I think your single_memory
> flag patch is reasonable, could you rebase it onto the branch and
> resubmit?

Ok, I will resubmit it.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/7] hwcontext_vaapi: Use PRIME_2 memory type for modifiers.

2021-11-09 Thread Wenbin Chen
From: Bas Nieuwenhuizen 

This way we can pass explicit modifiers in. Sometimes the
modifier matters for the number of memory planes that
libva accepts, in particular when dealing with
driver-compressed textures. Furthermore the driver might
not actually be able to determine the implicit modifier
if all the buffer-passing has used explicit modifier.
All these issues should be resolved by passing in the
modifier, and for that we switch to using the PRIME_2
memory type.

Tested with experimental radeonsi patches for modifiers
and kmsgrab. Also tested with radeonsi without the
patches to double-check it works without PRIME_2 support.

v2:
  Cache PRIME_2 support to avoid doing two calls every time on
  libva drivers that do not support it.

v3:
  Remove prime2_vas usage.

Signed-off-by: Bas Nieuwenhuizen 
---
 libavutil/hwcontext_vaapi.c | 158 ++--
 1 file changed, 114 insertions(+), 44 deletions(-)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 83e542876d..75acc851d6 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -79,6 +79,9 @@ typedef struct VAAPIFramesContext {
 unsigned int rt_format;
 // Whether vaDeriveImage works.
 int derive_works;
+// Caches whether VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2 is unsupported for
+// surface imports.
+int prime_2_import_unsupported;
 } VAAPIFramesContext;
 
 typedef struct VAAPIMapping {
@@ -1022,32 +1025,17 @@ static void vaapi_unmap_from_drm(AVHWFramesContext 
*dst_fc,
 static int vaapi_map_from_drm(AVHWFramesContext *src_fc, AVFrame *dst,
   const AVFrame *src, int flags)
 {
+VAAPIFramesContext *src_vafc = src_fc->internal->priv;
 AVHWFramesContext  *dst_fc =
 (AVHWFramesContext*)dst->hw_frames_ctx->data;
 AVVAAPIDeviceContext  *dst_dev = dst_fc->device_ctx->hwctx;
 const AVDRMFrameDescriptor *desc;
 const VAAPIFormatDescriptor *format_desc;
 VASurfaceID surface_id;
-VAStatus vas;
+VAStatus vas = VA_STATUS_SUCCESS;
+int use_prime2;
 uint32_t va_fourcc;
-int err, i, j, k;
-
-unsigned long buffer_handle;
-VASurfaceAttribExternalBuffers buffer_desc;
-VASurfaceAttrib attrs[2] = {
-{
-.type  = VASurfaceAttribMemoryType,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypeInteger,
-.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME,
-},
-{
-.type  = VASurfaceAttribExternalBufferDescriptor,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypePointer,
-.value.value.p = &buffer_desc,
-}
-};
+int err, i, j;
 
 desc = (AVDRMFrameDescriptor*)src->data[0];
 
@@ -1083,35 +1071,117 @@ static int vaapi_map_from_drm(AVHWFramesContext 
*src_fc, AVFrame *dst,
 format_desc = vaapi_format_from_fourcc(va_fourcc);
 av_assert0(format_desc);
 
-buffer_handle = desc->objects[0].fd;
-buffer_desc.pixel_format = va_fourcc;
-buffer_desc.width= src_fc->width;
-buffer_desc.height   = src_fc->height;
-buffer_desc.data_size= desc->objects[0].size;
-buffer_desc.buffers  = &buffer_handle;
-buffer_desc.num_buffers  = 1;
-buffer_desc.flags= 0;
-
-k = 0;
-for (i = 0; i < desc->nb_layers; i++) {
-for (j = 0; j < desc->layers[i].nb_planes; j++) {
-buffer_desc.pitches[k] = desc->layers[i].planes[j].pitch;
-buffer_desc.offsets[k] = desc->layers[i].planes[j].offset;
-++k;
+use_prime2 = !src_vafc->prime_2_import_unsupported &&
+ desc->objects[0].format_modifier != DRM_FORMAT_MOD_INVALID;
+if (use_prime2) {
+VADRMPRIMESurfaceDescriptor prime_desc;
+VASurfaceAttrib prime_attrs[2] = {
+{
+.type  = VASurfaceAttribMemoryType,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypeInteger,
+.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2,
+},
+{
+.type  = VASurfaceAttribExternalBufferDescriptor,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypePointer,
+.value.value.p = &prime_desc,
+}
+};
+prime_desc.fourcc = va_fourcc;
+prime_desc.width = src_fc->width;
+prime_desc.height = src_fc->height;
+prime_desc.num_objects = desc->nb_objects;
+for (i = 0; i < desc->nb_objects; ++i) {
+prime_desc.objects[i].fd = desc->objects[i].fd;
+prime_desc.objects[i].size = desc->objects[i].size;
+prime_desc.objects[i].drm_format_modifier =
+desc->objects[i].format_modifier;
 }
-}
-buffer_desc.num_planes = k;
 
-if (format_desc->chroma_planes

[FFmpeg-devel] [PATCH 3/7] libavutil/hwcontext_vulkan: Add one_memory flag to make vulkan compatible with vaapi device.

2021-11-09 Thread Wenbin Chen
Vaapi can import external surfaces, but all the planes of an external
frame must be in the same drm object. A new flag is introduced so that
vulkan can choose to allocate all planes in one memory according to this flag.
This flag will be enabled when the vulkan device is derived from a vaapi
device, so this change will not affect the current vulkan behaviour.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 6041580117..ccf3e58f49 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -100,6 +100,9 @@ typedef struct VulkanDevicePriv {
 /* Settings */
 int use_linear_images;
 
+/* map all planes to one memory */
+int use_one_memory;
+
 /* Nvidia */
 int dev_is_nvidia;
 } VulkanDevicePriv;
@@ -1245,6 +1248,11 @@ static int 
vulkan_device_create_internal(AVHWDeviceContext *ctx,
 if (opt_d)
 p->use_linear_images = strtol(opt_d->value, NULL, 10);
 
+opt_d = av_dict_get(opts, "one_memory", NULL, 0);
+if (opt_d)
+p->use_one_memory = strtol(opt_d->value, NULL, 10);
+
+
 hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
 hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
 
@@ -1365,8 +1373,10 @@ static int vulkan_device_derive(AVHWDeviceContext *ctx,
 return AVERROR_EXTERNAL;
 }
 
-if (strstr(vendor, "Intel"))
+if (strstr(vendor, "Intel")) {
+av_dict_set_int(&opts, "one_memory", 1, 0);
 dev_select.vendor_id = 0x8086;
+}
 if (strstr(vendor, "AMD"))
 dev_select.vendor_id = 0x1002;
 
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/7] libavutil/hwcontext_vaapi: Add a new nv12 format map to support vulkan frame

2021-11-09 Thread Wenbin Chen
Vulkan will map nv12 to R8 and GR88, so add this map to vaapi to support
vulkan frame.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vaapi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 75acc851d6..994b744e4d 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -992,6 +992,7 @@ static const struct {
 } vaapi_drm_format_map[] = {
 #ifdef DRM_FORMAT_R8
 DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_RG88),
+DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_GR88),
 #endif
 DRM_MAP(NV12, 1, DRM_FORMAT_NV12),
 #if defined(VA_FOURCC_P010) && defined(DRM_FORMAT_R16)
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 6/7] libavutil/hwcontext_vulkan: fix a sem_wait bug when export drm

2021-11-09 Thread Wenbin Chen
sem_sig_val is wrongly assigned to pWaitSemaphoreValues when exporting to drm.
Fix it by passing a separate sem_wait_val array.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index b857d1a9ed..29ade94b7f 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -1718,7 +1718,7 @@ static int prepare_frame(AVHWFramesContext *hwfc, 
VulkanExecCtx *ectx,
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
-uint64_t sem_sig_val[AV_NUM_DATA_POINTERS];
+uint64_t sem_sig_val[AV_NUM_DATA_POINTERS], 
sem_wait_val[AV_NUM_DATA_POINTERS];
 
 VkImageMemoryBarrier img_bar[AV_NUM_DATA_POINTERS] = { 0 };
 
@@ -1738,6 +1738,7 @@ static int prepare_frame(AVHWFramesContext *hwfc, 
VulkanExecCtx *ectx,
 VkPipelineStageFlagBits wait_st[AV_NUM_DATA_POINTERS];
 for (int i = 0; i < planes; i++) {
 wait_st[i] = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+sem_wait_val[i] = frame->sem_value[i];
 sem_sig_val[i] = frame->sem_value[i] + 1;
 }
 
@@ -1756,7 +1757,7 @@ static int prepare_frame(AVHWFramesContext *hwfc, 
VulkanExecCtx *ectx,
 new_layout = VK_IMAGE_LAYOUT_GENERAL;
 new_access = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT;
 dst_qf = VK_QUEUE_FAMILY_EXTERNAL_KHR;
-s_timeline_sem_info.pWaitSemaphoreValues = sem_sig_val;
+s_timeline_sem_info.pWaitSemaphoreValues = sem_wait_val;
 s_timeline_sem_info.waitSemaphoreValueCount = planes;
 s_info.pWaitSemaphores = frame->sem;
 s_info.pWaitDstStageMask = wait_st;
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 4/7] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-11-09 Thread Wenbin Chen
VAAPI can import external frames, but the planes of an external
frame must be in the same DRM object. Add code to allocate a
vkFrame in one memory; the Vulkan device chooses how to allocate
memory according to the one_memory flag.
A new field is added to AVVkFrame to store the offset of each plane.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 46 +++-
 libavutil/hwcontext_vulkan.h |  1 +
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index ccf3e58f49..f7878ed9c3 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -1600,6 +1600,9 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 FFVulkanFunctions *vk = &p->vkfn;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
+VkMemoryRequirements memory_requirements = { 0 };
+int mem_size = 0;
+int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
 
 AVVulkanDeviceContext *hwctx = ctx->hwctx;
 
@@ -1627,6 +1630,23 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 req.memoryRequirements.size = FFALIGN(req.memoryRequirements.size,
   
p->props.properties.limits.minMemoryMapAlignment);
 
+if (p->use_one_memory) {
+if (ded_req.prefersDedicatedAllocation | 
ded_req.requiresDedicatedAllocation) {
+av_log(hwfc, AV_LOG_ERROR, "Cannot use dedicated allocation 
for intel vaapi\n");
+return AVERROR(EINVAL);
+}
+if (memory_requirements.size == 0) {
+memory_requirements = req.memoryRequirements;
+} else if (memory_requirements.memoryTypeBits != 
req.memoryRequirements.memoryTypeBits) {
+av_log(hwfc, AV_LOG_ERROR, "the param for each planes are not 
the same\n");
+return AVERROR(EINVAL);
+}
+
+mem_size_list[i] = req.memoryRequirements.size;
+mem_size += mem_size_list[i];
+continue;
+}
+
 /* In case the implementation prefers/requires dedicated allocation */
 use_ded_mem = ded_req.prefersDedicatedAllocation |
   ded_req.requiresDedicatedAllocation;
@@ -1648,6 +1668,29 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 bind_info[i].memory = f->mem[i];
 }
 
+if (p->use_one_memory) {
+memory_requirements.size = mem_size;
+
+/* Allocate memory */
+if ((err = alloc_mem(ctx, &memory_requirements,
+f->tiling == VK_IMAGE_TILING_LINEAR ?
+VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT :
+VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+(void *)(((uint8_t *)alloc_pnext)),
+&f->flags, &f->mem[0])))
+return err;
+
+f->size[0] = memory_requirements.size;
+
+for (int i = 0; i < planes; i++) {
+bind_info[i].sType  = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
+bind_info[i].image  = f->img[i];
+bind_info[i].memory = f->mem[0];
+bind_info[i].memoryOffset = i == 0 ? 0 : mem_size_list[i-1];
+f->offset[i] = bind_info[i].memoryOffset;
+}
+}
+
 /* Bind the allocated memory to the images */
 ret = vk->BindImageMemory2(hwctx->act_dev, planes, bind_info);
 if (ret != VK_SUCCESS) {
@@ -2924,7 +2967,8 @@ static int vulkan_map_to_drm(AVHWFramesContext *hwfc, 
AVFrame *dst,
 continue;
 
 vk->GetImageSubresourceLayout(hwctx->act_dev, f->img[i], &sub, 
&layout);
-drm_desc->layers[i].planes[0].offset   = layout.offset;
+drm_desc->layers[i].planes[0].offset   = p->use_one_memory ?
+f->offset[i] : 
layout.offset;
 drm_desc->layers[i].planes[0].pitch= layout.rowPitch;
 }
 
diff --git a/libavutil/hwcontext_vulkan.h b/libavutil/hwcontext_vulkan.h
index 9264f70dbf..efb602ef27 100644
--- a/libavutil/hwcontext_vulkan.h
+++ b/libavutil/hwcontext_vulkan.h
@@ -189,6 +189,7 @@ typedef struct AVVkFrame {
  */
 VkDeviceMemory mem[AV_NUM_DATA_POINTERS];
 size_t size[AV_NUM_DATA_POINTERS];
+size_t offset[AV_NUM_DATA_POINTERS];
 
 /**
  * OR'd flags for all memory allocated
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 5/7] libavutil/hwcontext_vulkan: Add hwupload and hwdownload support when using one_memory flag.

2021-11-09 Thread Wenbin Chen
Add hwupload and hwdownload support to vulkan when frames are allocated
in one memory

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index f7878ed9c3..b857d1a9ed 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -2138,7 +2138,7 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
const AVFrame *src, int flags)
 {
 VkResult ret;
-int err, mapped_mem_count = 0;
+int err, mapped_mem_count = 0, loop = 0;
 AVVkFrame *f = (AVVkFrame *)src->data[0];
 AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
@@ -2167,7 +2167,8 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 dst->width  = src->width;
 dst->height = src->height;
 
-for (int i = 0; i < planes; i++) {
+loop = p->use_one_memory ? 1 : planes;
+for (int i = 0; i < loop; i++) {
 ret = vk->MapMemory(hwctx->act_dev, f->mem[i], 0,
 VK_WHOLE_SIZE, 0, (void **)&dst->data[i]);
 if (ret != VK_SUCCESS) {
@@ -2178,6 +2179,11 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 }
 mapped_mem_count++;
 }
+if (p->use_one_memory) {
+for (int i = 0; i < planes; i++) {
+dst->data[i] = dst->data[0] + f->offset[i];
+}
+}
 
 /* Check if the memory contents matter */
 if (((flags & AV_HWFRAME_MAP_READ) || !(flags & AV_HWFRAME_MAP_OVERWRITE)) 
&&
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 7/7] libavutil/hwcontext_vulkan: specify the modifier to create VKImage

2021-11-09 Thread Wenbin Chen
When a Vulkan image is exported to DRM, the tiling needs to be
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT. Add code to create the Vulkan
image using this tiling.

Now the following command line works:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 
-hwaccel_output_format \
vaapi -i input_1080p.264 -vf "hwmap=derive_device=vulkan,format=vulkan, \
scale_vulkan=1920:1080,hwmap=derive_device=vaapi,format=vaapi" -c:v h264_vaapi 
output.264

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 76 +---
 libavutil/hwcontext_vulkan.h |  5 +++
 2 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 29ade94b7f..e252c2177e 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -1919,6 +1919,7 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
+const int has_modifiers = hwctx->tiling == 
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
 VkExternalImageFormatProperties eprops = {
 .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
 };
@@ -1926,9 +1927,18 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2,
 .pNext = &eprops,
 };
+VkPhysicalDeviceImageDrmFormatModifierInfoEXT phy_dev_mod_info = {
+.sType = 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+.pNext = NULL,
+.pQueueFamilyIndices   = p->qfs,
+.queueFamilyIndexCount = p->num_qfs,
+.sharingMode   = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+  VK_SHARING_MODE_EXCLUSIVE,
+};
 VkPhysicalDeviceExternalImageFormatInfo enext = {
 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO,
 .handleType = exp,
+.pNext = has_modifiers ? &phy_dev_mod_info : NULL,
 };
 VkPhysicalDeviceImageFormatInfo2 pinfo = {
 .sType  = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
@@ -1940,11 +1950,15 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .flags  = VK_IMAGE_CREATE_ALIAS_BIT,
 };
 
-ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
-  &pinfo, &props);
-if (ret == VK_SUCCESS) {
-*iexp |= exp;
-*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+for (int i = 0; i < (has_modifiers ? hwctx->modifier_count : 1); i++) {
+if (has_modifiers && hwctx->modifier_count)
+phy_dev_mod_info.drmFormatModifier = hwctx->modifiers[i];
+ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
+&pinfo, &props);
+if (ret == VK_SUCCESS) {
+*iexp |= exp;
+*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+}
 }
 }
 
@@ -2007,6 +2021,7 @@ fail:
 static void vulkan_frames_uninit(AVHWFramesContext *hwfc)
 {
 VulkanFramesPriv *fp = hwfc->internal->priv;
+AVVulkanFramesContext *hwctx = hwfc->hwctx;
 
 free_exec_ctx(hwfc, &fp->conv_ctx);
 free_exec_ctx(hwfc, &fp->upload_ctx);
@@ -2021,11 +2036,60 @@ static int vulkan_frames_init(AVHWFramesContext *hwfc)
 VulkanFramesPriv *fp = hwfc->internal->priv;
 AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
+const int has_modifiers = !!(p->extensions & FF_VK_EXT_DRM_MODIFIER_FLAGS);
 
 /* Default pool flags */
-hwctx->tiling = hwctx->tiling ? hwctx->tiling : p->use_linear_images ?
+hwctx->tiling = hwctx->tiling ? hwctx->tiling : has_modifiers ?
+VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT : 
p->use_linear_images ?
 VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL;
 
+/* get the supported modifier */
+if (has_modifiers) {
+const VkFormat *fmt = av_vkfmt_from_pixfmt(hwfc->sw_format);
+FFVulkanFunctions *vk = &p->vkfn;
+VkDrmFormatModifierPropertiesEXT mod_props[MAX_VULKAN_MODIFIERS];
+
+VkDrmFormatModifierPropertiesListEXT mod_props_list = {
+.sType = VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT,
+.pNext = NULL,
+.drmFormatModifierCount = 0,
+.pDrmFormatModifierProperties = NULL,
+};
+VkFormatProperties2 prop = {
+.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
+.pNext = &mod

Re: [FFmpeg-devel] [PATCH 1/4] libavutil/hwcontext_d3d11va: Add nb_surfaces to AVD3D11VAFramesContext

2021-11-09 Thread Chen, Wenbin
> Adding nb_surfaces in AVD3D11VAFramesContext in the end of the structure
> to support flexible size of this arrays and align to
> AVDXVA2FramesContext and AVVAAPIFramesContext.
> 
> Signed-off-by Wenbin Chen 
> ---
>  libavutil/hwcontext_d3d11va.c | 3 +--
>  libavutil/hwcontext_d3d11va.h | 2 ++
>  2 files changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/libavutil/hwcontext_d3d11va.c b/libavutil/hwcontext_d3d11va.c
> index 8ab96bad25..086e7b9daa 100644
> --- a/libavutil/hwcontext_d3d11va.c
> +++ b/libavutil/hwcontext_d3d11va.c
> @@ -72,7 +72,6 @@ static av_cold void load_functions(void)
>  }
> 
>  typedef struct D3D11VAFramesContext {
> -int nb_surfaces;
>  int nb_surfaces_used;
> 
>  DXGI_FORMAT format;
> @@ -287,7 +286,7 @@ static int d3d11va_frames_init(AVHWFramesContext
> *ctx)
>  hwctx->texture_infos = av_calloc(ctx->initial_pool_size, sizeof(*hwctx-
> >texture_infos));
>  if (!hwctx->texture_infos)
>  return AVERROR(ENOMEM);
> -s->nb_surfaces = ctx->initial_pool_size;
> +hwctx->nb_surfaces = ctx->initial_pool_size;
> 
>  ctx->internal->pool_internal =
> av_buffer_pool_init2(sizeof(AVD3D11FrameDescriptor),
>  ctx, 
> d3d11va_pool_alloc, NULL);
> diff --git a/libavutil/hwcontext_d3d11va.h b/libavutil/hwcontext_d3d11va.h
> index 77d2d72f1b..b0df470190 100644
> --- a/libavutil/hwcontext_d3d11va.h
> +++ b/libavutil/hwcontext_d3d11va.h
> @@ -173,6 +173,8 @@ typedef struct AVD3D11VAFramesContext {
>   * This field is ignored/invalid if a user-allocated texture is provided.
>  */
>  AVD3D11FrameDescriptor *texture_infos;
> +
> +int nb_surfaces;
>  } AVD3D11VAFramesContext;
> 
>  #endif /* AVUTIL_HWCONTEXT_D3D11VA_H */
> --
> 2.25.1

ping
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/4] libavutil/hwcontext_qsv: fix a bug when malloc handle_pairs_internal

2021-11-09 Thread Chen, Wenbin
> This commandline cause core dumped:
> ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
> -hwaccel_output_format vaapi -i input.264 \
> -vf "hwmap=derive_device=qsv,format=qsv" \
> -c:v h264_qsv output.264
> 
> reason: We use nb_surfaces to assign surface to handle_pairs_internal
> but handle_pairs_internal is alloced with the size of init_pool_size.
> This lead to access to illegal address.
> 
> Now change it to use nb_surfaces to allocate handle_pairs_internal and the
> core dumped error is unseen. Also change D3D11VA to use nb_surfaces
> to align to VAAPI and DXVA2.
> 
> Signed-off-by: Wenbin Chen 
> ---
>  libavutil/hwcontext_qsv.c | 13 ++---
>  1 file changed, 6 insertions(+), 7 deletions(-)
> 
> diff --git a/libavutil/hwcontext_qsv.c b/libavutil/hwcontext_qsv.c
> index c18747f7eb..5a285fd25b 100644
> --- a/libavutil/hwcontext_qsv.c
> +++ b/libavutil/hwcontext_qsv.c
> @@ -1123,8 +1123,7 @@ static int
> qsv_frames_derive_to(AVHWFramesContext *dst_ctx,
>  case AV_HWDEVICE_TYPE_VAAPI:
>  {
>  AVVAAPIFramesContext *src_hwctx = src_ctx->hwctx;
> -s->handle_pairs_internal = av_calloc(src_ctx->initial_pool_size,
> - 
> sizeof(*s->handle_pairs_internal));
> +s->handle_pairs_internal = av_calloc(src_hwctx->nb_surfaces,
> sizeof(*s->handle_pairs_internal));
>  if (!s->handle_pairs_internal)
>  return AVERROR(ENOMEM);
>  s->surfaces_internal = av_calloc(src_hwctx->nb_surfaces,
> @@ -1146,15 +1145,15 @@ static int
> qsv_frames_derive_to(AVHWFramesContext *dst_ctx,
>  case AV_HWDEVICE_TYPE_D3D11VA:
>  {
>  AVD3D11VAFramesContext *src_hwctx = src_ctx->hwctx;
> -s->handle_pairs_internal = av_calloc(src_ctx->initial_pool_size,
> +s->handle_pairs_internal = av_calloc(src_ctx->nb_surfaces,
>   
> sizeof(*s->handle_pairs_internal));
>  if (!s->handle_pairs_internal)
>  return AVERROR(ENOMEM);
> -s->surfaces_internal = av_calloc(src_ctx->initial_pool_size,
> +s->surfaces_internal = av_calloc(src_ctx->nb_surfaces,
>   sizeof(*s->surfaces_internal));
>  if (!s->surfaces_internal)
>  return AVERROR(ENOMEM);
> -for (i = 0; i < src_ctx->initial_pool_size; i++) {
> +for (i = 0; i < src_ctx->nb_surfaces; i++) {
>  qsv_init_surface(dst_ctx, &s->surfaces_internal[i]);
>  s->handle_pairs_internal[i].first = (mfxMemId)src_hwctx-
> >texture_infos[i].texture;
>  if (src_hwctx->BindFlags & D3D11_BIND_RENDER_TARGET) {
> @@ -1164,7 +1163,7 @@ static int
> qsv_frames_derive_to(AVHWFramesContext *dst_ctx,
>  }
>  s->surfaces_internal[i].Data.MemId = (mfxMemId)&s-
> >handle_pairs_internal[i];
>  }
> -dst_hwctx->nb_surfaces = src_ctx->initial_pool_size;
> +dst_hwctx->nb_surfaces = src_ctx->nb_surfaces;
>  if (src_hwctx->BindFlags & D3D11_BIND_RENDER_TARGET) {
>  dst_hwctx->frame_type |=
> MFX_MEMTYPE_VIDEO_MEMORY_PROCESSOR_TARGET;
>  } else {
> @@ -1177,7 +1176,7 @@ static int
> qsv_frames_derive_to(AVHWFramesContext *dst_ctx,
>  case AV_HWDEVICE_TYPE_DXVA2:
>  {
>  AVDXVA2FramesContext *src_hwctx = src_ctx->hwctx;
> -s->handle_pairs_internal = av_calloc(src_ctx->initial_pool_size,
> +s->handle_pairs_internal = av_calloc(src_ctx->nb_surfaces,
>   
> sizeof(*s->handle_pairs_internal));
>  if (!s->handle_pairs_internal)
>  return AVERROR(ENOMEM);
> --
> 2.25.1

ping
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 3/4] libavutil/hwcontext_qsv: fix a bug for mapping vaapi frame to qsv

2021-11-09 Thread Chen, Wenbin
> From: nyanmisaka 
> 
> The data stored in data[3] in VAAPI AVFrame is VASurfaceID while
> the data stored in pair->first is the pointer of VASurfaceID, so
> we need to do cast to make following commandline works:
> 
> ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
> -hwaccel_output_format vaapi -i input.264 \
> -vf "hwmap=derive_device=qsv,format=qsv" -c:v h264_qsv output.264
> 
> Signed-off-by: nyanmisaka 
> Signed-off-by: Wenbin Chen 
> ---
>  libavutil/hwcontext_qsv.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/libavutil/hwcontext_qsv.c b/libavutil/hwcontext_qsv.c
> index 5a285fd25b..8075c27862 100644
> --- a/libavutil/hwcontext_qsv.c
> +++ b/libavutil/hwcontext_qsv.c
> @@ -1219,7 +1219,7 @@ static int qsv_map_to(AVHWFramesContext
> *dst_ctx,
>  case AV_PIX_FMT_VAAPI:
>  {
>  mfxHDLPair *pair = (mfxHDLPair*)hwctx->surfaces[i].Data.MemId;
> -if (pair->first == src->data[3]) {
> +if (*(VASurfaceID*)pair->first == (VASurfaceID)src->data[3]) {
>  index = i;
>  break;
>  }
> --
> 2.25.1

ping
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 4/4] libavutil/hwcontext_opencl: fix a bug for mapping qsv frame to opencl

2021-11-09 Thread Chen, Wenbin
> From: nyanmisaka 
> 
> mfxHDLPair was added to qsv, so modify qsv->opencl map function as well.
> Now the following commandline works:
> 
> ffmpeg -v verbose -init_hw_device vaapi=va:/dev/dri/renderD128 \
> -init_hw_device qsv=qs@va -init_hw_device opencl=ocl@va -
> filter_hw_device ocl \
> -hwaccel qsv -hwaccel_output_format qsv -hwaccel_device qs -c:v h264_qsv
> \
> -i input.264 -vf
> "hwmap=derive_device=opencl,format=opencl,avgblur_opencl, \
> hwmap=derive_device=qsv:reverse=1:extra_hw_frames=32,format=qsv" \
> -c:v h264_qsv output.264
> 
> Signed-off-by: nyanmisaka 
> Signed-off-by: Wenbin Chen 
> ---
>  libavutil/hwcontext_opencl.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/libavutil/hwcontext_opencl.c b/libavutil/hwcontext_opencl.c
> index 26a3a24593..4b6e74ff6f 100644
> --- a/libavutil/hwcontext_opencl.c
> +++ b/libavutil/hwcontext_opencl.c
> @@ -2249,7 +2249,8 @@ static int
> opencl_map_from_qsv(AVHWFramesContext *dst_fc, AVFrame *dst,
>  #if CONFIG_LIBMFX
>  if (src->format == AV_PIX_FMT_QSV) {
>  mfxFrameSurface1 *mfx_surface = (mfxFrameSurface1*)src->data[3];
> -va_surface = *(VASurfaceID*)mfx_surface->Data.MemId;
> +mfxHDLPair *pair = (mfxHDLPair*)mfx_surface->Data.MemId;
> +va_surface = *(VASurfaceID*)pair->first;
>  } else
>  #endif
>  if (src->format == AV_PIX_FMT_VAAPI) {
> --
> 2.25.1

ping
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 6/7] libavutil/hwcontext_vulkan: fix a sem_wait bug when export drm

2021-11-10 Thread Chen, Wenbin
> 9 Nov 2021, 10:18 by wenbin.c...@intel.com:
> 
> > sem_sig_val is wrongly assigned to pWaitSemaphoreValues when export
> drm. Now fix
> > it.
> >
> > Signed-off-by: Wenbin Chen <> wenbin.c...@intel.com> >
> >
> 
> Thanks for spotting this, I fixed that in my patchset and updated branch.
> frame->sem_value is safe to use for waiting, as it's only updated after
> the command buffer is successfully submitted.
> 
Ok, Got it.
 
Thanks
Wenbin
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/4] libavutil/hwcontext_qsv: fix a bug when malloc handle_pairs_internal

2021-11-10 Thread Chen, Wenbin
> > -Original Message-
> > From: ffmpeg-devel  On Behalf Of
> > Chen, Wenbin
> > Sent: Wednesday, November 10, 2021 4:03 AM
> > To: ffmpeg-devel@ffmpeg.org
> > Subject: Re: [FFmpeg-devel] [PATCH 2/4] libavutil/hwcontext_qsv: fix
> > a bug when malloc handle_pairs_internal
> >
> > > This commandline cause core dumped:
> > > ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
> > > -hwaccel_output_format vaapi -i input.264 \
> > > -vf "hwmap=derive_device=qsv,format=qsv" \
> > > -c:v h264_qsv output.264
> > >
> > > reason: We use nb_surfaces to assign surface to
> > handle_pairs_internal
> > > but handle_pairs_internal is alloced with the size of
> > init_pool_size.
> > > This lead to access to illegal address.
> > >
> > > Now change it to use nb_surfaces to allocate handle_pairs_internal
> > and the
> 
> I'm not sure about whether this is right.
> 
> When we look at the top of the qsv_frames_derive_to function that you
> are changing, there is this:
> 
> 
> if (src_ctx->initial_pool_size == 0) {
> av_log(dst_ctx, AV_LOG_ERROR, "Only fixed-size pools can be "
> "mapped to QSV frames.\n");
> return AVERROR(EINVAL);
> }
> 
> It's because QSV doesn't support dynamic pool sizes.
> 
> When we look at the vaapi_pool_alloc function in hwcontext_vaapi.c, we
> can see that:
> 
>   when  initial_pool_size is > 0, the pool cannot grow beyond this value,
>   so nb_surfaces cannot be > initial_pool_size
> 
> So I'm wondering what could have caused the segfault? Which values did
> you have there for nb_surfaces and initial_pool_size?
> 
> 
> > > core dumped error is unseen. Also change D3D11VA to use nb_surfaces
> > > to align to VAAPI and DXVA2.
> 
> Those changes are unrelated to fixing the issue with VAAPI.
> (besides that I don't think these are needed at all)
> 
> Kind regards,
> softworkz

You are right. The real cause is that vaapi_decode_make_config() is called
twice.
The initial_pool_size is changed on the second call. I will resubmit a patch
to fix this.

Thanks
Wenbin
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/4] libavutil/hwcontext_d3d11va: Add nb_surfaces to AVD3D11VAFramesContext

2021-11-10 Thread Chen, Wenbin
> > -Original Message-
> > From: ffmpeg-devel  On Behalf Of
> > Chen, Wenbin
> > Sent: Wednesday, November 10, 2021 4:03 AM
> > To: ffmpeg-devel@ffmpeg.org
> > Subject: Re: [FFmpeg-devel] [PATCH 1/4] libavutil/hwcontext_d3d11va:
> > Add nb_surfaces to AVD3D11VAFramesContext
> >
> > > Adding nb_surfaces in AVD3D11VAFramesContext in the end of the
> > structure
> > > to support flexible size of this arrays and align to
> > > AVDXVA2FramesContext and AVVAAPIFramesContext.
> 
> There is no flexibility in pool size for D3D11 frames contexts. The
> surface count is always identical to initial_pool_size. There's no
> point in exposing it. nb_surfaces could even be removed here.
> 
> Also, this change doesn't align with AVVAAPIFramesContext.
> In fact not even AVDXVA2FramesContext aligns with AVVAAPIFramesContext.
> Both have an nb_surfaces field, but they have different semantics.
> 
> The corresponding field to AVVAAPIFramesContext->nb_surfaces is
> DXVA2FramesContext->nb_surfaces_used, not AVDXVA2FramesContext-
> >nb_surfaces.
> 
> Kind regards,
> softworkz
> 
> 
Yes, nb_surfaces for D3D11 seems useless; I can use initial_pool_size instead.
nb_surfaces in DXVA2 works the same as initial_pool_size, so I can use
initial_pool_size there as well. The original code is fine and I don't need to
change it.
Since I found the real cause of the segmentation fault, I will resubmit the patches.

Thanks
Wenbin
> 
> 
> 
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 4/7] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-11-14 Thread Chen, Wenbin
> 9 Nov 2021, 10:18 by wenbin.c...@intel.com:
> 
> > The vaapi can import external frame, but the planes of the external
> > frames should be in the same drm object. I add a new function to
> > allocate vkFrame in one memory and vulkan device will choose a way
> > to allocate memory according to one_memory flag.
> > A new variable is added to AVVKFrame to store the offset of each plane.
> >
> > Signed-off-by: Wenbin Chen 
> > ---
> >  libavutil/hwcontext_vulkan.c | 46
> +++-
> >  libavutil/hwcontext_vulkan.h |  1 +
> >  2 files changed, 46 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
> > index ccf3e58f49..f7878ed9c3 100644
> > --- a/libavutil/hwcontext_vulkan.c
> > +++ b/libavutil/hwcontext_vulkan.c
> > @@ -1600,6 +1600,9 @@ static int alloc_bind_mem(AVHWFramesContext
> *hwfc, AVVkFrame *f,
> >  FFVulkanFunctions *vk = &p->vkfn;
> >  const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
> >  VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
> > +VkMemoryRequirements memory_requirements = { 0 };
> > +int mem_size = 0;
> > +int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
> >
> >  AVVulkanDeviceContext *hwctx = ctx->hwctx;
> >
> > @@ -1627,6 +1630,23 @@ static int
> alloc_bind_mem(AVHWFramesContext *hwfc, AVVkFrame *f,
> >  req.memoryRequirements.size = FFALIGN(req.memoryRequirements.size,
> >  p->props.properties.limits.minMemoryMapAlignment);
> >
> > +if (p->use_one_memory) {
> > +if (ded_req.prefersDedicatedAllocation |
> ded_req.requiresDedicatedAllocation) {
> > +av_log(hwfc, AV_LOG_ERROR, "Cannot use dedicated allocation
> for intel vaapi\n");
> > +return AVERROR(EINVAL);
> > +}
> >
> 
> We don't set the flag unless the driver tells us to, so if the
> driver asks us to use dedicated memory when it can't handle such
> images, shouldn't the driver just not set this flag?

I check the dedicated allocation flags because I don't know whether the VAAPI
driver supports importing dedicated memory.
Actually, I am not sure whether I need to check them for VAAPI. I can remove the check.

> 
> 
> > +if (memory_requirements.size == 0) {
> > +memory_requirements = req.memoryRequirements;
> > +} else if (memory_requirements.memoryTypeBits !=
> req.memoryRequirements.memoryTypeBits) {
> > +av_log(hwfc, AV_LOG_ERROR, "the param for each planes are
> not the same\n");
> > +return AVERROR(EINVAL);
> > +}
> > +
> > +mem_size_list[i] = req.memoryRequirements.size;
> > +mem_size += mem_size_list[i];
> > +continue;
> > +}
> > +
> >  /* In case the implementation prefers/requires dedicated allocation */
> >  use_ded_mem = ded_req.prefersDedicatedAllocation |
> >  ded_req.requiresDedicatedAllocation;
> > @@ -1648,6 +1668,29 @@ static int
> alloc_bind_mem(AVHWFramesContext *hwfc, AVVkFrame *f,
> >  bind_info[i].memory = f->mem[i];
> >  }
> >
> > +if (p->use_one_memory) {
> > +memory_requirements.size = mem_size;
> > +
> > +/* Allocate memory */
> > +if ((err = alloc_mem(ctx, &memory_requirements,
> > +f->tiling == VK_IMAGE_TILING_LINEAR ?
> > +VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT :
> > +VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
> > +(void *)(((uint8_t *)alloc_pnext)),
> > +&f->flags, &f->mem[0])))
> > +return err;
> > +
> > +f->size[0] = memory_requirements.size;
> > +
> > +for (int i = 0; i < planes; i++) {
> > +bind_info[i].sType  =
> VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
> > +bind_info[i].image  = f->img[i];
> > +bind_info[i].memory = f->mem[0];
> > +bind_info[i].memoryOffset = i == 0 ? 0 : mem_size_list[i-1];
> > +f->offset[i] = bind_info[i].memoryOffset;
> > +}
> > +}
> > +
> >  /* Bind the allocated memory to the images */
> >  ret = vk->BindImageMemory2(hwctx->act_dev, planes, bind_info);
> >  if (ret != VK_SUCCESS) {
> > @@ -2924,7 +2967,8 @@ static int
> vulkan_map_to_drm(AVHWFramesContext *hwfc, 

Re: [FFmpeg-devel] [PATCH 7/7] libavutil/hwcontext_vulkan: specify the modifier to create VKImage

2021-11-15 Thread Chen, Wenbin
> 9 Nov 2021, 10:18 by wenbin.c...@intel.com:
> 
> > When vulkan image exports to drm, the tilling need to be
> > VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT. Now add code to
> create vulkan
> > image using this format.
> >
> > Now the following command line works:
> >
> > ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 -
> hwaccel_output_format \
> > vaapi -i input_1080p.264 -vf "hwmap=derive_device=vulkan,format=vulkan,
> \
> > scale_vulkan=1920:1080,hwmap=derive_device=vaapi,format=vaapi" -c:v
> h264_vaapi output.264
> >
> > Signed-off-by: Wenbin Chen 
> > ---
> >  libavutil/hwcontext_vulkan.c | 76 +-
> --
> >  libavutil/hwcontext_vulkan.h |  5 +++
> >  2 files changed, 75 insertions(+), 6 deletions(-)
> >
> > diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
> > index 29ade94b7f..e252c2177e 100644
> > --- a/libavutil/hwcontext_vulkan.c
> > +++ b/libavutil/hwcontext_vulkan.c
> > @@ -1919,6 +1919,7 @@ static void
> try_export_flags(AVHWFramesContext *hwfc,
> >  AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
> >  VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
> >  FFVulkanFunctions *vk = &p->vkfn;
> > +const int has_modifiers = hwctx->tiling ==
> VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
> >  VkExternalImageFormatProperties eprops = {
> >  .sType =
> VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
> >  };
> > @@ -1926,9 +1927,18 @@ static void
> try_export_flags(AVHWFramesContext *hwfc,
> >  .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2,
> >  .pNext = &eprops,
> >  };
> > +VkPhysicalDeviceImageDrmFormatModifierInfoEXT phy_dev_mod_info
> = {
> > +.sType =
> VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER
> _INFO_EXT,
> > +.pNext = NULL,
> > +.pQueueFamilyIndices   = p->qfs,
> > +.queueFamilyIndexCount = p->num_qfs,
> > +.sharingMode   = p->num_qfs > 1 ?
> VK_SHARING_MODE_CONCURRENT :
> > +  
> > VK_SHARING_MODE_EXCLUSIVE,
> > +};
> >  VkPhysicalDeviceExternalImageFormatInfo enext = {
> >  .sType =
> VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO,
> >  .handleType = exp,
> > +.pNext = has_modifiers ? &phy_dev_mod_info : NULL,
> >  };
> >  VkPhysicalDeviceImageFormatInfo2 pinfo = {
> >  .sType  =
> VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
> > @@ -1940,11 +1950,15 @@ static void
> try_export_flags(AVHWFramesContext *hwfc,
> >  .flags  = VK_IMAGE_CREATE_ALIAS_BIT,
> >  };
> >
> > -ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx-
> >phys_dev,
> > -  &pinfo, &props);
> > -if (ret == VK_SUCCESS) {
> > -*iexp |= exp;
> > -*comp_handle_types |=
> eprops.externalMemoryProperties.compatibleHandleTypes;
> > +for (int i = 0; i < (has_modifiers ? hwctx->modifier_count : 1); i++) {
> > +if (has_modifiers && hwctx->modifier_count)
> > +phy_dev_mod_info.drmFormatModifier = hwctx->modifiers[i];
> > +ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx-
> >phys_dev,
> > +&pinfo, &props);
> > +if (ret == VK_SUCCESS) {
> > +*iexp |= exp;
> > +*comp_handle_types |=
> eprops.externalMemoryProperties.compatibleHandleTypes;
> > +}
> >  }
> >  }
> >
> > @@ -2007,6 +2021,7 @@ fail:
> >  static void vulkan_frames_uninit(AVHWFramesContext *hwfc)
> >  {
> >  VulkanFramesPriv *fp = hwfc->internal->priv;
> > +AVVulkanFramesContext *hwctx = hwfc->hwctx;
> >
> >  free_exec_ctx(hwfc, &fp->conv_ctx);
> >  free_exec_ctx(hwfc, &fp->upload_ctx);
> > @@ -2021,11 +2036,60 @@ static int
> vulkan_frames_init(AVHWFramesContext *hwfc)
> >  VulkanFramesPriv *fp = hwfc->internal->priv;
> >  AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
> >  VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
> > +const int has_modifiers = !!(p->extensions &
> FF_VK_EXT_DRM_MODIFIER_FLAGS);
> >
> >  /* Default pool flags */
> > -hwctx->tiling = hwctx->tiling ? hwctx->tiling : p->use_linear_images ?
> > +hwctx->tiling = hwc

[FFmpeg-devel] [PATCH 1/3] libavcodec/vaapi_decode: fix the problem that init_pool_size < nb_surface

2021-11-16 Thread Wenbin Chen
For VAAPI, if initial_pool_size is not zero, the pool size is fixed.
This means the maximum number of surfaces is initial_pool_size, but when
mapping a VAAPI frame to a QSV frame, initial_pool_size < nb_surfaces. The
cause is that vaapi_decode_make_config() sets initial_pool_size and is called
twice: the first time to initialize the frames context and the second time to
initialize the codec. On the second call initial_pool_size is reset to its
original value, so it is lower than the real size, because the pool size
used to initialize the frames context additionally includes thread_count and
3 (to guarantee 4 base work surfaces). Add code to make sure
initial_pool_size is only set once. Now the following command line works:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
-hwaccel_output_format vaapi -i input.264 \
-vf "hwmap=derive_device=qsv,format=qsv" \
-c:v h264_qsv output.264

Signed-off-by: Wenbin Chen 
---
 libavcodec/vaapi_decode.c | 34 ++
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/libavcodec/vaapi_decode.c b/libavcodec/vaapi_decode.c
index 665af370ed..aab8162989 100644
--- a/libavcodec/vaapi_decode.c
+++ b/libavcodec/vaapi_decode.c
@@ -572,22 +572,24 @@ static int vaapi_decode_make_config(AVCodecContext *avctx,
 if (err < 0)
 goto fail;
 
-frames->initial_pool_size = 1;
-// Add per-codec number of surfaces used for storing reference frames.
-switch (avctx->codec_id) {
-case AV_CODEC_ID_H264:
-case AV_CODEC_ID_HEVC:
-case AV_CODEC_ID_AV1:
-frames->initial_pool_size += 16;
-break;
-case AV_CODEC_ID_VP9:
-frames->initial_pool_size += 8;
-break;
-case AV_CODEC_ID_VP8:
-frames->initial_pool_size += 3;
-break;
-default:
-frames->initial_pool_size += 2;
+if (!frames->initial_pool_size) {
+frames->initial_pool_size = 1;
+// Add per-codec number of surfaces used for storing reference 
frames.
+switch (avctx->codec_id) {
+case AV_CODEC_ID_H264:
+case AV_CODEC_ID_HEVC:
+case AV_CODEC_ID_AV1:
+frames->initial_pool_size += 16;
+break;
+case AV_CODEC_ID_VP9:
+frames->initial_pool_size += 8;
+break;
+case AV_CODEC_ID_VP8:
+frames->initial_pool_size += 3;
+break;
+default:
+frames->initial_pool_size += 2;
+}
 }
 }
 
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/3] libavutil/hwcontext_qsv: fix a bug for mapping vaapi frame to qsv

2021-11-16 Thread Wenbin Chen
From: nyanmisaka 

The data stored in data[3] of a VAAPI AVFrame is a VASurfaceID, while
the data stored in pair->first is a pointer to a VASurfaceID, so we need
to dereference and cast it to make the following command line work:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
-hwaccel_output_format vaapi -i input.264 \
-vf "hwmap=derive_device=qsv,format=qsv" -c:v h264_qsv output.264

Signed-off-by: nyanmisaka 
Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_qsv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_qsv.c b/libavutil/hwcontext_qsv.c
index c18747f7eb..d83754193a 100644
--- a/libavutil/hwcontext_qsv.c
+++ b/libavutil/hwcontext_qsv.c
@@ -1220,7 +1220,7 @@ static int qsv_map_to(AVHWFramesContext *dst_ctx,
 case AV_PIX_FMT_VAAPI:
 {
 mfxHDLPair *pair = (mfxHDLPair*)hwctx->surfaces[i].Data.MemId;
-if (pair->first == src->data[3]) {
+if (*(VASurfaceID*)pair->first == (VASurfaceID)src->data[3]) {
 index = i;
 break;
 }
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 3/3] libavutil/hwcontext_opencl: fix a bug for mapping qsv frame to opencl

2021-11-16 Thread Wenbin Chen
From: nyanmisaka 

mfxHDLPair was added to qsv, so modify the qsv->opencl map function as well.
Now the following command line works:

ffmpeg -v verbose -init_hw_device vaapi=va:/dev/dri/renderD128 \
-init_hw_device qsv=qs@va -init_hw_device opencl=ocl@va -filter_hw_device ocl \
-hwaccel qsv -hwaccel_output_format qsv -hwaccel_device qs -c:v h264_qsv \
-i input.264 -vf "hwmap=derive_device=opencl,format=opencl,avgblur_opencl, \
hwmap=derive_device=qsv:reverse=1:extra_hw_frames=32,format=qsv" \
-c:v h264_qsv output.264

Signed-off-by: nyanmisaka 
Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_opencl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_opencl.c b/libavutil/hwcontext_opencl.c
index 26a3a24593..4b6e74ff6f 100644
--- a/libavutil/hwcontext_opencl.c
+++ b/libavutil/hwcontext_opencl.c
@@ -2249,7 +2249,8 @@ static int opencl_map_from_qsv(AVHWFramesContext *dst_fc, 
AVFrame *dst,
 #if CONFIG_LIBMFX
 if (src->format == AV_PIX_FMT_QSV) {
 mfxFrameSurface1 *mfx_surface = (mfxFrameSurface1*)src->data[3];
-va_surface = *(VASurfaceID*)mfx_surface->Data.MemId;
+mfxHDLPair *pair = (mfxHDLPair*)mfx_surface->Data.MemId;
+va_surface = *(VASurfaceID*)pair->first;
 } else
 #endif
 if (src->format == AV_PIX_FMT_VAAPI) {
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 4/7] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-11-21 Thread Chen, Wenbin
> 19 Nov 2021, 19:13 by d...@lynne.ee:
> 
> > 19 Nov 2021, 18:59 by d...@lynne.ee:
> >
> >> 15 Nov 2021, 08:25 by wenbin.c...@intel.com:
> >>
> >>>> 9 Nov 2021, 10:18 by wenbin.c...@intel.com:
> >>>>
> >>>> > The vaapi can import external frame, but the planes of the external
> >>>> > frames should be in the same drm object. I add a new function to
> >>>> > allocate vkFrame in one memory and vulkan device will choose a way
> >>>> > to allocate memory according to one_memory flag.
> >>>> > A new variable is added to AVVKFrame to store the offset of each
> plane.
> >>>> >
> >>>> > Signed-off-by: Wenbin Chen 
> >>>> > ---
> >>>> >  libavutil/hwcontext_vulkan.c | 46
> >>>> +++-
> >>>> >  libavutil/hwcontext_vulkan.h |  1 +
> >>>> >  2 files changed, 46 insertions(+), 1 deletion(-)
> >>>> >
> >>>> > diff --git a/libavutil/hwcontext_vulkan.c
> b/libavutil/hwcontext_vulkan.c
> >>>> > index ccf3e58f49..f7878ed9c3 100644
> >>>> > --- a/libavutil/hwcontext_vulkan.c
> >>>> > +++ b/libavutil/hwcontext_vulkan.c
> >>>> > @@ -1600,6 +1600,9 @@ static int
> alloc_bind_mem(AVHWFramesContext
> >>>> *hwfc, AVVkFrame *f,
> >>>> >  FFVulkanFunctions *vk = &p->vkfn;
> >>>> >  const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
> >>>> >  VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] =
> { { 0 } };
> >>>> > +VkMemoryRequirements memory_requirements = { 0 };
> >>>> > +int mem_size = 0;
> >>>> > +int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
> >>>> >
> >>>> >  AVVulkanDeviceContext *hwctx = ctx->hwctx;
> >>>> >
> >>>> > @@ -1627,6 +1630,23 @@ static int
> >>>> alloc_bind_mem(AVHWFramesContext *hwfc, AVVkFrame *f,
> >>>> >  req.memoryRequirements.size =
> FFALIGN(req.memoryRequirements.size,
> >>>> >  p->props.properties.limits.minMemoryMapAlignment);
> >>>> >
> >>>> > +if (p->use_one_memory) {
> >>>> > +if (ded_req.prefersDedicatedAllocation |
> >>>> ded_req.requiresDedicatedAllocation) {
> >>>> > +av_log(hwfc, AV_LOG_ERROR, "Cannot use dedicated
> allocation
> >>>> for intel vaapi\n");
> >>>> > +return AVERROR(EINVAL);
> >>>> > +}
> >>>> >
> >>>>
> >>>> We don't set the flag unless the driver tells us to, so if the
> >>>> driver asks us to use dedicated memory when it can't handle such
> >>>> images, shouldn't the driver just not set this flag?
> >>>>
> >>>
> >>> I check the dedicatedAllocation flag because I don't know if vaapi driver
> >>> support importing dedicated memory.
> >>> Actually I am not sure if I need to check this flag for vaapi. I can 
> >>> remove it.
> >>>
> >>>>
> >>>>
> >>>> > +if (memory_requirements.size == 0) {
> >>>> > +memory_requirements = req.memoryRequirements;
> >>>> > +} else if (memory_requirements.memoryTypeBits !=
> >>>> req.memoryRequirements.memoryTypeBits) {
> >>>> > +av_log(hwfc, AV_LOG_ERROR, "the param for each planes
> are
> >>>> not the same\n");
> >>>> > +return AVERROR(EINVAL);
> >>>> > +}
> >>>> > +
> >>>> > +mem_size_list[i] = req.memoryRequirements.size;
> >>>> > +mem_size += mem_size_list[i];
> >>>> > +continue;
> >>>> > +}
> >>>> > +
> >>>> >  /* In case the implementation prefers/requires dedicated allocation
> */
> >>>> >  use_ded_mem = ded_req.prefersDedicatedAllocation |
> >>>> >  ded_req.requiresDedicatedAllocation;
> >>>> > @@ -1648,6 +1668,29 @@ static int
> >>>> alloc_bind_mem(AVHWFramesContext *hwfc, AVVkFrame *f,
> >>>> >  bind_info[i].memory = f

Re: [FFmpeg-devel] [PATCH v3 1/1] avutils/hwcontext: When deriving a hwdevice, search for existing device in both directions

2021-11-21 Thread Chen, Wenbin
> > -Original Message-
> > From: ffmpeg-devel  On Behalf Of
> > Xiang, Haihao
> > Sent: Monday, October 18, 2021 6:48 AM
> > To: ffmpeg-devel@ffmpeg.org
> > Subject: Re: [FFmpeg-devel] [PATCH v3 1/1] avutils/hwcontext: When
> > deriving a hwdevice, search for existing device in both directions
> >
> > On Mon, 2021-10-11 at 04:19 +, Soft Works wrote:
> > > The test /libavutil/tests/hwdevice checks that when deriving a
> > device
> > > from a source device and then deriving back to the type of the
> > source
> > > device, the result is matching the original source device, i.e. the
> > > derivation mechanism doesn't create a new device in this case.
> > >
> > > Previously, this test was usually passed, but only due to two
> > different
> > > kind of flaws:
> > >
> > > 1. The test covers only a single level of derivation (and back)
> > >
> > > It derives device Y from device X and then Y back to the type of X
> > and
> > > checks whether the result matches X.
> > >
> > > What it doesn't check for, are longer chains of derivation like:
> > >
> > > CUDA1 > OpenCL2 > CUDA3 and then back to OpenCL4
> > >
> > > In that case, the second derivation returns the first device (CUDA3
> > ==
> > > CUDA1), but when deriving OpenCL4, hwcontext.c was creating a new
> > > OpenCL4 context instead of returning OpenCL2, because there was no
> > link
> > > from CUDA1 to OpenCL2 (only backwards from OpenCL2 to CUDA1)
> > >
> > > If the test would check for two levels of derivation, it would have
> > > failed.
> > >
> > > This patch fixes those (yet untested) cases by introducing forward
> > > references (derived_device) in addition to the existing back
> > references
> > > (source_device).
> > >
> > > 2. hwcontext_qsv didn't properly set the source_device
> > >
> > > In case of QSV, hwcontext_qsv creates a source context internally
> > > (vaapi, dxva2 or d3d11va) without calling
> > av_hwdevice_ctx_create_derived
> > > and without setting source_device.
> > >
> > > This way, the hwcontext test ran successful, but what practically
> > > happened, was that - for example - deriving vaapi from qsv didn't
> > return
> > > the original underlying vaapi device and a new one was created
> > instead:
> > > Exactly what the test is intended to detect and prevent. It just
> > > couldn't do so, because the original device was hidden (= not set
> > as the
> > > source_device of the QSV device).
> > >
> > > This patch properly makes these setting and fixes all derivation
> > > scenarios.
> > >
> > > (at a later stage, /libavutil/tests/hwdevice should be extended to
> > check
> > > longer derivation chains as well)
> > >
> > > Signed-off-by: softworkz 
> > > ---
> > > v3: avoid double-release as suggested by Haihao
> > >
> > >  libavutil/hwcontext.c  | 38
> > ++
> > >  libavutil/hwcontext.h  |  1 +
> > >  libavutil/hwcontext_internal.h |  6 ++
> > >  libavutil/hwcontext_qsv.c  | 16 ++
> > >  4 files changed, 57 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/libavutil/hwcontext.c b/libavutil/hwcontext.c
> > > index 31c7840dba..1a50635018 100644
> > > --- a/libavutil/hwcontext.c
> > > +++ b/libavutil/hwcontext.c
> > > @@ -122,6 +122,7 @@ static const AVClass hwdevice_ctx_class = {
> > >  static void hwdevice_ctx_free(void *opaque, uint8_t *data)
> > >  {
> > >  AVHWDeviceContext *ctx = (AVHWDeviceContext*)data;
> > > +int i;
> > >
> > >  /* uninit might still want access the hw context and the user
> > >   * free() callback might destroy it, so uninit has to be
> > called first */
> > > @@ -132,6 +133,8 @@ static void hwdevice_ctx_free(void *opaque,
> > uint8_t *data)
> > >  ctx->free(ctx);
> > >
> > >  av_buffer_unref(&ctx->internal->source_device);
> > > +for (i = 0; i < AV_HWDEVICE_TYPE_NB; i++)
> > > +av_buffer_unref(&ctx->internal->derived_devices[i]);
> > >
> > >  av_freep(&ctx->hwctx);
> > >  av_freep(&ctx->internal->priv);
> > > @@ -643,6 +646,26 @@ fail:
> > >  return ret;
> > >  }
> > >
> > > +static AVBufferRef* find_derived_hwdevice_ctx(AVBufferRef
> > *src_ref, enum
> > > AVHWDeviceType type)
> > > +{
> > > +AVBufferRef *tmp_ref;
> > > +AVHWDeviceContext *src_ctx;
> > > +int i;
> > > +
> > > +src_ctx = (AVHWDeviceContext*)src_ref->data;
> > > +if (src_ctx->type == type)
> > > +return src_ref;
> > > +
> > > +for (i = 0; i < AV_HWDEVICE_TYPE_NB; i++)
> > > +if (src_ctx->internal->derived_devices[i]) {
> > > +tmp_ref = find_derived_hwdevice_ctx(src_ctx->internal-
> > > >derived_devices[i], type);
> > > +if (tmp_ref)
> > > +return tmp_ref;
> > > +}
> > > +
> > > +return NULL;
> > > +}
> > > +
> > >  int av_hwdevice_ctx_create_derived_opts(AVBufferRef **dst_ref_ptr,
> > >  enum AVHWDeviceType type,
> > >  AVBufferRef *src_ref,
> > > 

Re: [FFmpeg-devel] [PATCH 1/3] libavcodec/vaapi_decode: fix the problem that init_pool_size < nb_surface

2021-11-22 Thread Chen, Wenbin
> For vaapi if the init_pool_size is not zero, the pool size is fixed.
> This means max surfaces is init_pool_size, but when mapping vaapi
> frame to qsv frame, the init_pool_size < nb_surface. The cause is that
> vaapi_decode_make_config() config the init_pool_size and it is called
> twice. The first time is to init frame_context and the second time is to
> init codec. On the second time the init_pool_size is changed to original
> value so the init_pool_size is lower than the reall size because
> pool_size used to initialize frame_context need to plus thread_count and
> 3 (guarantee 4 base work surfaces). Now add code to make sure
> init_pool_size is only set once. Now the following commandline works:
> 
> ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
> -hwaccel_output_format vaapi -i input.264 \
> -vf "hwmap=derive_device=qsv,format=qsv" \
> -c:v h264_qsv output.264
> 
> Signed-off-by: Wenbin Chen 
> ---
>  libavcodec/vaapi_decode.c | 34 ++
>  1 file changed, 18 insertions(+), 16 deletions(-)
> 
> diff --git a/libavcodec/vaapi_decode.c b/libavcodec/vaapi_decode.c
> index 665af370ed..aab8162989 100644
> --- a/libavcodec/vaapi_decode.c
> +++ b/libavcodec/vaapi_decode.c
> @@ -572,22 +572,24 @@ static int
> vaapi_decode_make_config(AVCodecContext *avctx,
>  if (err < 0)
>  goto fail;
> 
> -frames->initial_pool_size = 1;
> -// Add per-codec number of surfaces used for storing reference 
> frames.
> -switch (avctx->codec_id) {
> -case AV_CODEC_ID_H264:
> -case AV_CODEC_ID_HEVC:
> -case AV_CODEC_ID_AV1:
> -frames->initial_pool_size += 16;
> -break;
> -case AV_CODEC_ID_VP9:
> -frames->initial_pool_size += 8;
> -break;
> -case AV_CODEC_ID_VP8:
> -frames->initial_pool_size += 3;
> -break;
> -default:
> -frames->initial_pool_size += 2;
> +if (!frames->initial_pool_size) {
> +frames->initial_pool_size = 1;
> +// Add per-codec number of surfaces used for storing reference
> frames.
> +switch (avctx->codec_id) {
> +case AV_CODEC_ID_H264:
> +case AV_CODEC_ID_HEVC:
> +case AV_CODEC_ID_AV1:
> +frames->initial_pool_size += 16;
> +break;
> +case AV_CODEC_ID_VP9:
> +frames->initial_pool_size += 8;
> +break;
> +case AV_CODEC_ID_VP8:
> +frames->initial_pool_size += 3;
> +break;
> +default:
> +frames->initial_pool_size += 2;
> +}
>  }
>  }
> 
> --
> 2.25.1
> 

ping

> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/3] libavutil/hwcontext_qsv: fix a bug for mapping vaapi frame to qsv

2021-11-22 Thread Chen, Wenbin
> From: nyanmisaka 
> 
> The data stored in data[3] in VAAPI AVFrame is VASurfaceID while
> the data stored in pair->first is the pointer of VASurfaceID, so
> we need to do cast to make following commandline works:
> 
> ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
> -hwaccel_output_format vaapi -i input.264 \
> -vf "hwmap=derive_device=qsv,format=qsv" -c:v h264_qsv output.264
> 
> Signed-off-by: nyanmisaka 
> Signed-off-by: Wenbin Chen 
> ---
>  libavutil/hwcontext_qsv.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/libavutil/hwcontext_qsv.c b/libavutil/hwcontext_qsv.c
> index c18747f7eb..d83754193a 100644
> --- a/libavutil/hwcontext_qsv.c
> +++ b/libavutil/hwcontext_qsv.c
> @@ -1220,7 +1220,7 @@ static int qsv_map_to(AVHWFramesContext
> *dst_ctx,
>  case AV_PIX_FMT_VAAPI:
>  {
>  mfxHDLPair *pair = (mfxHDLPair*)hwctx->surfaces[i].Data.MemId;
> -if (pair->first == src->data[3]) {
> +if (*(VASurfaceID*)pair->first == (VASurfaceID)src->data[3]) {
>  index = i;
>  break;
>  }
> --
> 2.25.1
> 

ping

> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 3/3] libavutil/hwcontext_opencl: fix a bug for mapping qsv frame to opencl

2021-11-22 Thread Chen, Wenbin
> From: nyanmisaka 
> 
> mfxHDLPair was added to qsv, so modify qsv->opencl map function as well.
> Now the following commandline works:
> 
> ffmpeg -v verbose -init_hw_device vaapi=va:/dev/dri/renderD128 \
> -init_hw_device qsv=qs@va -init_hw_device opencl=ocl@va -
> filter_hw_device ocl \
> -hwaccel qsv -hwaccel_output_format qsv -hwaccel_device qs -c:v h264_qsv
> \
> -i input.264 -vf
> "hwmap=derive_device=opencl,format=opencl,avgblur_opencl, \
> hwmap=derive_device=qsv:reverse=1:extra_hw_frames=32,format=qsv" \
> -c:v h264_qsv output.264
> 
> Signed-off-by: nyanmisaka 
> Signed-off-by: Wenbin Chen 
> ---
>  libavutil/hwcontext_opencl.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/libavutil/hwcontext_opencl.c b/libavutil/hwcontext_opencl.c
> index 26a3a24593..4b6e74ff6f 100644
> --- a/libavutil/hwcontext_opencl.c
> +++ b/libavutil/hwcontext_opencl.c
> @@ -2249,7 +2249,8 @@ static int
> opencl_map_from_qsv(AVHWFramesContext *dst_fc, AVFrame *dst,
>  #if CONFIG_LIBMFX
>  if (src->format == AV_PIX_FMT_QSV) {
>  mfxFrameSurface1 *mfx_surface = (mfxFrameSurface1*)src->data[3];
> -va_surface = *(VASurfaceID*)mfx_surface->Data.MemId;
> +mfxHDLPair *pair = (mfxHDLPair*)mfx_surface->Data.MemId;
> +va_surface = *(VASurfaceID*)pair->first;
>  } else
>  #endif
>  if (src->format == AV_PIX_FMT_VAAPI) {
> --
> 2.25.1
> 

ping

> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V2 2/5] libavutil/hwcontext_vaapi: Add a new nv12 format map to support vulkan frame

2021-11-23 Thread Wenbin Chen
Vulkan maps NV12 to R8 and GR88, so add this mapping to vaapi to support
Vulkan frames.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vaapi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 75acc851d6..994b744e4d 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -992,6 +992,7 @@ static const struct {
 } vaapi_drm_format_map[] = {
 #ifdef DRM_FORMAT_R8
 DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_RG88),
+DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_GR88),
 #endif
 DRM_MAP(NV12, 1, DRM_FORMAT_NV12),
 #if defined(VA_FOURCC_P010) && defined(DRM_FORMAT_R16)
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V2 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-11-23 Thread Wenbin Chen
VAAPI can import external frames, but the planes of an external frame
must be in the same DRM object. A new option "contiguous_planes" is
added to the device. This flag tells the device to allocate all planes
in one memory allocation. When the device is derived from vaapi this
flag is enabled. A new flag, frame_flag, is also added to
AVVulkanFramesContext. The user can use this flag to force-enable or
disable this behaviour. A new variable "offset" is added to AVVkFrame.
It describes the offset from the memory currently bound to the VkImage.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 62 ++--
 libavutil/hwcontext_vulkan.h | 22 +
 2 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index f1e750cd3e..4100e8b0a2 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -103,6 +103,9 @@ typedef struct VulkanDevicePriv {
 /* Settings */
 int use_linear_images;
 
+/* allocate planes in a contiguous memory */
+int contiguous_planes;
+
 /* Nvidia */
 int dev_is_nvidia;
 } VulkanDevicePriv;
@@ -1266,6 +1269,11 @@ static int 
vulkan_device_create_internal(AVHWDeviceContext *ctx,
 if (opt_d)
 p->use_linear_images = strtol(opt_d->value, NULL, 10);
 
+opt_d = av_dict_get(opts, "contiguous_planes", NULL, 0);
+if (opt_d)
+p->contiguous_planes = strtol(opt_d->value, NULL, 10);
+
+
 hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
 hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
 
@@ -1410,8 +1418,10 @@ static int vulkan_device_derive(AVHWDeviceContext *ctx,
 return AVERROR_EXTERNAL;
 }
 
-if (strstr(vendor, "Intel"))
+if (strstr(vendor, "Intel")) {
+av_dict_set_int(&opts, "contiguous_planes", 1, 0);
 dev_select.vendor_id = 0x8086;
+}
 if (strstr(vendor, "AMD"))
 dev_select.vendor_id = 0x1002;
 
@@ -1634,8 +1644,12 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 AVHWDeviceContext *ctx = hwfc->device_ctx;
 VulkanDevicePriv *p = ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
+AVVulkanFramesContext *hwfctx = hwfc->hwctx;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
+VkMemoryRequirements memory_requirements = { 0 };
+int mem_size = 0;
+int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
 
 AVVulkanDeviceContext *hwctx = ctx->hwctx;
 
@@ -1663,6 +1677,19 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 req.memoryRequirements.size = FFALIGN(req.memoryRequirements.size,
   
p->props.properties.limits.minMemoryMapAlignment);
 
+if (hwfctx->contiguous_planes == AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY) {
+if (memory_requirements.size == 0) {
+memory_requirements = req.memoryRequirements;
+} else if (memory_requirements.memoryTypeBits != 
req.memoryRequirements.memoryTypeBits) {
+av_log(hwfc, AV_LOG_ERROR, "the param for each planes are not 
the same\n");
+return AVERROR(EINVAL);
+}
+
+mem_size_list[i] = req.memoryRequirements.size;
+mem_size += mem_size_list[i];
+continue;
+}
+
 /* In case the implementation prefers/requires dedicated allocation */
 use_ded_mem = ded_req.prefersDedicatedAllocation |
   ded_req.requiresDedicatedAllocation;
@@ -1684,6 +1711,29 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 bind_info[i].memory = f->mem[i];
 }
 
+if (hwfctx->contiguous_planes == AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY) {
+memory_requirements.size = mem_size;
+
+/* Allocate memory */
+if ((err = alloc_mem(ctx, &memory_requirements,
+f->tiling == VK_IMAGE_TILING_LINEAR ?
+VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT :
+VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+(void *)(((uint8_t *)alloc_pnext)),
+&f->flags, &f->mem[0])))
+return err;
+
+f->size[0] = memory_requirements.size;
+
+for (int i = 0; i < planes; i++) {
+bind_info[i].sType  = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
+bind_info[i].image  = f->img[i];
+bind_info[i].memory = f->mem[0];
+bind_info[i].memoryOffset = i == 0 ? 0 : mem_size_list[i-1];
+f->offset[i] = bind_info[i].memoryOffset;
+}
+ 

[FFmpeg-devel] [PATCH V2 1/5] hwcontext_vaapi: Use PRIME_2 memory type for modifiers.

2021-11-23 Thread Wenbin Chen
From: Bas Nieuwenhuizen 

This way we can pass explicit modifiers in. Sometimes the
modifier matters for the number of memory planes that
libva accepts, in particular when dealing with
driver-compressed textures. Furthermore the driver might
not actually be able to determine the implicit modifier
if all the buffer-passing has used explicit modifier.
All these issues should be resolved by passing in the
modifier, and for that we switch to using the PRIME_2
memory type.

Tested with experimental radeonsi patches for modifiers
and kmsgrab. Also tested with radeonsi without the
patches to double-check it works without PRIME_2 support.

v2:
  Cache PRIME_2 support to avoid doing two calls every time on
  libva drivers that do not support it.

v3:
  Remove prime2_vas usage.

Signed-off-by: Bas Nieuwenhuizen 
---
 libavutil/hwcontext_vaapi.c | 158 ++--
 1 file changed, 114 insertions(+), 44 deletions(-)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 83e542876d..75acc851d6 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -79,6 +79,9 @@ typedef struct VAAPIFramesContext {
 unsigned int rt_format;
 // Whether vaDeriveImage works.
 int derive_works;
+// Caches whether VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2 is unsupported for
+// surface imports.
+int prime_2_import_unsupported;
 } VAAPIFramesContext;
 
 typedef struct VAAPIMapping {
@@ -1022,32 +1025,17 @@ static void vaapi_unmap_from_drm(AVHWFramesContext 
*dst_fc,
 static int vaapi_map_from_drm(AVHWFramesContext *src_fc, AVFrame *dst,
   const AVFrame *src, int flags)
 {
+VAAPIFramesContext *src_vafc = src_fc->internal->priv;
 AVHWFramesContext  *dst_fc =
 (AVHWFramesContext*)dst->hw_frames_ctx->data;
 AVVAAPIDeviceContext  *dst_dev = dst_fc->device_ctx->hwctx;
 const AVDRMFrameDescriptor *desc;
 const VAAPIFormatDescriptor *format_desc;
 VASurfaceID surface_id;
-VAStatus vas;
+VAStatus vas = VA_STATUS_SUCCESS;
+int use_prime2;
 uint32_t va_fourcc;
-int err, i, j, k;
-
-unsigned long buffer_handle;
-VASurfaceAttribExternalBuffers buffer_desc;
-VASurfaceAttrib attrs[2] = {
-{
-.type  = VASurfaceAttribMemoryType,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypeInteger,
-.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME,
-},
-{
-.type  = VASurfaceAttribExternalBufferDescriptor,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypePointer,
-.value.value.p = &buffer_desc,
-}
-};
+int err, i, j;
 
 desc = (AVDRMFrameDescriptor*)src->data[0];
 
@@ -1083,35 +1071,117 @@ static int vaapi_map_from_drm(AVHWFramesContext 
*src_fc, AVFrame *dst,
 format_desc = vaapi_format_from_fourcc(va_fourcc);
 av_assert0(format_desc);
 
-buffer_handle = desc->objects[0].fd;
-buffer_desc.pixel_format = va_fourcc;
-buffer_desc.width= src_fc->width;
-buffer_desc.height   = src_fc->height;
-buffer_desc.data_size= desc->objects[0].size;
-buffer_desc.buffers  = &buffer_handle;
-buffer_desc.num_buffers  = 1;
-buffer_desc.flags= 0;
-
-k = 0;
-for (i = 0; i < desc->nb_layers; i++) {
-for (j = 0; j < desc->layers[i].nb_planes; j++) {
-buffer_desc.pitches[k] = desc->layers[i].planes[j].pitch;
-buffer_desc.offsets[k] = desc->layers[i].planes[j].offset;
-++k;
+use_prime2 = !src_vafc->prime_2_import_unsupported &&
+ desc->objects[0].format_modifier != DRM_FORMAT_MOD_INVALID;
+if (use_prime2) {
+VADRMPRIMESurfaceDescriptor prime_desc;
+VASurfaceAttrib prime_attrs[2] = {
+{
+.type  = VASurfaceAttribMemoryType,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypeInteger,
+.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2,
+},
+{
+.type  = VASurfaceAttribExternalBufferDescriptor,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypePointer,
+.value.value.p = &prime_desc,
+}
+};
+prime_desc.fourcc = va_fourcc;
+prime_desc.width = src_fc->width;
+prime_desc.height = src_fc->height;
+prime_desc.num_objects = desc->nb_objects;
+for (i = 0; i < desc->nb_objects; ++i) {
+prime_desc.objects[i].fd = desc->objects[i].fd;
+prime_desc.objects[i].size = desc->objects[i].size;
+prime_desc.objects[i].drm_format_modifier =
+desc->objects[i].format_modifier;
 }
-}
-buffer_desc.num_planes = k;
 
-if (format_desc->chroma_planes

[FFmpeg-devel] [PATCH V2 4/5] libavutil/hwcontext_vulkan: Add hwupload and hwdownload support when using contiguous_planes flag.

2021-11-23 Thread Wenbin Chen
Add hwupload and hwdownload support to vulkan when frames are allocated
in one memory

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 4100e8b0a2..6421115385 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -2212,9 +2212,10 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
const AVFrame *src, int flags)
 {
 VkResult ret;
-int err, mapped_mem_count = 0;
+int err, mapped_mem_count = 0, loop = 0;
 AVVkFrame *f = (AVVkFrame *)src->data[0];
 AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
+AVVulkanFramesContext *hwfctx = hwfc->hwctx;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
@@ -2241,7 +2242,9 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 dst->width  = src->width;
 dst->height = src->height;
 
-for (int i = 0; i < planes; i++) {
+loop = hwfctx->contiguous_planes == AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY ?
+   1 : planes;
+for (int i = 0; i < loop; i++) {
 ret = vk->MapMemory(hwctx->act_dev, f->mem[i], 0,
 VK_WHOLE_SIZE, 0, (void **)&dst->data[i]);
 if (ret != VK_SUCCESS) {
@@ -2252,6 +2255,11 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 }
 mapped_mem_count++;
 }
+if (hwfctx->contiguous_planes == AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY) {
+for (int i = 0; i < planes; i++) {
+dst->data[i] = dst->data[0] + f->offset[i];
+}
+}
 
 /* Check if the memory contents matter */
 if (((flags & AV_HWFRAME_MAP_READ) || !(flags & AV_HWFRAME_MAP_OVERWRITE)) 
&&
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V2 5/5] libavutil/hwcontext_vulkan: specify the modifier to create VKImage

2021-11-23 Thread Wenbin Chen
When a vulkan image is exported to drm, the tiling needs to be
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT. Now add code to create the
vulkan image with this tiling.

Now the following command line works:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
-hwaccel_output_format vaapi -i input_1080p.264 \
-vf "hwmap=derive_device=vulkan,format=vulkan, \
scale_vulkan=1920:1080,hwmap=derive_device=vaapi,format=vaapi" \
-c:v h264_vaapi output.264

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 130 +--
 1 file changed, 124 insertions(+), 6 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 6421115385..4b951fb202 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -230,6 +230,28 @@ const VkFormat *av_vkfmt_from_pixfmt(enum AVPixelFormat p)
 return NULL;
 }
 
+static void *find_in_structure_list(VkBaseOutStructure *stru_list, 
VkStructureType sType) {
+if (!stru_list)
+return NULL;
+
+for(;stru_list;stru_list = stru_list->pNext)
+if (stru_list->sType == sType)
+return stru_list;
+
+return NULL;
+}
+
+static void append_to_structure_list(VkBaseOutStructure **stru_list, 
VkBaseOutStructure *added_stru) {
+VkBaseOutStructure *p;
+if (!*stru_list) {
+*stru_list = added_stru;
+return;
+}
+for(p = *stru_list; p->pNext; p = p->pNext);
+p->pNext = added_stru;
+return;
+}
+
 static int pixfmt_is_supported(AVHWDeviceContext *dev_ctx, enum AVPixelFormat 
p,
int linear)
 {
@@ -1979,6 +2001,10 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
+const int has_modifiers = hwctx->tiling == 
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+int loop_count;
+VkImageDrmFormatModifierListCreateInfoEXT *modifier_info = 
find_in_structure_list(hwctx->create_pnext,
+
VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT);
 VkExternalImageFormatProperties eprops = {
 .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
 };
@@ -1986,9 +2012,18 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2,
 .pNext = &eprops,
 };
+VkPhysicalDeviceImageDrmFormatModifierInfoEXT phy_dev_mod_info = {
+.sType = 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+.pNext = NULL,
+.pQueueFamilyIndices   = p->qfs,
+.queueFamilyIndexCount = p->num_qfs,
+.sharingMode   = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+  VK_SHARING_MODE_EXCLUSIVE,
+};
 VkPhysicalDeviceExternalImageFormatInfo enext = {
 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO,
 .handleType = exp,
+.pNext = has_modifiers && modifier_info ? &phy_dev_mod_info : NULL,
 };
 VkPhysicalDeviceImageFormatInfo2 pinfo = {
 .sType  = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
@@ -2000,11 +2035,16 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .flags  = VK_IMAGE_CREATE_ALIAS_BIT,
 };
 
-ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
-  &pinfo, &props);
-if (ret == VK_SUCCESS) {
-*iexp |= exp;
-*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+loop_count = has_modifiers && modifier_info ? 
modifier_info->drmFormatModifierCount : 1;
+for (int i = 0; i < loop_count; i++) {
+if (has_modifiers && modifier_info)
+phy_dev_mod_info.drmFormatModifier = 
modifier_info->pDrmFormatModifiers[i];
+ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
+&pinfo, &props);
+if (ret == VK_SUCCESS) {
+*iexp |= exp;
+*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+}
 }
 }
 
@@ -2074,6 +2114,20 @@ fail:
 static void vulkan_frames_uninit(AVHWFramesContext *hwfc)
 {
 VulkanFramesPriv *fp = hwfc->internal->priv;
+AVVulkanFramesContext *hwctx = hwfc->hwctx;
+VkBaseOutStructure *structure_p_next,*structure_p = hwctx->create_pnext;
+VkImageDrmFormatModifierListCreateInfoEXT *modifier_info;
+while (structure_p) {
+structure_p_next = structure_p->pNext;
+switch (structure_p->sType) {
+case VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INF

Re: [FFmpeg-devel] [PATCH V2 1/5] hwcontext_vaapi: Use PRIME_2 memory type for modifiers.

2021-11-24 Thread Chen, Wenbin
> 24 Nov 2021, 06:28 by wenbin.c...@intel.com:
> 
> > From: Bas Nieuwenhuizen 
> >
> > This way we can pass explicit modifiers in. Sometimes the
> > modifier matters for the number of memory planes that
> > libva accepts, in particular when dealing with
> > driver-compressed textures. Furthermore the driver might
> > not actually be able to determine the implicit modifier
> > if all the buffer-passing has used explicit modifier.
> > All these issues should be resolved by passing in the
> > modifier, and for that we switch to using the PRIME_2
> > memory type.
> >
> > Tested with experimental radeonsi patches for modifiers
> > and kmsgrab. Also tested with radeonsi without the
> > patches to double-check it works without PRIME_2 support.
> >
> > v2:
> >  Cache PRIME_2 support to avoid doing two calls every time on
> >  libva drivers that do not support it.
> >
> > v3:
> >  Remove prime2_vas usage.
> >
> 
> I've pinged jkqxz, the maintainer, but to me it looks good,
> and considering how long this has been on the ML, I'll
> apply it alongside the Vulkan patches if he doesn't respond.
> By the way, it mentions the author is Bas Nieuwenhuizen,
> should I change the commit author when I apply?

Please change commit author to Bas Nieuwenhuizen.
Thanks.

> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH V2 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-11-24 Thread Chen, Wenbin
> 24 Nov 2021, 06:28 by wenbin.c...@intel.com:
> 
> > The vaapi can import external frame, but the planes of the external
> > frames should be in the same drm object. A new option
> "contiguous_planes"
> > is added to device. This flag tells device to allocate places in one
> > memory. When device is derived from vaapi this flag will be enabled.
> > A new flag frame_flag is also added to AVVulkanFramesContext. User
> > can use this flag to force enable or disable this behaviour.
> > A new variable "offset "is added to AVVKFrame. It describe describe the
> > offset from the memory currently bound to the VkImage.
> >
> > Signed-off-by: Wenbin Chen 
> > ---
> >  libavutil/hwcontext_vulkan.c | 62
> ++--
> >  libavutil/hwcontext_vulkan.h | 22 +
> >  2 files changed, 82 insertions(+), 2 deletions(-)
> >
> > diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
> > index f1e750cd3e..4100e8b0a2 100644
> > --- a/libavutil/hwcontext_vulkan.c
> > +++ b/libavutil/hwcontext_vulkan.c
> > @@ -103,6 +103,9 @@ typedef struct VulkanDevicePriv {
> >  /* Settings */
> >  int use_linear_images;
> >
> > +/* allocate planes in a contiguous memory */
> > +int contiguous_planes;
> > +
> >  /* Nvidia */
> >  int dev_is_nvidia;
> >
> 
> Add a new `int dev_is_intel;` field, and set it
> in `vulkan_device_init()`, where `dev_is_nvidia` is set.
> 
> 
> >  } VulkanDevicePriv;
> > @@ -1266,6 +1269,11 @@ static int
> vulkan_device_create_internal(AVHWDeviceContext *ctx,
> >  if (opt_d)
> >  p->use_linear_images = strtol(opt_d->value, NULL, 10);
> >
> > +opt_d = av_dict_get(opts, "contiguous_planes", NULL, 0);
> > +if (opt_d)
> > +p->contiguous_planes = strtol(opt_d->value, NULL, 10);
> >
> 
> Set `p->contiguous_planes` to -1 if not specified.
> 
> 
> >  hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
> >  hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
> >
> > @@ -1410,8 +1418,10 @@ static int
> vulkan_device_derive(AVHWDeviceContext *ctx,
> >  return AVERROR_EXTERNAL;
> >  }
> >
> > -if (strstr(vendor, "Intel"))
> > +if (strstr(vendor, "Intel")) {
> > +av_dict_set_int(&opts, "contiguous_planes", 1, 0);
> >
> 
> Don't set this here, it's not needed with the changes I mentioned.
> 
> 
> >  dev_select.vendor_id = 0x8086;
> > +}
> >  if (strstr(vendor, "AMD"))
> >  dev_select.vendor_id = 0x1002;
> >
> > @@ -1634,8 +1644,12 @@ static int
> alloc_bind_mem(AVHWFramesContext *hwfc, AVVkFrame *f,
> >  AVHWDeviceContext *ctx = hwfc->device_ctx;
> >  VulkanDevicePriv *p = ctx->internal->priv;
> >  FFVulkanFunctions *vk = &p->vkfn;
> > +AVVulkanFramesContext *hwfctx = hwfc->hwctx;
> >  const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
> >  VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
> > +VkMemoryRequirements memory_requirements = { 0 };
> > +int mem_size = 0;
> > +int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
> >
> >  AVVulkanDeviceContext *hwctx = ctx->hwctx;
> >
> > @@ -1663,6 +1677,19 @@ static int
> alloc_bind_mem(AVHWFramesContext *hwfc, AVVkFrame *f,
> >  req.memoryRequirements.size = FFALIGN(req.memoryRequirements.size,
> >  p->props.properties.limits.minMemoryMapAlignment);
> >
> > +if (hwfctx->contiguous_planes ==
> AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY) {
> >
> 
> Bitwise &. Not equals.
> 
> 
> > +if (memory_requirements.size == 0) {
> > +memory_requirements = req.memoryRequirements;
> > +} else if (memory_requirements.memoryTypeBits !=
> req.memoryRequirements.memoryTypeBits) {
> > +av_log(hwfc, AV_LOG_ERROR, "the param for each planes are
> not the same\n");
> > +return AVERROR(EINVAL);
> > +}
> > +
> > +mem_size_list[i] = req.memoryRequirements.size;
> > +mem_size += mem_size_list[i];
> > +continue;
> > +}
> > +
> >  /* In case the implementation prefers/requires dedicated allocation */
> >  use_ded_mem = ded_req.prefersDedicatedAllocation |
> >  ded_req.requiresDedicatedAllocation;
> > @@ -1684,6 +1711,29 @@ static int
> all

[FFmpeg-devel] [PATCH V3 1/5] hwcontext_vaapi: Use PRIME_2 memory type for modifiers.

2021-11-25 Thread Wenbin Chen
From: Bas Nieuwenhuizen 

This way we can pass explicit modifiers in. Sometimes the
modifier matters for the number of memory planes that
libva accepts, in particular when dealing with
driver-compressed textures. Furthermore the driver might
not actually be able to determine the implicit modifier
if all the buffer-passing has used explicit modifier.
All these issues should be resolved by passing in the
modifier, and for that we switch to using the PRIME_2
memory type.

Tested with experimental radeonsi patches for modifiers
and kmsgrab. Also tested with radeonsi without the
patches to double-check it works without PRIME_2 support.

v2:
  Cache PRIME_2 support to avoid doing two calls every time on
  libva drivers that do not support it.

v3:
  Remove prime2_vas usage.

Signed-off-by: Bas Nieuwenhuizen 
---
 libavutil/hwcontext_vaapi.c | 158 ++--
 1 file changed, 114 insertions(+), 44 deletions(-)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 83e542876d..75acc851d6 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -79,6 +79,9 @@ typedef struct VAAPIFramesContext {
 unsigned int rt_format;
 // Whether vaDeriveImage works.
 int derive_works;
+// Caches whether VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2 is unsupported for
+// surface imports.
+int prime_2_import_unsupported;
 } VAAPIFramesContext;
 
 typedef struct VAAPIMapping {
@@ -1022,32 +1025,17 @@ static void vaapi_unmap_from_drm(AVHWFramesContext 
*dst_fc,
 static int vaapi_map_from_drm(AVHWFramesContext *src_fc, AVFrame *dst,
   const AVFrame *src, int flags)
 {
+VAAPIFramesContext *src_vafc = src_fc->internal->priv;
 AVHWFramesContext  *dst_fc =
 (AVHWFramesContext*)dst->hw_frames_ctx->data;
 AVVAAPIDeviceContext  *dst_dev = dst_fc->device_ctx->hwctx;
 const AVDRMFrameDescriptor *desc;
 const VAAPIFormatDescriptor *format_desc;
 VASurfaceID surface_id;
-VAStatus vas;
+VAStatus vas = VA_STATUS_SUCCESS;
+int use_prime2;
 uint32_t va_fourcc;
-int err, i, j, k;
-
-unsigned long buffer_handle;
-VASurfaceAttribExternalBuffers buffer_desc;
-VASurfaceAttrib attrs[2] = {
-{
-.type  = VASurfaceAttribMemoryType,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypeInteger,
-.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME,
-},
-{
-.type  = VASurfaceAttribExternalBufferDescriptor,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypePointer,
-.value.value.p = &buffer_desc,
-}
-};
+int err, i, j;
 
 desc = (AVDRMFrameDescriptor*)src->data[0];
 
@@ -1083,35 +1071,117 @@ static int vaapi_map_from_drm(AVHWFramesContext 
*src_fc, AVFrame *dst,
 format_desc = vaapi_format_from_fourcc(va_fourcc);
 av_assert0(format_desc);
 
-buffer_handle = desc->objects[0].fd;
-buffer_desc.pixel_format = va_fourcc;
-buffer_desc.width= src_fc->width;
-buffer_desc.height   = src_fc->height;
-buffer_desc.data_size= desc->objects[0].size;
-buffer_desc.buffers  = &buffer_handle;
-buffer_desc.num_buffers  = 1;
-buffer_desc.flags= 0;
-
-k = 0;
-for (i = 0; i < desc->nb_layers; i++) {
-for (j = 0; j < desc->layers[i].nb_planes; j++) {
-buffer_desc.pitches[k] = desc->layers[i].planes[j].pitch;
-buffer_desc.offsets[k] = desc->layers[i].planes[j].offset;
-++k;
+use_prime2 = !src_vafc->prime_2_import_unsupported &&
+ desc->objects[0].format_modifier != DRM_FORMAT_MOD_INVALID;
+if (use_prime2) {
+VADRMPRIMESurfaceDescriptor prime_desc;
+VASurfaceAttrib prime_attrs[2] = {
+{
+.type  = VASurfaceAttribMemoryType,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypeInteger,
+.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2,
+},
+{
+.type  = VASurfaceAttribExternalBufferDescriptor,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypePointer,
+.value.value.p = &prime_desc,
+}
+};
+prime_desc.fourcc = va_fourcc;
+prime_desc.width = src_fc->width;
+prime_desc.height = src_fc->height;
+prime_desc.num_objects = desc->nb_objects;
+for (i = 0; i < desc->nb_objects; ++i) {
+prime_desc.objects[i].fd = desc->objects[i].fd;
+prime_desc.objects[i].size = desc->objects[i].size;
+prime_desc.objects[i].drm_format_modifier =
+desc->objects[i].format_modifier;
 }
-}
-buffer_desc.num_planes = k;
 
-if (format_desc->chroma_planes

[FFmpeg-devel] [PATCH V3 2/5] libavutil/hwcontext_vaapi: Add a new nv12 format map to support vulkan frame

2021-11-25 Thread Wenbin Chen
Vulkan maps NV12 to R8 and GR88, so add this mapping to VAAPI to support
Vulkan frames.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vaapi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 75acc851d6..994b744e4d 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -992,6 +992,7 @@ static const struct {
 } vaapi_drm_format_map[] = {
 #ifdef DRM_FORMAT_R8
 DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_RG88),
+DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_GR88),
 #endif
 DRM_MAP(NV12, 1, DRM_FORMAT_NV12),
 #if defined(VA_FOURCC_P010) && defined(DRM_FORMAT_R16)
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V3 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-11-25 Thread Wenbin Chen
VAAPI can import external frames, but the planes of an external
frame must be in the same DRM object. A new option "contiguous_planes"
is added to the device. This flag tells the device to allocate all planes in one
memory. When the device is derived from VAAPI this flag will be enabled.
A new flag frame_flag is also added to AVVulkanFramesContext. Users
can use this flag to force-enable or disable this behaviour.
A new variable "offset" is added to AVVkFrame. It describes the
offset from the memory currently bound to the VkImage.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 67 +++-
 libavutil/hwcontext_vulkan.h | 24 +
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 644ed947f8..b8076fb425 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -103,8 +103,14 @@ typedef struct VulkanDevicePriv {
 /* Settings */
 int use_linear_images;
 
+/* allocate planes in a contiguous memory */
+int contiguous_planes;
+
 /* Nvidia */
 int dev_is_nvidia;
+
+/* Intel */
+int dev_is_intel;
 } VulkanDevicePriv;
 
 typedef struct VulkanFramesPriv {
@@ -146,6 +152,8 @@ typedef struct AVVkFrameInternal {
 }  
\
 } while(0)
 
+#define VKF_FLAG(x, f) (((x) & (~AV_VK_FRAME_FLAG_NONE)) & (f))
+
 static const struct {
 enum AVPixelFormat pixfmt;
 const VkFormat vkfmts[4];
@@ -1268,6 +1276,13 @@ static int 
vulkan_device_create_internal(AVHWDeviceContext *ctx,
 if (opt_d)
 p->use_linear_images = strtol(opt_d->value, NULL, 10);
 
+opt_d = av_dict_get(opts, "contiguous_planes", NULL, 0);
+if (opt_d)
+p->contiguous_planes = strtol(opt_d->value, NULL, 10);
+else
+p->contiguous_planes = -1;
+
+
 hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
 hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
 
@@ -1319,6 +1334,8 @@ static int vulkan_device_init(AVHWDeviceContext *ctx)
 
 p->dev_is_nvidia = (p->props.properties.vendorID == 0x10de);
 
+p->dev_is_intel = (p->props.properties.vendorID == 0x8086);
+
 vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &queue_num, 
NULL);
 if (!queue_num) {
 av_log(ctx, AV_LOG_ERROR, "Failed to get queues!\n");
@@ -1636,8 +1653,12 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 AVHWDeviceContext *ctx = hwfc->device_ctx;
 VulkanDevicePriv *p = ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
+AVVulkanFramesContext *hwfctx = hwfc->hwctx;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
+VkMemoryRequirements memory_requirements = { 0 };
+int mem_size = 0;
+int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
 
 AVVulkanDeviceContext *hwctx = ctx->hwctx;
 
@@ -1665,6 +1686,19 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 req.memoryRequirements.size = FFALIGN(req.memoryRequirements.size,
   
p->props.properties.limits.minMemoryMapAlignment);
 
+if (VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
+if (memory_requirements.size == 0) {
+memory_requirements = req.memoryRequirements;
+} else if (memory_requirements.memoryTypeBits != 
req.memoryRequirements.memoryTypeBits) {
+av_log(hwfc, AV_LOG_ERROR, "the param for each planes are not 
the same\n");
+return AVERROR(EINVAL);
+}
+
+mem_size_list[i] = req.memoryRequirements.size;
+mem_size += mem_size_list[i];
+continue;
+}
+
 /* In case the implementation prefers/requires dedicated allocation */
 use_ded_mem = ded_req.prefersDedicatedAllocation |
   ded_req.requiresDedicatedAllocation;
@@ -1686,6 +1720,29 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 bind_info[i].memory = f->mem[i];
 }
 
+if (VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
+memory_requirements.size = mem_size;
+
+/* Allocate memory */
+if ((err = alloc_mem(ctx, &memory_requirements,
+f->tiling == VK_IMAGE_TILING_LINEAR ?
+VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT :
+VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+(void *)(((uint8_t *)alloc_pnext)),
+&f->flags, &f->mem[0])))
+return err;
+
+f

[FFmpeg-devel] [PATCH V3 4/5] libavutil/hwcontext_vulkan: Add support to hwmap to software frame when using contiguous_planes flag.

2021-11-25 Thread Wenbin Chen
Add support to map vulkan frames to software frames when
using contiguous_planes flag.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index b8076fb425..0648e59243 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -2221,9 +2221,10 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
const AVFrame *src, int flags)
 {
 VkResult ret;
-int err, mapped_mem_count = 0;
+int err, mapped_mem_count = 0, loop = 0;
 AVVkFrame *f = (AVVkFrame *)src->data[0];
 AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
+AVVulkanFramesContext *hwfctx = hwfc->hwctx;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
@@ -2250,7 +2251,9 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 dst->width  = src->width;
 dst->height = src->height;
 
-for (int i = 0; i < planes; i++) {
+loop = VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY) ?
+   1 : planes;
+for (int i = 0; i < loop; i++) {
 ret = vk->MapMemory(hwctx->act_dev, f->mem[i], 0,
 VK_WHOLE_SIZE, 0, (void **)&dst->data[i]);
 if (ret != VK_SUCCESS) {
@@ -2261,6 +2264,10 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 }
 mapped_mem_count++;
 }
+if (VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
+for (int i = 0; i < planes; i++)
+dst->data[i] = dst->data[0] + f->offset[i];
+}
 
 /* Check if the memory contents matter */
 if (((flags & AV_HWFRAME_MAP_READ) || !(flags & AV_HWFRAME_MAP_OVERWRITE)) 
&&
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V3 5/5] libavutil/hwcontext_vulkan: specify the modifier to create VKImage

2021-11-25 Thread Wenbin Chen
When a Vulkan image is exported to DRM, the tiling needs to be
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT. Add code to create the Vulkan
image using this tiling.

Now the following command line works:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 
-hwaccel_output_format \
vaapi -i input_1080p.264 -vf "hwmap=derive_device=vulkan,format=vulkan, \
scale_vulkan=1920:1080,hwmap=derive_device=vaapi,format=vaapi" -c:v h264_vaapi 
output.264

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 127 +--
 1 file changed, 121 insertions(+), 6 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 0648e59243..903a75618a 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -120,6 +120,9 @@ typedef struct VulkanFramesPriv {
 /* Image transfers */
 VulkanExecCtx upload_ctx;
 VulkanExecCtx download_ctx;
+
+/*modifier info*/
+VkImageDrmFormatModifierListCreateInfoEXT *modifier_info;
 } VulkanFramesPriv;
 
 typedef struct AVVkFrameInternal {
@@ -235,6 +238,28 @@ const VkFormat *av_vkfmt_from_pixfmt(enum AVPixelFormat p)
 return NULL;
 }
 
+static void *find_in_structure_list(VkBaseOutStructure *stru_list, 
VkStructureType sType) {
+if (!stru_list)
+return NULL;
+
+for(;stru_list;stru_list = stru_list->pNext)
+if (stru_list->sType == sType)
+return stru_list;
+
+return NULL;
+}
+
+static void append_to_structure_list(VkBaseOutStructure **stru_list, 
VkBaseOutStructure *added_stru) {
+VkBaseOutStructure *p;
+if (!*stru_list) {
+*stru_list = added_stru;
+return;
+}
+for(p = *stru_list; p->pNext; p = p->pNext);
+p->pNext = added_stru;
+return;
+}
+
 static int pixfmt_is_supported(AVHWDeviceContext *dev_ctx, enum AVPixelFormat 
p,
int linear)
 {
@@ -1988,6 +2013,10 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
+const int has_modifiers = hwctx->tiling == 
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+int loop_count;
+VkImageDrmFormatModifierListCreateInfoEXT *modifier_info = 
find_in_structure_list(hwctx->create_pnext,
+
VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT);
 VkExternalImageFormatProperties eprops = {
 .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
 };
@@ -1995,9 +2024,18 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2,
 .pNext = &eprops,
 };
+VkPhysicalDeviceImageDrmFormatModifierInfoEXT phy_dev_mod_info = {
+.sType = 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+.pNext = NULL,
+.pQueueFamilyIndices   = p->qfs,
+.queueFamilyIndexCount = p->num_qfs,
+.sharingMode   = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+  VK_SHARING_MODE_EXCLUSIVE,
+};
 VkPhysicalDeviceExternalImageFormatInfo enext = {
 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO,
 .handleType = exp,
+.pNext = has_modifiers && modifier_info ? &phy_dev_mod_info : NULL,
 };
 VkPhysicalDeviceImageFormatInfo2 pinfo = {
 .sType  = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
@@ -2009,11 +2047,16 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .flags  = VK_IMAGE_CREATE_ALIAS_BIT,
 };
 
-ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
-  &pinfo, &props);
-if (ret == VK_SUCCESS) {
-*iexp |= exp;
-*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+loop_count = has_modifiers && modifier_info ? 
modifier_info->drmFormatModifierCount : 1;
+for (int i = 0; i < loop_count; i++) {
+if (has_modifiers && modifier_info)
+phy_dev_mod_info.drmFormatModifier = 
modifier_info->pDrmFormatModifiers[i];
+ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
+&pinfo, &props);
+if (ret == VK_SUCCESS) {
+*iexp |= exp;
+*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+}
 }
 }
 
@@ -2084,6 +2127,12 @@ static void vulkan_frames_uninit(AVHWFramesContext *hwfc)
 {
 VulkanFramesPriv *fp = hwfc->internal->priv;
 
+if (fp->modifier_info) {
+if (fp->modifier_info->pDrmFormatModifiers)
+

Re: [FFmpeg-devel] [PATCH V3 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-11-28 Thread Chen, Wenbin
> 26 Nov 2021, 03:54 by wenbin.c...@intel.com:
> 
> > The vaapi can import external frame, but the planes of the external
> > frames should be in the same drm object. A new option
> "contiguous_planes"
> > is added to device. This flag tells device to allocate places in one
> > memory. When device is derived from vaapi this flag will be enabled.
> > A new flag frame_flag is also added to AVVulkanFramesContext. User
> > can use this flag to force enable or disable this behaviour.
> > A new variable "offset "is added to AVVKFrame. It describe describe the
> > offset from the memory currently bound to the VkImage.
> >
> > Signed-off-by: Wenbin Chen 
> >
> 
> Why is a new offset variable needed?
> vkGetImageSubresourceLayout is valid for DRM tiled images.
> According to the specs,
> "If the image’s tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
> then vkGetImageSubresourceLayout describes one memory plane of the
> image. If the image’s tiling is
> VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT and the image is non-
> linear, then the returned layout has an implementation-dependent meaning;
> the vendor of the image’s DRM format modifier may provide documentation
> that explains how to interpret the returned layout.".
> 
> Isn't this what you already have in the offset field?

The offset we get from vkGetImageSubresourceLayout is relative to the start of the
image or plane.
The offset the DRM object needs is relative to the start of the memory, and
vkGetImageSubresourceLayout
cannot provide this information. I added a new offset variable because I allocate all
planes in one memory,
not because I use VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT.

> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 3/3] libavutil/hwcontext_opencl: fix a bug for mapping qsv frame to opencl

2021-11-29 Thread Chen, Wenbin
> Quoting Wenbin Chen (2021-11-16 09:16:23)
> > From: nyanmisaka 
> >
> > mfxHDLPair was added to qsv, so modify qsv->opencl map function as well.
> > Now the following commandline works:
> >
> > ffmpeg -v verbose -init_hw_device vaapi=va:/dev/dri/renderD128 \
> > -init_hw_device qsv=qs@va -init_hw_device opencl=ocl@va -
> filter_hw_device ocl \
> > -hwaccel qsv -hwaccel_output_format qsv -hwaccel_device qs -c:v
> h264_qsv \
> > -i input.264 -vf
> "hwmap=derive_device=opencl,format=opencl,avgblur_opencl, \
> > hwmap=derive_device=qsv:reverse=1:extra_hw_frames=32,format=qsv" \
> > -c:v h264_qsv output.264
> >
> > Signed-off-by: nyanmisaka 
> > Signed-off-by: Wenbin Chen 
> > ---
> >  libavutil/hwcontext_opencl.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavutil/hwcontext_opencl.c b/libavutil/hwcontext_opencl.c
> > index 26a3a24593..4b6e74ff6f 100644
> > --- a/libavutil/hwcontext_opencl.c
> > +++ b/libavutil/hwcontext_opencl.c
> > @@ -2249,7 +2249,8 @@ static int
> opencl_map_from_qsv(AVHWFramesContext *dst_fc, AVFrame *dst,
> >  #if CONFIG_LIBMFX
> >  if (src->format == AV_PIX_FMT_QSV) {
> >  mfxFrameSurface1 *mfx_surface = (mfxFrameSurface1*)src->data[3];
> > -va_surface = *(VASurfaceID*)mfx_surface->Data.MemId;
> > +mfxHDLPair *pair = (mfxHDLPair*)mfx_surface->Data.MemId;
> > +va_surface = *(VASurfaceID*)pair->first;
> >  } else
> >  #endif
> 
> The casts here are starting to look like overly arcane black magic.
> Who is responsible for setting MemId here? I assume it's something in
> hwcontext_qsv, but is that guaranteed?
> 
> It would be better for hwcontext_qsv to abstract reading/writing MemId
> contents into a function/macro, so other code can just call it and not
> hardcode internal implementation details.
> 
> --
> Anton Khirnov

Thanks for review. I will resubmit the patchset

> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V4 1/5] hwcontext_vaapi: Use PRIME_2 memory type for modifiers.

2021-11-29 Thread Wenbin Chen
From: Bas Nieuwenhuizen 

This way we can pass explicit modifiers in. Sometimes the
modifier matters for the number of memory planes that
libva accepts, in particular when dealing with
driver-compressed textures. Furthermore the driver might
not actually be able to determine the implicit modifier
if all the buffer-passing has used explicit modifier.
All these issues should be resolved by passing in the
modifier, and for that we switch to using the PRIME_2
memory type.

Tested with experimental radeonsi patches for modifiers
and kmsgrab. Also tested with radeonsi without the
patches to double-check it works without PRIME_2 support.

v2:
  Cache PRIME_2 support to avoid doing two calls every time on
  libva drivers that do not support it.

v3:
  Remove prime2_vas usage.

Signed-off-by: Bas Nieuwenhuizen 
---
 libavutil/hwcontext_vaapi.c | 158 ++--
 1 file changed, 114 insertions(+), 44 deletions(-)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 83e542876d..75acc851d6 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -79,6 +79,9 @@ typedef struct VAAPIFramesContext {
 unsigned int rt_format;
 // Whether vaDeriveImage works.
 int derive_works;
+// Caches whether VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2 is unsupported for
+// surface imports.
+int prime_2_import_unsupported;
 } VAAPIFramesContext;
 
 typedef struct VAAPIMapping {
@@ -1022,32 +1025,17 @@ static void vaapi_unmap_from_drm(AVHWFramesContext 
*dst_fc,
 static int vaapi_map_from_drm(AVHWFramesContext *src_fc, AVFrame *dst,
   const AVFrame *src, int flags)
 {
+VAAPIFramesContext *src_vafc = src_fc->internal->priv;
 AVHWFramesContext  *dst_fc =
 (AVHWFramesContext*)dst->hw_frames_ctx->data;
 AVVAAPIDeviceContext  *dst_dev = dst_fc->device_ctx->hwctx;
 const AVDRMFrameDescriptor *desc;
 const VAAPIFormatDescriptor *format_desc;
 VASurfaceID surface_id;
-VAStatus vas;
+VAStatus vas = VA_STATUS_SUCCESS;
+int use_prime2;
 uint32_t va_fourcc;
-int err, i, j, k;
-
-unsigned long buffer_handle;
-VASurfaceAttribExternalBuffers buffer_desc;
-VASurfaceAttrib attrs[2] = {
-{
-.type  = VASurfaceAttribMemoryType,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypeInteger,
-.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME,
-},
-{
-.type  = VASurfaceAttribExternalBufferDescriptor,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypePointer,
-.value.value.p = &buffer_desc,
-}
-};
+int err, i, j;
 
 desc = (AVDRMFrameDescriptor*)src->data[0];
 
@@ -1083,35 +1071,117 @@ static int vaapi_map_from_drm(AVHWFramesContext 
*src_fc, AVFrame *dst,
 format_desc = vaapi_format_from_fourcc(va_fourcc);
 av_assert0(format_desc);
 
-buffer_handle = desc->objects[0].fd;
-buffer_desc.pixel_format = va_fourcc;
-buffer_desc.width= src_fc->width;
-buffer_desc.height   = src_fc->height;
-buffer_desc.data_size= desc->objects[0].size;
-buffer_desc.buffers  = &buffer_handle;
-buffer_desc.num_buffers  = 1;
-buffer_desc.flags= 0;
-
-k = 0;
-for (i = 0; i < desc->nb_layers; i++) {
-for (j = 0; j < desc->layers[i].nb_planes; j++) {
-buffer_desc.pitches[k] = desc->layers[i].planes[j].pitch;
-buffer_desc.offsets[k] = desc->layers[i].planes[j].offset;
-++k;
+use_prime2 = !src_vafc->prime_2_import_unsupported &&
+ desc->objects[0].format_modifier != DRM_FORMAT_MOD_INVALID;
+if (use_prime2) {
+VADRMPRIMESurfaceDescriptor prime_desc;
+VASurfaceAttrib prime_attrs[2] = {
+{
+.type  = VASurfaceAttribMemoryType,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypeInteger,
+.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2,
+},
+{
+.type  = VASurfaceAttribExternalBufferDescriptor,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypePointer,
+.value.value.p = &prime_desc,
+}
+};
+prime_desc.fourcc = va_fourcc;
+prime_desc.width = src_fc->width;
+prime_desc.height = src_fc->height;
+prime_desc.num_objects = desc->nb_objects;
+for (i = 0; i < desc->nb_objects; ++i) {
+prime_desc.objects[i].fd = desc->objects[i].fd;
+prime_desc.objects[i].size = desc->objects[i].size;
+prime_desc.objects[i].drm_format_modifier =
+desc->objects[i].format_modifier;
 }
-}
-buffer_desc.num_planes = k;
 
-if (format_desc->chroma_planes

[FFmpeg-devel] [PATCH V4 2/5] libavutil/hwcontext_vaapi: Add a new nv12 format map to support vulkan frame

2021-11-29 Thread Wenbin Chen
Vulkan will map nv12 to R8 and GR88, so add this map to vaapi to support
vulkan frame.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vaapi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 75acc851d6..994b744e4d 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -992,6 +992,7 @@ static const struct {
 } vaapi_drm_format_map[] = {
 #ifdef DRM_FORMAT_R8
 DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_RG88),
+DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_GR88),
 #endif
 DRM_MAP(NV12, 1, DRM_FORMAT_NV12),
 #if defined(VA_FOURCC_P010) && defined(DRM_FORMAT_R16)
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V4 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-11-29 Thread Wenbin Chen
The vaapi can import external frames, but the planes of an external
frame should be in the same drm object. A new option "contiguous_planes"
is added to device. This flag tells device to allocate planes in one
memory. When device is derived from vaapi this flag will be enabled.
A new flag frame_flag is also added to AVVulkanFramesContext. User
can use this flag to force enable or disable this behaviour.
A new variable "offset" is added to AVVkFrame. It describes the
offset from the memory currently bound to the VkImage.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 68 +++-
 libavutil/hwcontext_vulkan.h | 24 +
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index a0437c9661..eef9009ae1 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -103,8 +103,14 @@ typedef struct VulkanDevicePriv {
 /* Settings */
 int use_linear_images;
 
+/* allocate planes in a contiguous memory */
+int contiguous_planes;
+
 /* Nvidia */
 int dev_is_nvidia;
+
+/* Intel */
+int dev_is_intel;
 } VulkanDevicePriv;
 
 typedef struct VulkanFramesPriv {
@@ -153,6 +159,8 @@ typedef struct AVVkFrameInternal {
 av_free((void *)props);
\
 }
 
+#define VKF_FLAG(x, f) (((x) & (~AV_VK_FRAME_FLAG_NONE)) & (f))
+
 static const struct {
 enum AVPixelFormat pixfmt;
 const VkFormat vkfmts[4];
@@ -1374,6 +1382,13 @@ static int 
vulkan_device_create_internal(AVHWDeviceContext *ctx,
 if (opt_d)
 p->use_linear_images = strtol(opt_d->value, NULL, 10);
 
+opt_d = av_dict_get(opts, "contiguous_planes", NULL, 0);
+if (opt_d)
+p->contiguous_planes = strtol(opt_d->value, NULL, 10);
+else
+p->contiguous_planes = -1;
+
+
 hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
 hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
 
@@ -1425,6 +1440,8 @@ static int vulkan_device_init(AVHWDeviceContext *ctx)
 
 p->dev_is_nvidia = (p->props.properties.vendorID == 0x10de);
 
+p->dev_is_intel = (p->props.properties.vendorID == 0x8086);
+
 vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &queue_num, 
NULL);
 if (!queue_num) {
 av_log(ctx, AV_LOG_ERROR, "Failed to get queues!\n");
@@ -1742,8 +1759,12 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 AVHWDeviceContext *ctx = hwfc->device_ctx;
 VulkanDevicePriv *p = ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
+AVVulkanFramesContext *hwfctx = hwfc->hwctx;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
+VkMemoryRequirements memory_requirements = { 0 };
+int mem_size = 0;
+int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
 
 AVVulkanDeviceContext *hwctx = ctx->hwctx;
 
@@ -1771,6 +1792,19 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 req.memoryRequirements.size = FFALIGN(req.memoryRequirements.size,
   
p->props.properties.limits.minMemoryMapAlignment);
 
+if (VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
+if (memory_requirements.size == 0) {
+memory_requirements = req.memoryRequirements;
+} else if (memory_requirements.memoryTypeBits != 
req.memoryRequirements.memoryTypeBits) {
+av_log(hwfc, AV_LOG_ERROR, "the param for each planes are not 
the same\n");
+return AVERROR(EINVAL);
+}
+
+mem_size_list[i] = req.memoryRequirements.size;
+mem_size += mem_size_list[i];
+continue;
+}
+
 /* In case the implementation prefers/requires dedicated allocation */
 use_ded_mem = ded_req.prefersDedicatedAllocation |
   ded_req.requiresDedicatedAllocation;
@@ -1792,6 +1826,29 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 bind_info[i].memory = f->mem[i];
 }
 
+if (VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
+memory_requirements.size = mem_size;
+
+/* Allocate memory */
+if ((err = alloc_mem(ctx, &memory_requirements,
+f->tiling == VK_IMAGE_TILING_LINEAR ?
+VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT :
+VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+(void *)(((uint8_t *)alloc_pnext)),
+&f->flags, &f->mem[0])))
+return err;
+
+f

[FFmpeg-devel] [PATCH V4 4/5] libavutil/hwcontext_vulkan: Add support to hwmap to software frame when using contiguous_planes flag.

2021-11-29 Thread Wenbin Chen
Add support to map vulkan frames to software frames when
using contiguous_planes flag.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index eef9009ae1..f980b72720 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -2327,9 +2327,10 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
const AVFrame *src, int flags)
 {
 VkResult ret;
-int err, mapped_mem_count = 0;
+int err, mapped_mem_count = 0, loop = 0;
 AVVkFrame *f = (AVVkFrame *)src->data[0];
 AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
+AVVulkanFramesContext *hwfctx = hwfc->hwctx;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
@@ -2356,7 +2357,9 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 dst->width  = src->width;
 dst->height = src->height;
 
-for (int i = 0; i < planes; i++) {
+loop = VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY) ?
+   1 : planes;
+for (int i = 0; i < loop; i++) {
 ret = vk->MapMemory(hwctx->act_dev, f->mem[i], 0,
 VK_WHOLE_SIZE, 0, (void **)&dst->data[i]);
 if (ret != VK_SUCCESS) {
@@ -2367,6 +2370,10 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 }
 mapped_mem_count++;
 }
+if (VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
+for (int i = 0; i < planes; i++)
+dst->data[i] = dst->data[0] + f->offset[i];
+}
 
 /* Check if the memory contents matter */
 if (((flags & AV_HWFRAME_MAP_READ) || !(flags & AV_HWFRAME_MAP_OVERWRITE)) 
&&
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V4 5/5] libavutil/hwcontext_vulkan: specify the modifier to create VKImage

2021-11-29 Thread Wenbin Chen
When a vulkan image is exported to drm, the tiling needs to be
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT. Now add code to create the
vulkan image using this tiling.

Now the following command line works:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 
-hwaccel_output_format \
vaapi -i input_1080p.264 -vf "hwmap=derive_device=vulkan,format=vulkan, \
scale_vulkan=1920:1080,hwmap=derive_device=vaapi,format=vaapi" -c:v h264_vaapi 
output.264

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 127 +--
 1 file changed, 121 insertions(+), 6 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index f980b72720..8224c0d4e4 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -120,6 +120,9 @@ typedef struct VulkanFramesPriv {
 /* Image transfers */
 VulkanExecCtx upload_ctx;
 VulkanExecCtx download_ctx;
+
+/*modifier info*/
+VkImageDrmFormatModifierListCreateInfoEXT *modifier_info;
 } VulkanFramesPriv;
 
 typedef struct AVVkFrameInternal {
@@ -242,6 +245,28 @@ const VkFormat *av_vkfmt_from_pixfmt(enum AVPixelFormat p)
 return NULL;
 }
 
+static void *find_in_structure_list(VkBaseOutStructure *stru_list, 
VkStructureType sType) {
+if (!stru_list)
+return NULL;
+
+for(;stru_list;stru_list = stru_list->pNext)
+if (stru_list->sType == sType)
+return stru_list;
+
+return NULL;
+}
+
+static void append_to_structure_list(VkBaseOutStructure **stru_list, 
VkBaseOutStructure *added_stru) {
+VkBaseOutStructure *p;
+if (!*stru_list) {
+*stru_list = added_stru;
+return;
+}
+for(p = *stru_list; p->pNext; p = p->pNext);
+p->pNext = added_stru;
+return;
+}
+
 static int pixfmt_is_supported(AVHWDeviceContext *dev_ctx, enum AVPixelFormat 
p,
int linear)
 {
@@ -2094,6 +2119,10 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
+const int has_modifiers = hwctx->tiling == 
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+int loop_count;
+VkImageDrmFormatModifierListCreateInfoEXT *modifier_info = 
find_in_structure_list(hwctx->create_pnext,
+
VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT);
 VkExternalImageFormatProperties eprops = {
 .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
 };
@@ -2101,9 +2130,18 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2,
 .pNext = &eprops,
 };
+VkPhysicalDeviceImageDrmFormatModifierInfoEXT phy_dev_mod_info = {
+.sType = 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+.pNext = NULL,
+.pQueueFamilyIndices   = p->qfs,
+.queueFamilyIndexCount = p->num_qfs,
+.sharingMode   = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+  VK_SHARING_MODE_EXCLUSIVE,
+};
 VkPhysicalDeviceExternalImageFormatInfo enext = {
 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO,
 .handleType = exp,
+.pNext = has_modifiers && modifier_info ? &phy_dev_mod_info : NULL,
 };
 VkPhysicalDeviceImageFormatInfo2 pinfo = {
 .sType  = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
@@ -2115,11 +2153,16 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .flags  = VK_IMAGE_CREATE_ALIAS_BIT,
 };
 
-ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
-  &pinfo, &props);
-if (ret == VK_SUCCESS) {
-*iexp |= exp;
-*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+loop_count = has_modifiers && modifier_info ? 
modifier_info->drmFormatModifierCount : 1;
+for (int i = 0; i < loop_count; i++) {
+if (has_modifiers && modifier_info)
+phy_dev_mod_info.drmFormatModifier = 
modifier_info->pDrmFormatModifiers[i];
+ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
+&pinfo, &props);
+if (ret == VK_SUCCESS) {
+*iexp |= exp;
+*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+}
 }
 }
 
@@ -2190,6 +2233,12 @@ static void vulkan_frames_uninit(AVHWFramesContext *hwfc)
 {
 VulkanFramesPriv *fp = hwfc->internal->priv;
 
+if (fp->modifier_info) {
+if (fp->modifier_info->pDrmFormatModifiers)
+

Re: [FFmpeg-devel] [PATCH V4 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-11-29 Thread Chen, Wenbin
> The vaapi can import external frames, but the planes of an external
> frame should be in the same drm object. A new option "contiguous_planes"
> is added to device. This flag tells device to allocate planes in one
> memory. When device is derived from vaapi this flag will be enabled.
> A new flag frame_flag is also added to AVVulkanFramesContext. User
> can use this flag to force enable or disable this behaviour.
> A new variable "offset" is added to AVVkFrame. It describes the
> offset from the memory currently bound to the VkImage.
> 
> Signed-off-by: Wenbin Chen 
> ---
>  libavutil/hwcontext_vulkan.c | 68
> +++-
>  libavutil/hwcontext_vulkan.h | 24 +
>  2 files changed, 91 insertions(+), 1 deletion(-)
> 
> diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
> index a0437c9661..eef9009ae1 100644
> --- a/libavutil/hwcontext_vulkan.c
> +++ b/libavutil/hwcontext_vulkan.c
> @@ -103,8 +103,14 @@ typedef struct VulkanDevicePriv {
>  /* Settings */
>  int use_linear_images;
> 
> +/* allocate planes in a contiguous memory */
> +int contiguous_planes;
> +
>  /* Nvidia */
>  int dev_is_nvidia;
> +
> +/* Intel */
> +int dev_is_intel;
>  } VulkanDevicePriv;
> 
>  typedef struct VulkanFramesPriv {
> @@ -153,6 +159,8 @@ typedef struct AVVkFrameInternal {
>  av_free((void *)props);  
>   \
>  }
> 
> +#define VKF_FLAG(x, f) (((x) & (~AV_VK_FRAME_FLAG_NONE)) & (f))
> +
>  static const struct {
>  enum AVPixelFormat pixfmt;
>  const VkFormat vkfmts[4];
> @@ -1374,6 +1382,13 @@ static int
> vulkan_device_create_internal(AVHWDeviceContext *ctx,
>  if (opt_d)
>  p->use_linear_images = strtol(opt_d->value, NULL, 10);
> 
> +opt_d = av_dict_get(opts, "contiguous_planes", NULL, 0);
> +if (opt_d)
> +p->contiguous_planes = strtol(opt_d->value, NULL, 10);
> +else
> +p->contiguous_planes = -1;
> +
> +
>  hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
>  hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
> 
> @@ -1425,6 +1440,8 @@ static int vulkan_device_init(AVHWDeviceContext
> *ctx)
> 
>  p->dev_is_nvidia = (p->props.properties.vendorID == 0x10de);
> 
> +p->dev_is_intel = (p->props.properties.vendorID == 0x8086);
> +
>  vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev,
> &queue_num, NULL);
>  if (!queue_num) {
>  av_log(ctx, AV_LOG_ERROR, "Failed to get queues!\n");
> @@ -1742,8 +1759,12 @@ static int alloc_bind_mem(AVHWFramesContext
> *hwfc, AVVkFrame *f,
>  AVHWDeviceContext *ctx = hwfc->device_ctx;
>  VulkanDevicePriv *p = ctx->internal->priv;
>  FFVulkanFunctions *vk = &p->vkfn;
> +AVVulkanFramesContext *hwfctx = hwfc->hwctx;
>  const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
>  VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
> +VkMemoryRequirements memory_requirements = { 0 };
> +int mem_size = 0;
> +int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
> 
>  AVVulkanDeviceContext *hwctx = ctx->hwctx;
> 
> @@ -1771,6 +1792,19 @@ static int alloc_bind_mem(AVHWFramesContext
> *hwfc, AVVkFrame *f,
>  req.memoryRequirements.size =
> FFALIGN(req.memoryRequirements.size,
>p-
> >props.properties.limits.minMemoryMapAlignment);
> 
> +if (VKF_FLAG(hwfctx->flags,
> AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
> +if (memory_requirements.size == 0) {
> +memory_requirements = req.memoryRequirements;
> +} else if (memory_requirements.memoryTypeBits !=
> req.memoryRequirements.memoryTypeBits) {
> +av_log(hwfc, AV_LOG_ERROR, "the param for each planes are not
> the same\n");
> +return AVERROR(EINVAL);
> +}
> +
> +mem_size_list[i] = req.memoryRequirements.size;
> +mem_size += mem_size_list[i];
> +continue;
> +}
> +
>  /* In case the implementation prefers/requires dedicated allocation 
> */
>  use_ded_mem = ded_req.prefersDedicatedAllocation |
>ded_req.requiresDedicatedAllocation;
> @@ -1792,6 +1826,29 @@ static int alloc_bind_mem(AVHWFramesContext
> *hwfc, AVVkFrame *f,
>  bind_info[i].memory = f->mem[i];
>  }
> 
> +if (VKF_FLAG(hwfctx->flags,
> AV_VK_FRAME_FL

[FFmpeg-devel] [PATCH V2 1/3] libavcodec/vaapi_decode: fix the problem that init_pool_size < nb_surface

2021-11-30 Thread Wenbin Chen
For vaapi, if the init_pool_size is not zero, the pool size is fixed.
This means the maximum number of surfaces is init_pool_size, but when
mapping a vaapi frame to a qsv frame, init_pool_size < nb_surface. The
cause is that vaapi_decode_make_config() configures init_pool_size and
is called twice. The first call initializes the frame context and the
second initializes the codec. On the second call init_pool_size is reset
to its original value, which is lower than the real size because the
pool size used to initialize the frame context additionally includes
thread_count and 3 (to guarantee 4 base work surfaces). Now add code to
make sure init_pool_size is only set once. Now the following command
line works:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
-hwaccel_output_format vaapi -i input.264 \
-vf "hwmap=derive_device=qsv,format=qsv" \
-c:v h264_qsv output.264

Signed-off-by: Wenbin Chen 
---
 libavcodec/vaapi_decode.c | 34 ++
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/libavcodec/vaapi_decode.c b/libavcodec/vaapi_decode.c
index 665af370ed..aab8162989 100644
--- a/libavcodec/vaapi_decode.c
+++ b/libavcodec/vaapi_decode.c
@@ -572,22 +572,24 @@ static int vaapi_decode_make_config(AVCodecContext *avctx,
 if (err < 0)
 goto fail;
 
-frames->initial_pool_size = 1;
-// Add per-codec number of surfaces used for storing reference frames.
-switch (avctx->codec_id) {
-case AV_CODEC_ID_H264:
-case AV_CODEC_ID_HEVC:
-case AV_CODEC_ID_AV1:
-frames->initial_pool_size += 16;
-break;
-case AV_CODEC_ID_VP9:
-frames->initial_pool_size += 8;
-break;
-case AV_CODEC_ID_VP8:
-frames->initial_pool_size += 3;
-break;
-default:
-frames->initial_pool_size += 2;
+if (!frames->initial_pool_size) {
+frames->initial_pool_size = 1;
+// Add per-codec number of surfaces used for storing reference 
frames.
+switch (avctx->codec_id) {
+case AV_CODEC_ID_H264:
+case AV_CODEC_ID_HEVC:
+case AV_CODEC_ID_AV1:
+frames->initial_pool_size += 16;
+break;
+case AV_CODEC_ID_VP9:
+frames->initial_pool_size += 8;
+break;
+case AV_CODEC_ID_VP8:
+frames->initial_pool_size += 3;
+break;
+default:
+frames->initial_pool_size += 2;
+}
 }
 }
 
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V2 2/3] libavutil/hwcontext_qsv: fix a bug for mapping vaapi frame to qsv

2021-11-30 Thread Wenbin Chen
From: nyanmisaka 

The data stored in data[3] of a VAAPI AVFrame is a VASurfaceID, while
the data stored in pair->first is a pointer to a VASurfaceID, so
we need a cast to make the following command line work:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 \
-hwaccel_output_format vaapi -i input.264 \
-vf "hwmap=derive_device=qsv,format=qsv" -c:v h264_qsv output.264

Signed-off-by: nyanmisaka 
Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_qsv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_qsv.c b/libavutil/hwcontext_qsv.c
index 268be9f8a1..a5d154a24d 100644
--- a/libavutil/hwcontext_qsv.c
+++ b/libavutil/hwcontext_qsv.c
@@ -1218,7 +1218,7 @@ static int qsv_map_to(AVHWFramesContext *dst_ctx,
 case AV_PIX_FMT_VAAPI:
 {
 mfxHDLPair *pair = (mfxHDLPair*)hwctx->surfaces[i].Data.MemId;
-if (pair->first == src->data[3]) {
+if (*(VASurfaceID*)pair->first == (VASurfaceID)src->data[3]) {
 index = i;
 break;
 }
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V2 3/3] libavutil/hwcontext_opencl: fix a bug for mapping qsv frame to opencl

2021-11-30 Thread Wenbin Chen
From: nyanmisaka 

mfxHDLPair was added to qsv, so modify qsv->opencl map function as well.
Now the following commandline works:

ffmpeg -v verbose -init_hw_device vaapi=va:/dev/dri/renderD128 \
-init_hw_device qsv=qs@va -init_hw_device opencl=ocl@va -filter_hw_device ocl \
-hwaccel qsv -hwaccel_output_format qsv -hwaccel_device qs -c:v h264_qsv \
-i input.264 -vf "hwmap=derive_device=opencl,format=opencl,avgblur_opencl, \
hwmap=derive_device=qsv:reverse=1:extra_hw_frames=32,format=qsv" \
-c:v h264_qsv output.264

Signed-off-by: nyanmisaka 
Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_opencl.c | 3 ++-
 libavutil/hwcontext_qsv.h| 5 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_opencl.c b/libavutil/hwcontext_opencl.c
index 26a3a24593..ab812999cd 100644
--- a/libavutil/hwcontext_opencl.c
+++ b/libavutil/hwcontext_opencl.c
@@ -48,6 +48,7 @@
 #if HAVE_OPENCL_VAAPI_INTEL_MEDIA
 #if CONFIG_LIBMFX
 #include 
+#include "hwcontext_qsv.h"
 #endif
 #include 
 #include 
@@ -2249,7 +2250,7 @@ static int opencl_map_from_qsv(AVHWFramesContext *dst_fc, 
AVFrame *dst,
 #if CONFIG_LIBMFX
 if (src->format == AV_PIX_FMT_QSV) {
 mfxFrameSurface1 *mfx_surface = (mfxFrameSurface1*)src->data[3];
-va_surface = *(VASurfaceID*)mfx_surface->Data.MemId;
+va_surface = *MFXSURFACEP_TO_VASURFACEP(mfx_surface);
 } else
 #endif
 if (src->format == AV_PIX_FMT_VAAPI) {
diff --git a/libavutil/hwcontext_qsv.h b/libavutil/hwcontext_qsv.h
index b98d611cfc..957df01ef1 100644
--- a/libavutil/hwcontext_qsv.h
+++ b/libavutil/hwcontext_qsv.h
@@ -29,6 +29,11 @@
  * contain AVBufferRefs whose data pointer points to an mfxFrameSurface1 
struct.
  */
 
+#if CONFIG_VAAPI
+#define MFXSURFACEP_TO_VASURFACEP(surf) \
+(VASurfaceID*)(((mfxHDLPair*)surf->Data.MemId)->first)
+#endif
+
 /**
  * This struct is allocated as AVHWDeviceContext.hwctx
  */
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH V4 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-12-01 Thread Chen, Wenbin
> Quoting Wenbin Chen (2021-11-30 07:28:13)
> > diff --git a/libavutil/hwcontext_vulkan.h b/libavutil/hwcontext_vulkan.h
> > index fdf2a60156..c485ee7437 100644
> > --- a/libavutil/hwcontext_vulkan.h
> > +++ b/libavutil/hwcontext_vulkan.h
> > @@ -35,6 +35,17 @@
> >   * with the data pointer set to an AVVkFrame.
> >   */
> >
> > +/**
> > + * Defines the behaviour of frame allocation
> > + * AV_VK_FRAME_FLAG_NONE: planes will be allocated in separte
> memory
> > + * AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY: planes will be allocated
> in a
> > + * contiguous memory.
> > + */
> > +typedef enum {
> > +AV_VK_FRAME_FLAG_NONE = (1ULL << 0),
> > +AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY = (1ULL << 1) | 1ULL
> > +} AVVkFrameFlags;
> > +
> >  /**
> >   * Main Vulkan context, allocated as AVHWDeviceContext.hwctx.
> >   * All of these can be set before init to change what the context uses
> > @@ -157,6 +168,14 @@ typedef struct AVVulkanFramesContext {
> >   */
> >  void *create_pnext;
> >
> > +/**
> > + * Is a combination of AVVkFrameFlags. Defines the behaviour of frame
> > + * allocation.
> > + * If no flag is set, then the flags are automatically determined
> > + * based on the device.
> > + */
> > +int flags;
> > +
> >  /**
> >   * Extension data for memory allocation. Must have as many entries as
> >   * the number of planes of the sw_format.
> > @@ -198,6 +217,11 @@ typedef struct AVVkFrame {
> >  VkDeviceMemory mem[AV_NUM_DATA_POINTERS];
> >  size_t size[AV_NUM_DATA_POINTERS];
> >
> > +/**
> > + * Describe the offset from the memory currently bound to the VkImage.
> > + */
> > +size_t offset[AV_NUM_DATA_POINTERS];
> > +
> 
> These are public structs, you have to add any new fields at the end or
> you will break ABI compatibility.
> 
> --
> Anton Khirnov

Will resubmit the patchset. Thanks

Wenbin

> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V5 1/5] hwcontext_vaapi: Use PRIME_2 memory type for modifiers.

2021-12-01 Thread Wenbin Chen
From: Bas Nieuwenhuizen 

This way we can pass explicit modifiers in. Sometimes the
modifier matters for the number of memory planes that
libva accepts, in particular when dealing with
driver-compressed textures. Furthermore the driver might
not actually be able to determine the implicit modifier
if all the buffer-passing has used explicit modifier.
All these issues should be resolved by passing in the
modifier, and for that we switch to using the PRIME_2
memory type.

Tested with experimental radeonsi patches for modifiers
and kmsgrab. Also tested with radeonsi without the
patches to double-check it works without PRIME_2 support.

v2:
  Cache PRIME_2 support to avoid doing two calls every time on
  libva drivers that do not support it.

v3:
  Remove prime2_vas usage.

Signed-off-by: Bas Nieuwenhuizen 
---
 libavutil/hwcontext_vaapi.c | 158 ++--
 1 file changed, 114 insertions(+), 44 deletions(-)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 83e542876d..75acc851d6 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -79,6 +79,9 @@ typedef struct VAAPIFramesContext {
 unsigned int rt_format;
 // Whether vaDeriveImage works.
 int derive_works;
+// Caches whether VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2 is unsupported for
+// surface imports.
+int prime_2_import_unsupported;
 } VAAPIFramesContext;
 
 typedef struct VAAPIMapping {
@@ -1022,32 +1025,17 @@ static void vaapi_unmap_from_drm(AVHWFramesContext 
*dst_fc,
 static int vaapi_map_from_drm(AVHWFramesContext *src_fc, AVFrame *dst,
   const AVFrame *src, int flags)
 {
+VAAPIFramesContext *src_vafc = src_fc->internal->priv;
 AVHWFramesContext  *dst_fc =
 (AVHWFramesContext*)dst->hw_frames_ctx->data;
 AVVAAPIDeviceContext  *dst_dev = dst_fc->device_ctx->hwctx;
 const AVDRMFrameDescriptor *desc;
 const VAAPIFormatDescriptor *format_desc;
 VASurfaceID surface_id;
-VAStatus vas;
+VAStatus vas = VA_STATUS_SUCCESS;
+int use_prime2;
 uint32_t va_fourcc;
-int err, i, j, k;
-
-unsigned long buffer_handle;
-VASurfaceAttribExternalBuffers buffer_desc;
-VASurfaceAttrib attrs[2] = {
-{
-.type  = VASurfaceAttribMemoryType,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypeInteger,
-.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME,
-},
-{
-.type  = VASurfaceAttribExternalBufferDescriptor,
-.flags = VA_SURFACE_ATTRIB_SETTABLE,
-.value.type= VAGenericValueTypePointer,
-.value.value.p = &buffer_desc,
-}
-};
+int err, i, j;
 
 desc = (AVDRMFrameDescriptor*)src->data[0];
 
@@ -1083,35 +1071,117 @@ static int vaapi_map_from_drm(AVHWFramesContext 
*src_fc, AVFrame *dst,
 format_desc = vaapi_format_from_fourcc(va_fourcc);
 av_assert0(format_desc);
 
-buffer_handle = desc->objects[0].fd;
-buffer_desc.pixel_format = va_fourcc;
-buffer_desc.width= src_fc->width;
-buffer_desc.height   = src_fc->height;
-buffer_desc.data_size= desc->objects[0].size;
-buffer_desc.buffers  = &buffer_handle;
-buffer_desc.num_buffers  = 1;
-buffer_desc.flags= 0;
-
-k = 0;
-for (i = 0; i < desc->nb_layers; i++) {
-for (j = 0; j < desc->layers[i].nb_planes; j++) {
-buffer_desc.pitches[k] = desc->layers[i].planes[j].pitch;
-buffer_desc.offsets[k] = desc->layers[i].planes[j].offset;
-++k;
+use_prime2 = !src_vafc->prime_2_import_unsupported &&
+ desc->objects[0].format_modifier != DRM_FORMAT_MOD_INVALID;
+if (use_prime2) {
+VADRMPRIMESurfaceDescriptor prime_desc;
+VASurfaceAttrib prime_attrs[2] = {
+{
+.type  = VASurfaceAttribMemoryType,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypeInteger,
+.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2,
+},
+{
+.type  = VASurfaceAttribExternalBufferDescriptor,
+.flags = VA_SURFACE_ATTRIB_SETTABLE,
+.value.type= VAGenericValueTypePointer,
+.value.value.p = &prime_desc,
+}
+};
+prime_desc.fourcc = va_fourcc;
+prime_desc.width = src_fc->width;
+prime_desc.height = src_fc->height;
+prime_desc.num_objects = desc->nb_objects;
+for (i = 0; i < desc->nb_objects; ++i) {
+prime_desc.objects[i].fd = desc->objects[i].fd;
+prime_desc.objects[i].size = desc->objects[i].size;
+prime_desc.objects[i].drm_format_modifier =
+desc->objects[i].format_modifier;
 }
-}
-buffer_desc.num_planes = k;
 
-if (format_desc->chroma_planes

[FFmpeg-devel] [PATCH V5 2/5] libavutil/hwcontext_vaapi: Add a new nv12 format map to support vulkan frame

2021-12-01 Thread Wenbin Chen
Vulkan will map nv12 to R8 and GR88, so add this map to vaapi to support
vulkan frame.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vaapi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/hwcontext_vaapi.c b/libavutil/hwcontext_vaapi.c
index 75acc851d6..994b744e4d 100644
--- a/libavutil/hwcontext_vaapi.c
+++ b/libavutil/hwcontext_vaapi.c
@@ -992,6 +992,7 @@ static const struct {
 } vaapi_drm_format_map[] = {
 #ifdef DRM_FORMAT_R8
 DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_RG88),
+DRM_MAP(NV12, 2, DRM_FORMAT_R8,  DRM_FORMAT_GR88),
 #endif
 DRM_MAP(NV12, 1, DRM_FORMAT_NV12),
 #if defined(VA_FOURCC_P010) && defined(DRM_FORMAT_R16)
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V5 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-12-01 Thread Wenbin Chen
The vaapi can import external frames, but the planes of an external
frame should be in the same drm object. A new option "contiguous_planes"
is added to device. This flag tells device to allocate planes in one
memory. When device is derived from vaapi this flag will be enabled.
A new flag frame_flag is also added to AVVulkanFramesContext. User
can use this flag to force enable or disable this behaviour.
A new variable "offset" is added to AVVkFrame. It describes the
offset from the memory currently bound to the VkImage.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 68 +++-
 libavutil/hwcontext_vulkan.h | 24 +
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index a0437c9661..eef9009ae1 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -103,8 +103,14 @@ typedef struct VulkanDevicePriv {
 /* Settings */
 int use_linear_images;
 
+/* allocate planes in a contiguous memory */
+int contiguous_planes;
+
 /* Nvidia */
 int dev_is_nvidia;
+
+/* Intel */
+int dev_is_intel;
 } VulkanDevicePriv;
 
 typedef struct VulkanFramesPriv {
@@ -153,6 +159,8 @@ typedef struct AVVkFrameInternal {
 av_free((void *)props);
\
 }
 
+#define VKF_FLAG(x, f) (((x) & (~AV_VK_FRAME_FLAG_NONE)) & (f))
+
 static const struct {
 enum AVPixelFormat pixfmt;
 const VkFormat vkfmts[4];
@@ -1374,6 +1382,13 @@ static int 
vulkan_device_create_internal(AVHWDeviceContext *ctx,
 if (opt_d)
 p->use_linear_images = strtol(opt_d->value, NULL, 10);
 
+opt_d = av_dict_get(opts, "contiguous_planes", NULL, 0);
+if (opt_d)
+p->contiguous_planes = strtol(opt_d->value, NULL, 10);
+else
+p->contiguous_planes = -1;
+
+
 hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
 hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
 
@@ -1425,6 +1440,8 @@ static int vulkan_device_init(AVHWDeviceContext *ctx)
 
 p->dev_is_nvidia = (p->props.properties.vendorID == 0x10de);
 
+p->dev_is_intel = (p->props.properties.vendorID == 0x8086);
+
 vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &queue_num, 
NULL);
 if (!queue_num) {
 av_log(ctx, AV_LOG_ERROR, "Failed to get queues!\n");
@@ -1742,8 +1759,12 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 AVHWDeviceContext *ctx = hwfc->device_ctx;
 VulkanDevicePriv *p = ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
+AVVulkanFramesContext *hwfctx = hwfc->hwctx;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
+VkMemoryRequirements memory_requirements = { 0 };
+int mem_size = 0;
+int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
 
 AVVulkanDeviceContext *hwctx = ctx->hwctx;
 
@@ -1771,6 +1792,19 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 req.memoryRequirements.size = FFALIGN(req.memoryRequirements.size,
   
p->props.properties.limits.minMemoryMapAlignment);
 
+if (VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
+if (memory_requirements.size == 0) {
+memory_requirements = req.memoryRequirements;
+} else if (memory_requirements.memoryTypeBits != 
req.memoryRequirements.memoryTypeBits) {
+av_log(hwfc, AV_LOG_ERROR, "the param for each planes are not 
the same\n");
+return AVERROR(EINVAL);
+}
+
+mem_size_list[i] = req.memoryRequirements.size;
+mem_size += mem_size_list[i];
+continue;
+}
+
 /* In case the implementation prefers/requires dedicated allocation */
 use_ded_mem = ded_req.prefersDedicatedAllocation |
   ded_req.requiresDedicatedAllocation;
@@ -1792,6 +1826,29 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, 
AVVkFrame *f,
 bind_info[i].memory = f->mem[i];
 }
 
+if (VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
+memory_requirements.size = mem_size;
+
+/* Allocate memory */
+if ((err = alloc_mem(ctx, &memory_requirements,
+f->tiling == VK_IMAGE_TILING_LINEAR ?
+VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT :
+VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+(void *)(((uint8_t *)alloc_pnext)),
+&f->flags, &f->mem[0])))
+return err;
+
+f

[FFmpeg-devel] [PATCH V5 4/5] libavutil/hwcontext_vulkan: Add support to hwmap to software frame when using contiguous_planes flag.

2021-12-01 Thread Wenbin Chen
Add support to map vulkan frames to software frames when
using contiguous_planes flag.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index eef9009ae1..f980b72720 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -2327,9 +2327,10 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
const AVFrame *src, int flags)
 {
 VkResult ret;
-int err, mapped_mem_count = 0;
+int err, mapped_mem_count = 0, loop = 0;
 AVVkFrame *f = (AVVkFrame *)src->data[0];
 AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
+AVVulkanFramesContext *hwfctx = hwfc->hwctx;
 const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
@@ -2356,7 +2357,9 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 dst->width  = src->width;
 dst->height = src->height;
 
-for (int i = 0; i < planes; i++) {
+loop = VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY) ?
+   1 : planes;
+for (int i = 0; i < loop; i++) {
 ret = vk->MapMemory(hwctx->act_dev, f->mem[i], 0,
 VK_WHOLE_SIZE, 0, (void **)&dst->data[i]);
 if (ret != VK_SUCCESS) {
@@ -2367,6 +2370,10 @@ static int vulkan_map_frame_to_mem(AVHWFramesContext 
*hwfc, AVFrame *dst,
 }
 mapped_mem_count++;
 }
+if (VKF_FLAG(hwfctx->flags, AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
+for (int i = 0; i < planes; i++)
+dst->data[i] = dst->data[0] + f->offset[i];
+}
 
 /* Check if the memory contents matter */
 if (((flags & AV_HWFRAME_MAP_READ) || !(flags & AV_HWFRAME_MAP_OVERWRITE)) 
&&
-- 
2.25.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH V5 5/5] libavutil/hwcontext_vulkan: specify the modifier to create VKImage

2021-12-01 Thread Wenbin Chen
When a Vulkan image is exported to DRM, the tiling needs to be
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT. Add code to create the Vulkan
image using this tiling.

Now the following command line works:

ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 
-hwaccel_output_format \
vaapi -i input_1080p.264 -vf "hwmap=derive_device=vulkan,format=vulkan, \
scale_vulkan=1920:1080,hwmap=derive_device=vaapi,format=vaapi" -c:v h264_vaapi 
output.264

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_vulkan.c | 127 +--
 1 file changed, 121 insertions(+), 6 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index f980b72720..8224c0d4e4 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -120,6 +120,9 @@ typedef struct VulkanFramesPriv {
 /* Image transfers */
 VulkanExecCtx upload_ctx;
 VulkanExecCtx download_ctx;
+
+/*modifier info*/
+VkImageDrmFormatModifierListCreateInfoEXT *modifier_info;
 } VulkanFramesPriv;
 
 typedef struct AVVkFrameInternal {
@@ -242,6 +245,28 @@ const VkFormat *av_vkfmt_from_pixfmt(enum AVPixelFormat p)
 return NULL;
 }
 
+static void *find_in_structure_list(VkBaseOutStructure *stru_list, 
VkStructureType sType) {
+if (!stru_list)
+return NULL;
+
+for(;stru_list;stru_list = stru_list->pNext)
+if (stru_list->sType == sType)
+return stru_list;
+
+return NULL;
+}
+
+static void append_to_structure_list(VkBaseOutStructure **stru_list, 
VkBaseOutStructure *added_stru) {
+VkBaseOutStructure *p;
+if (!*stru_list) {
+*stru_list = added_stru;
+return;
+}
+for(p = *stru_list; p->pNext; p = p->pNext);
+p->pNext = added_stru;
+return;
+}
+
 static int pixfmt_is_supported(AVHWDeviceContext *dev_ctx, enum AVPixelFormat 
p,
int linear)
 {
@@ -2094,6 +2119,10 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
 VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 FFVulkanFunctions *vk = &p->vkfn;
+const int has_modifiers = hwctx->tiling == 
VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+int loop_count;
+VkImageDrmFormatModifierListCreateInfoEXT *modifier_info = 
find_in_structure_list(hwctx->create_pnext,
+
VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT);
 VkExternalImageFormatProperties eprops = {
 .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
 };
@@ -2101,9 +2130,18 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2,
 .pNext = &eprops,
 };
+VkPhysicalDeviceImageDrmFormatModifierInfoEXT phy_dev_mod_info = {
+.sType = 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+.pNext = NULL,
+.pQueueFamilyIndices   = p->qfs,
+.queueFamilyIndexCount = p->num_qfs,
+.sharingMode   = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+  VK_SHARING_MODE_EXCLUSIVE,
+};
 VkPhysicalDeviceExternalImageFormatInfo enext = {
 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO,
 .handleType = exp,
+.pNext = has_modifiers && modifier_info ? &phy_dev_mod_info : NULL,
 };
 VkPhysicalDeviceImageFormatInfo2 pinfo = {
 .sType  = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
@@ -2115,11 +2153,16 @@ static void try_export_flags(AVHWFramesContext *hwfc,
 .flags  = VK_IMAGE_CREATE_ALIAS_BIT,
 };
 
-ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
-  &pinfo, &props);
-if (ret == VK_SUCCESS) {
-*iexp |= exp;
-*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+loop_count = has_modifiers && modifier_info ? 
modifier_info->drmFormatModifierCount : 1;
+for (int i = 0; i < loop_count; i++) {
+if (has_modifiers && modifier_info)
+phy_dev_mod_info.drmFormatModifier = 
modifier_info->pDrmFormatModifiers[i];
+ret = vk->GetPhysicalDeviceImageFormatProperties2(dev_hwctx->phys_dev,
+&pinfo, &props);
+if (ret == VK_SUCCESS) {
+*iexp |= exp;
+*comp_handle_types |= 
eprops.externalMemoryProperties.compatibleHandleTypes;
+}
 }
 }
 
@@ -2190,6 +2233,12 @@ static void vulkan_frames_uninit(AVHWFramesContext *hwfc)
 {
 VulkanFramesPriv *fp = hwfc->internal->priv;
 
+if (fp->modifier_info) {
+if (fp->modifier_info->pDrmFormatModifiers)
+

Re: [FFmpeg-devel] [PATCH V5 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-12-01 Thread Chen, Wenbin
> The vaapi can import external frame, but the planes of the external
> frames should be in the same drm object. A new option "contiguous_planes"
> is added to device. This flag tells device to allocate places in one
> memory. When device is derived from vaapi this flag will be enabled.
> A new flag frame_flag is also added to AVVulkanFramesContext. User
> can use this flag to force enable or disable this behaviour.
> A new variable "offset "is added to AVVKFrame. It describe describe the
> offset from the memory currently bound to the VkImage.
> 
> Signed-off-by: Wenbin Chen 
> ---
>  libavutil/hwcontext_vulkan.c | 68
> +++-
>  libavutil/hwcontext_vulkan.h | 24 +
>  2 files changed, 91 insertions(+), 1 deletion(-)
> 
> diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
> index a0437c9661..eef9009ae1 100644
> --- a/libavutil/hwcontext_vulkan.c
> +++ b/libavutil/hwcontext_vulkan.c
> @@ -103,8 +103,14 @@ typedef struct VulkanDevicePriv {
>  /* Settings */
>  int use_linear_images;
> 
> +/* allocate planes in a contiguous memory */
> +int contiguous_planes;
> +
>  /* Nvidia */
>  int dev_is_nvidia;
> +
> +/* Intel */
> +int dev_is_intel;
>  } VulkanDevicePriv;
> 
>  typedef struct VulkanFramesPriv {
> @@ -153,6 +159,8 @@ typedef struct AVVkFrameInternal {
>  av_free((void *)props);  
>   \
>  }
> 
> +#define VKF_FLAG(x, f) (((x) & (~AV_VK_FRAME_FLAG_NONE)) & (f))
> +
>  static const struct {
>  enum AVPixelFormat pixfmt;
>  const VkFormat vkfmts[4];
> @@ -1374,6 +1382,13 @@ static int
> vulkan_device_create_internal(AVHWDeviceContext *ctx,
>  if (opt_d)
>  p->use_linear_images = strtol(opt_d->value, NULL, 10);
> 
> +opt_d = av_dict_get(opts, "contiguous_planes", NULL, 0);
> +if (opt_d)
> +p->contiguous_planes = strtol(opt_d->value, NULL, 10);
> +else
> +p->contiguous_planes = -1;
> +
> +
>  hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
>  hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
> 
> @@ -1425,6 +1440,8 @@ static int vulkan_device_init(AVHWDeviceContext
> *ctx)
> 
>  p->dev_is_nvidia = (p->props.properties.vendorID == 0x10de);
> 
> +p->dev_is_intel = (p->props.properties.vendorID == 0x8086);
> +
>  vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev,
> &queue_num, NULL);
>  if (!queue_num) {
>  av_log(ctx, AV_LOG_ERROR, "Failed to get queues!\n");
> @@ -1742,8 +1759,12 @@ static int alloc_bind_mem(AVHWFramesContext
> *hwfc, AVVkFrame *f,
>  AVHWDeviceContext *ctx = hwfc->device_ctx;
>  VulkanDevicePriv *p = ctx->internal->priv;
>  FFVulkanFunctions *vk = &p->vkfn;
> +AVVulkanFramesContext *hwfctx = hwfc->hwctx;
>  const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
>  VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
> +VkMemoryRequirements memory_requirements = { 0 };
> +int mem_size = 0;
> +int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
> 
>  AVVulkanDeviceContext *hwctx = ctx->hwctx;
> 
> @@ -1771,6 +1792,19 @@ static int alloc_bind_mem(AVHWFramesContext
> *hwfc, AVVkFrame *f,
>  req.memoryRequirements.size =
> FFALIGN(req.memoryRequirements.size,
>p-
> >props.properties.limits.minMemoryMapAlignment);
> 
> +if (VKF_FLAG(hwfctx->flags,
> AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
> +if (memory_requirements.size == 0) {
> +memory_requirements = req.memoryRequirements;
> +} else if (memory_requirements.memoryTypeBits !=
> req.memoryRequirements.memoryTypeBits) {
> +av_log(hwfc, AV_LOG_ERROR, "the param for each planes are not
> the same\n");
> +return AVERROR(EINVAL);
> +}
> +
> +mem_size_list[i] = req.memoryRequirements.size;
> +mem_size += mem_size_list[i];
> +continue;
> +}
> +
>  /* In case the implementation prefers/requires dedicated allocation 
> */
>  use_ded_mem = ded_req.prefersDedicatedAllocation |
>ded_req.requiresDedicatedAllocation;
> @@ -1792,6 +1826,29 @@ static int alloc_bind_mem(AVHWFramesContext
> *hwfc, AVVkFrame *f,
>  bind_info[i].memory = f->mem[i];
>  }
> 
> +if (VKF_FLAG(hwfctx->flags,
> AV_VK_FRAME_FL

[FFmpeg-devel] [PATCH] libavutil/hwcontext_qsv: clean padding when upload qsv frames

2021-12-01 Thread Wenbin Chen
When we upload a frame that is not padded as MSDK requires, we create a
new AVFrame to copy the data into. The frame's padding data is uninitialized,
so the output differs from run to run. For example, if we run the following
command several times we will get different outputs.

ffmpeg -init_hw_device qsv=qsv:hw -qsv_device /dev/dri/renderD128
-filter_hw_device qsv -f rawvideo -s 192x200 -pix_fmt p010
-i 192x200_P010.yuv -vf "format=nv12,hwupload=extra_hw_frames=16"
-c:v hevc_qsv output.265

According to 
https://github.com/Intel-Media-SDK/MediaSDK/blob/master/doc/mediasdk-man.md#encoding-procedures
"Note: It is the application's responsibility to fill pixels outside
of crop window when it is smaller than frame to be encoded. Especially
in cases when crops are not aligned to minimum coding block size (16
for AVC, 8 for HEVC and VP9)"

Add a function to fill the padding area with border pixels to fix this
run-to-run problem, and also move the new AVFrame into QSVFramesContext
to avoid redundant allocations and improve performance.

Signed-off-by: Wenbin Chen 
---
 libavutil/hwcontext_qsv.c | 96 +--
 1 file changed, 83 insertions(+), 13 deletions(-)

diff --git a/libavutil/hwcontext_qsv.c b/libavutil/hwcontext_qsv.c
index 268be9f8a1..983494666b 100644
--- a/libavutil/hwcontext_qsv.c
+++ b/libavutil/hwcontext_qsv.c
@@ -47,6 +47,7 @@
 #include "pixfmt.h"
 #include "pixdesc.h"
 #include "time.h"
+#include "imgutils.h"
 
 #define QSV_VERSION_ATLEAST(MAJOR, MINOR)   \
 (MFX_VERSION_MAJOR > (MAJOR) || \
@@ -90,6 +91,7 @@ typedef struct QSVFramesContext {
 
 mfxExtOpaqueSurfaceAlloc opaque_alloc;
 mfxExtBuffer *ext_buffers[1];
+AVFrame realigned_tmp_frame;
 } QSVFramesContext;
 
 static const struct {
@@ -137,6 +139,54 @@ static uint32_t qsv_get_d3d11va_bind_flags(int mem_type)
 }
 #endif
 
+static int qsv_fill_border(AVFrame *dst, const AVFrame *src)
+{
+const AVPixFmtDescriptor *desc;
+int i, planes_nb = 0;
+if (dst->format != src->format)
+return AVERROR(EINVAL);
+
+desc = av_pix_fmt_desc_get(dst->format);
+
+for (i = 0; i < desc->nb_components; i++)
+planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
+
+for (i = 0; i < planes_nb; i++) {
+int sheight, dheight, y;
+ptrdiff_t swidth = av_image_get_linesize(src->format,
+ src->width,
+ i);
+ptrdiff_t dwidth = av_image_get_linesize(dst->format,
+ dst->width,
+ i);
+const AVComponentDescriptor comp = desc->comp[i];
+if (swidth < 0 || dwidth < 0) {
+av_log(NULL, AV_LOG_ERROR, "av_image_get_linesize failed\n");
+return AVERROR(EINVAL);
+}
+sheight = src->height;
+dheight = dst->height;
+if (i) {
+sheight = AV_CEIL_RSHIFT(src->height, desc->log2_chroma_h);
+dheight = AV_CEIL_RSHIFT(dst->height, desc->log2_chroma_h);
+}
+//fill right padding
+for (y = 0; y < sheight; y++) {
+void *line_ptr = dst->data[i] + y*dst->linesize[i] + swidth;
+av_memcpy_backptr(line_ptr,
+   comp.depth > 8 ? 2 : 1,
+   dwidth - swidth);
+}
+//fill bottom padding
+for (y = sheight; y < dheight; y++) {
+memcpy(dst->data[i]+y*dst->linesize[i],
+   dst->data[i]+(sheight-1)*dst->linesize[i],
+   dwidth);
+}
+}
+return 0;
+}
+
 static int qsv_device_init(AVHWDeviceContext *ctx)
 {
 AVQSVDeviceContext *hwctx = ctx->hwctx;
@@ -220,6 +270,7 @@ static void qsv_frames_uninit(AVHWFramesContext *ctx)
 av_freep(&s->surface_ptrs);
 av_freep(&s->surfaces_internal);
 av_freep(&s->handle_pairs_internal);
+av_frame_unref(&s->realigned_tmp_frame);
 av_buffer_unref(&s->child_frames_ref);
 }
 
@@ -1014,12 +1065,13 @@ static int qsv_transfer_data_to(AVHWFramesContext *ctx, 
AVFrame *dst,
 QSVFramesContext   *s = ctx->internal->priv;
 mfxFrameSurface1   in = {{ 0 }};
 mfxFrameSurface1 *out = (mfxFrameSurface1*)dst->data[3];
+mfxFrameInfo tmp_info;
 
 mfxSyncPoint sync = NULL;
 mfxStatus err;
 int ret = 0;
 /* make a copy if the input is not padded as libmfx requires */
-AVFrame tmp_frame;
+AVFrame *tmp_frame = &s->realigned_tmp_frame;
 const AVFrame *src_frame;
 int realigned = 0;
 
@@ -1048,24 +1100,40 @@ static int qsv_transfer_data_to(AVHWFramesContext *ctx, 
AVFrame *dst,
 if (ret < 0)
 return ret;
 
+/* According to MSDK spec

Re: [FFmpeg-devel] [PATCH V5 3/5] libavutil/hwcontext_vulkan: Allocate vkFrame in one memory

2021-12-03 Thread Chen, Wenbin
> 2 Dec 2021, 02:53 by wenbin.c...@intel.com:
> 
> >> The vaapi can import external frame, but the planes of the external
> >> frames should be in the same drm object. A new option
> "contiguous_planes"
> >> is added to device. This flag tells device to allocate places in one
> >> memory. When device is derived from vaapi this flag will be enabled.
> >> A new flag frame_flag is also added to AVVulkanFramesContext. User
> >> can use this flag to force enable or disable this behaviour.
> >> A new variable "offset "is added to AVVKFrame. It describe describe the
> >> offset from the memory currently bound to the VkImage.
> >>
> >> Signed-off-by: Wenbin Chen 
> >> ---
> >>  libavutil/hwcontext_vulkan.c | 68
> >> +++-
> >>  libavutil/hwcontext_vulkan.h | 24 +
> >>  2 files changed, 91 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
> >> index a0437c9661..eef9009ae1 100644
> >> --- a/libavutil/hwcontext_vulkan.c
> >> +++ b/libavutil/hwcontext_vulkan.c
> >> @@ -103,8 +103,14 @@ typedef struct VulkanDevicePriv {
> >>  /* Settings */
> >>  int use_linear_images;
> >>
> >> +/* allocate planes in a contiguous memory */
> >> +int contiguous_planes;
> >> +
> >>  /* Nvidia */
> >>  int dev_is_nvidia;
> >> +
> >> +/* Intel */
> >> +int dev_is_intel;
> >>  } VulkanDevicePriv;
> >>
> >>  typedef struct VulkanFramesPriv {
> >> @@ -153,6 +159,8 @@ typedef struct AVVkFrameInternal {
> >>  av_free((void *)props);\
> >>  }
> >>
> >> +#define VKF_FLAG(x, f) (((x) & (~AV_VK_FRAME_FLAG_NONE)) & (f))
> >> +
> >>  static const struct {
> >>  enum AVPixelFormat pixfmt;
> >>  const VkFormat vkfmts[4];
> >> @@ -1374,6 +1382,13 @@ static int
> >> vulkan_device_create_internal(AVHWDeviceContext *ctx,
> >>  if (opt_d)
> >>  p->use_linear_images = strtol(opt_d->value, NULL, 10);
> >>
> >> +opt_d = av_dict_get(opts, "contiguous_planes", NULL, 0);
> >> +if (opt_d)
> >> +p->contiguous_planes = strtol(opt_d->value, NULL, 10);
> >> +else
> >> +p->contiguous_planes = -1;
> >> +
> >> +
> >>  hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
> >>  hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
> >>
> >> @@ -1425,6 +1440,8 @@ static int
> vulkan_device_init(AVHWDeviceContext
> >> *ctx)
> >>
> >>  p->dev_is_nvidia = (p->props.properties.vendorID == 0x10de);
> >>
> >> +p->dev_is_intel = (p->props.properties.vendorID == 0x8086);
> >> +
> >>  vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev,
> >> &queue_num, NULL);
> >>  if (!queue_num) {
> >>  av_log(ctx, AV_LOG_ERROR, "Failed to get queues!\n");
> >> @@ -1742,8 +1759,12 @@ static int
> alloc_bind_mem(AVHWFramesContext
> >> *hwfc, AVVkFrame *f,
> >>  AVHWDeviceContext *ctx = hwfc->device_ctx;
> >>  VulkanDevicePriv *p = ctx->internal->priv;
> >>  FFVulkanFunctions *vk = &p->vkfn;
> >> +AVVulkanFramesContext *hwfctx = hwfc->hwctx;
> >>  const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
> >>  VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
> >> +VkMemoryRequirements memory_requirements = { 0 };
> >> +int mem_size = 0;
> >> +int mem_size_list[AV_NUM_DATA_POINTERS] = { 0 };
> >>
> >>  AVVulkanDeviceContext *hwctx = ctx->hwctx;
> >>
> >> @@ -1771,6 +1792,19 @@ static int
> alloc_bind_mem(AVHWFramesContext
> >> *hwfc, AVVkFrame *f,
> >>  req.memoryRequirements.size =
> >> FFALIGN(req.memoryRequirements.size,
> >>  p-
> >> >props.properties.limits.minMemoryMapAlignment);
> >>
> >> +if (VKF_FLAG(hwfctx->flags,
> >> AV_VK_FRAME_FLAG_CONTIGUOUS_MEMORY)) {
> >> +if (memory_requirements.size == 0) {
> >> +memory_requirements = req.memoryRequirements;
> >> +} else if (memory_requirements.memoryTypeBits !=
> >> req.memoryR

Re: [FFmpeg-devel] [PATCH 1/2] avfilter/vf_lut3d: expose 3D LUT file parse function.

2023-10-22 Thread Chen Yufei
Thanks for reviewing this patch.

Do you mean this should be merged with the change to vf_vpp_qsv file
and send only one patch file?

On Mon, Oct 16, 2023 at 3:51 PM Xiang, Haihao  wrote:
>
> On Sa, 2023-09-23 at 23:36 +0800, Chen Yufei wrote:
> > Signed-off-by: Chen Yufei 
> > ---
> >  libavfilter/Makefile   |   8 +-
> >  libavfilter/lut3d.c| 669 +
> >  libavfilter/lut3d.h|  13 +
> >  libavfilter/vf_lut3d.c | 590 +---
> >  4 files changed, 689 insertions(+), 591 deletions(-)
> >  create mode 100644 libavfilter/lut3d.c
> >
> > diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> > index 2fe0033b21..c1cd797e5c 100644
> > --- a/libavfilter/Makefile
> > +++ b/libavfilter/Makefile
> > @@ -330,7 +330,7 @@ OBJS-$(CONFIG_GRAPHMONITOR_FILTER)   +=
> > f_graphmonitor.o
> >  OBJS-$(CONFIG_GRAYWORLD_FILTER)  += vf_grayworld.o
> >  OBJS-$(CONFIG_GREYEDGE_FILTER)   += vf_colorconstancy.o
> >  OBJS-$(CONFIG_GUIDED_FILTER) += vf_guided.o
> > -OBJS-$(CONFIG_HALDCLUT_FILTER)   += vf_lut3d.o framesync.o
> > +OBJS-$(CONFIG_HALDCLUT_FILTER)   += vf_lut3d.o lut3d.o
> > framesync.o
> >  OBJS-$(CONFIG_HFLIP_FILTER)  += vf_hflip.o
> >  OBJS-$(CONFIG_HFLIP_VULKAN_FILTER)   += vf_flip_vulkan.o vulkan.o
> >  OBJS-$(CONFIG_HISTEQ_FILTER) += vf_histeq.o
> > @@ -367,10 +367,10 @@ OBJS-$(CONFIG_LIMITDIFF_FILTER)  +=
> > vf_limitdiff.o framesync.o
> >  OBJS-$(CONFIG_LIMITER_FILTER)+= vf_limiter.o
> >  OBJS-$(CONFIG_LOOP_FILTER)   += f_loop.o
> >  OBJS-$(CONFIG_LUMAKEY_FILTER)+= vf_lumakey.o
> > -OBJS-$(CONFIG_LUT1D_FILTER)  += vf_lut3d.o
> > +OBJS-$(CONFIG_LUT1D_FILTER)  += vf_lut3d.o lut3d.o
> >  OBJS-$(CONFIG_LUT_FILTER)+= vf_lut.o
> >  OBJS-$(CONFIG_LUT2_FILTER)   += vf_lut2.o framesync.o
> > -OBJS-$(CONFIG_LUT3D_FILTER)  += vf_lut3d.o framesync.o
> > +OBJS-$(CONFIG_LUT3D_FILTER)  += vf_lut3d.o lut3d.o
> > framesync.o
> >  OBJS-$(CONFIG_LUTRGB_FILTER) += vf_lut.o
> >  OBJS-$(CONFIG_LUTYUV_FILTER) += vf_lut.o
> >  OBJS-$(CONFIG_MASKEDCLAMP_FILTER)+= vf_maskedclamp.o 
> > framesync.o
> > @@ -549,7 +549,7 @@ OBJS-$(CONFIG_VIDSTABTRANSFORM_FILTER)   +=
> > vidstabutils.o vf_vidstabtransfo
> >  OBJS-$(CONFIG_VIF_FILTER)+= vf_vif.o framesync.o
> >  OBJS-$(CONFIG_VIGNETTE_FILTER)   += vf_vignette.o
> >  OBJS-$(CONFIG_VMAFMOTION_FILTER) += vf_vmafmotion.o framesync.o
> > -OBJS-$(CONFIG_VPP_QSV_FILTER)+= vf_vpp_qsv.o
> > +OBJS-$(CONFIG_VPP_QSV_FILTER)+= vf_vpp_qsv.o lut3d.o
>
> This should be moved to patch 2/2.
>
> Thanks
> Haihao
>
>
> >  OBJS-$(CONFIG_VSTACK_FILTER) += vf_stack.o framesync.o
> >  OBJS-$(CONFIG_W3FDIF_FILTER) += vf_w3fdif.o
> >  OBJS-$(CONFIG_WAVEFORM_FILTER)   += vf_waveform.o
> > diff --git a/libavfilter/lut3d.c b/libavfilter/lut3d.c
> > new file mode 100644
> > index 00..173979adcc
> > --- /dev/null
> > +++ b/libavfilter/lut3d.c
> > @@ -0,0 +1,669 @@
> > +/*
> > + * Copyright (c) 2013 Clément Bœsch
> > + * Copyright (c) 2018 Paul B Mahol
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> > USA
> > + */
> > +
> > +#include "lut3d.h"
> > +
> > +#include 
> > +
> > +#include "libavutil/avstring.h"
> > +#include "libavutil/file_open.h"
> >

Re: [FFmpeg-devel] [PATCH 2/2] avfilter/vf_vpp_qsv: apply 3D LUT from file.

2023-10-22 Thread Chen Yufei
Thanks for your comments.

I'll use MFX_RESOURCE_SYSTEM_SURFACE and send another patch.

While implementing this feature, I noticed that
vpp_set_frame_ext_params is called multiple times.
If system memory is used to store the 3D LUT, is it possible that the
LUT will be copied to gfx memory multiple times?

On Mon, Oct 16, 2023 at 4:05 PM Xiang, Haihao  wrote:
>
> On Sa, 2023-09-23 at 23:36 +0800, Chen Yufei wrote:
> > Usage: "vpp_qsv=lut3d_file="
> >
> > Only enabled with VAAPI because using VASurface to store 3D LUT.
> >
> > Signed-off-by: Chen Yufei 
> > ---
> >  libavfilter/vf_vpp_qsv.c | 241 ++-
> >  1 file changed, 236 insertions(+), 5 deletions(-)
> >
> > diff --git a/libavfilter/vf_vpp_qsv.c b/libavfilter/vf_vpp_qsv.c
> > index c07b45fedb..cd913d3c40 100644
> > --- a/libavfilter/vf_vpp_qsv.c
> > +++ b/libavfilter/vf_vpp_qsv.c
> > @@ -23,6 +23,7 @@
> >
> >  #include 
> >
> > +#include "config.h"
> >  #include "config_components.h"
> >
> >  #include "libavutil/opt.h"
> > @@ -37,10 +38,15 @@
> >  #include "internal.h"
> >  #include "avfilter.h"
> >  #include "filters.h"
> > +#include "lut3d.h"
> >
> >  #include "qsvvpp.h"
> >  #include "transpose.h"
> >
> > +#if QSV_ONEVPL && CONFIG_VAAPI
> > +#include 
> > +#endif
>
> VA-API is available on Windows now, however oneVPL can't work with VA-API on
> Windows.
>
> I'd prefer to support MFX_RESOURCE_SYSTEM_SURFACE instead of
> MFX_RESOURCE_VA_SURFACE in FFmpeg because we needn't consider VA-API too much
> for MFX_RESOURCE_SYSTEM_SURFACE. oneVPL should be able to copy data from 
> system
> memory to gfx memory internally.
>
> Thanks
> Haihao
>
>
> > +
> >  #define OFFSET(x) offsetof(VPPContext, x)
> >  #define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
> >
> > @@ -67,6 +73,10 @@ typedef struct VPPContext{
> >  /** HDR parameters attached on the input frame */
> >  mfxExtMasteringDisplayColourVolume mdcv_conf;
> >  mfxExtContentLightLevelInfo clli_conf;
> > +
> > +/** LUT parameters attached on the input frame */
> > +mfxExtVPP3DLut lut3d_conf;
> > +LUT3DContext lut3d;
> >  #endif
> >
> >  /**
> > @@ -260,6 +270,7 @@ static av_cold int vpp_preinit(AVFilterContext *ctx)
> >
> >  static av_cold int vpp_init(AVFilterContext *ctx)
> >  {
> > +int ret = 0;
> >  VPPContext  *vpp  = ctx->priv;
> >
> >  if (!vpp->output_format_str || !strcmp(vpp->output_format_str, 
> > "same")) {
> > @@ -288,9 +299,9 @@ static av_cold int vpp_init(AVFilterContext *ctx)
> >  STRING_OPTION(color_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED);
> >  STRING_OPTION(color_transfer,  color_transfer,  AVCOL_TRC_UNSPECIFIED);
> >  STRING_OPTION(color_matrix,color_space, AVCOL_SPC_UNSPECIFIED);
> > -
> >  #undef STRING_OPTION
> > -return 0;
> > +
> > +return ret;
> >  }
> >
> >  static int config_input(AVFilterLink *inlink)
> > @@ -388,6 +399,194 @@ static mfxStatus get_mfx_version(const AVFilterContext
> > *ctx, mfxVersion *mfx_ver
> >  return MFXQueryVersion(device_hwctx->session, mfx_version);
> >  }
> >
> > +#if QSV_ONEVPL && CONFIG_VAAPI
> > +static mfxStatus get_va_display(AVFilterContext *ctx, VADisplay 
> > *va_display)
> > +{
> > +VPPContext *vpp = ctx->priv;
> > +QSVVPPContext *qsvvpp = &vpp->qsv;
> > +mfxHDL handle;
> > +mfxStatus ret;
> > +
> > +ret = MFXVideoCORE_GetHandle(qsvvpp->session, MFX_HANDLE_VA_DISPLAY,
> > &handle);
> > +if (ret != MFX_ERR_NONE) {
> > +av_log(ctx, AV_LOG_ERROR, "MFXVideoCORE_GetHandle failed, status:
> > %d\n", ret);
> > +*va_display = NULL;
> > +return ret;
> > +}
> > +
> > +*va_display = (VADisplay)handle;
> > +return MFX_ERR_NONE;
> > +}
> > +
> > +// Allocate memory on device and copy 3D LUT table.
> > +// Reference
> > https://spec.oneapi.io/onevpl/2.9.0/programming_guide/VPL_prg_vpp.html#video-processing-3dlut
> > +static int init_3dlut_surface(AVFilterContext *ctx)
> > +{
> > +VPPContext *vpp = ctx->priv;
> > +LUT3DContext *lut3d = &vpp->lut3d;
> > +mfxExtVPP3DLut *lut3d_conf = &vpp->l

  1   2   3   4   5   6   >