from:"James Darnley"

Re: [FFmpeg-devel] [PATCH] libavcodec Adding ff_v210_planar_unpack AVX2

2019-03-27 Thread James Darnley

On 2019-03-26 21:22, Mike Stoner via ffmpeg-devel wrote:
> Hello,
> I’ve accounted for all feedback on this so far, I’m wondering if it is ready 
> to be pushed upstream?
> 
> Here are my results from ‘checkasm’ (lower is better):
> 
> v210_unpack_c: 1636
> v210_unpack_ssse3: 611
> v210_unpack_avx: 601
> v210_unpack_avx2: 423
> 
> I ran it 5 times and averaged the middle 3 results for each CPU target 
> (ignoring the highest and lowest time).
> 
> https://patchwork.ffmpeg.org/patch/12325/
> 
> 
> Thanks… -Mike

Sorry that I keep forgetting about this.  I will try to make some time
tomorrow to give this another look over.

I'm not sure what order this and my checkasm patch should be applied in,
which I also forgot about.

Did anyone else make comments on either patch?

signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 0/3] v210dec checkasm test and avx2 function

2019-04-10 Thread James Darnley

I am resending these patches because I am not sure if I sent this version in
the past.  I split my changes into two patches because they do separate things.

I also changed some tabs to spaces in Mike's AVX2 patch.

James Darnley (2):
  avcodec/v210dec: move DSP function setting into dedicated function
  checkasm: add test for v210dec

Michael Stoner (1):
  libavcodec Adding ff_v210_planar_unpack AVX2

 libavcodec/v210dec.c   | 26 +
 libavcodec/v210dec.h   |  1 +
 libavcodec/x86/v210-init.c |  8 
 libavcodec/x86/v210.asm| 72 +++
 tests/checkasm/Makefile|  1 +
 tests/checkasm/checkasm.c  |  3 ++
 tests/checkasm/checkasm.h  |  1 +
 tests/checkasm/v210dec.c   | 77 ++
 8 files changed, 166 insertions(+), 23 deletions(-)
 create mode 100644 tests/checkasm/v210dec.c

-- 
2.21.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/3] checkasm: add test for v210dec

2019-04-10 Thread James Darnley

---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/v210dec.c  | 77 +++
 4 files changed, 82 insertions(+)
 create mode 100644 tests/checkasm/v210dec.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 8cc0bff2d1..886ae33167 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
+AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9eec41e3c4..bf51e00eab 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -136,6 +136,9 @@ static const struct {
 #if CONFIG_UTVIDEO_DECODER
 { "utvideodsp", checkasm_check_utvideodsp },
 #endif
+#if CONFIG_V210_DECODER
+{ "v210dec", checkasm_check_v210dec },
+#endif
 #if CONFIG_V210_ENCODER
 { "v210enc", checkasm_check_v210enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 9e8e879fd3..9b8d2f5419 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_utvideodsp(void);
+void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
new file mode 100644
index 00..7dd50a8271
--- /dev/null
+++ b/tests/checkasm/v210dec.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/v210dec.h"
+
+static uint32_t get_v210(void)
+{
+uint32_t t0 = rnd() & 0x3ff,
+ t1 = rnd() & 0x3ff,
+ t2 = rnd() & 0x3ff;
+uint32_t value =  t0
+   | (t1 << 10)
+   | (t2 << 20);
+return value;
+}
+
+#define NUM_SAMPLES 2048
+
+static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len)
+{
+for (int i = 0; i < len; i++) {
+uint32_t value = get_v210();
+src0[i] = value;
+src1[i] = value;
+}
+}
+
+void checkasm_check_v210dec(void)
+{
+V210DecContext h;
+
+h.aligned_input = 0;
+ff_v210dec_init(&h);
+
+if (check_func(h.unpack_frame, "v210_unpack")) {
+uint32_t src0[NUM_SAMPLES/3];
+uint32_t src1[NUM_SAMPLES/3];
+uint16_t y0[NUM_SAMPLES/2];
+uint16_t y1[NUM_SAMPLES/2];
+uint16_t u0[NUM_SAMPLES/4];
+uint16_t u1[NUM_SAMPLES/4];
+uint16_t v0[NUM_SAMPLES/4];
+uint16_t v1[NUM_SAMPLES/4];
+declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
+const int pixels = NUM_SAMPLES / 2 / 6 * 6;
+
+randomize_buffers(src0, src1, NUM_SAMPLES/3);
+call_ref(src0, y0, u0, v0, pixels);
+call_new(src1, y1, u1, v1, pixels);
+if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0])
+|| memcmp(y0, y1, pixels * sizeof y0[0])
+|| memcmp(u0, u1, pixels/2 * sizeof u0[0])
+|| memcmp(v0, v1, pixels/2 * sizeof v0[0]))
+fail();
+bench_new(src1, y1, u1, v1, pixels);
+}
+report("v210_unpack");
+}
-- 
2.21.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/3] libavcodec Adding ff_v210_planar_unpack AVX2

2019-04-10 Thread James Darnley

From: Michael Stoner 

Replaced VSHUFPS with VPBLENDD to relieve port 5 bottleneck
AVX2 is 1.4x faster than AVX
---

Mike, is this still the patch you want applied?  I had to make a small
amendment to it because you had some tabs as indentation.

 libavcodec/v210dec.c   | 10 +-
 libavcodec/x86/v210-init.c |  8 +
 libavcodec/x86/v210.asm| 72 +-
 3 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index fd8a6b0d78..bc1e1d34ff 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -123,7 +123,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 const uint32_t *src = (const uint32_t*)psrc;
 uint32_t val;
 
-w = (avctx->width / 6) * 6;
+w = (avctx->width / 12) * 12;
 s->unpack_frame(src, y, u, v, w);
 
 y += w;
@@ -131,6 +131,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 v += w >> 1;
 src += (w << 1) / 3;
 
+if (w < avctx->width - 5) {
+READ_PIXELS(u, y, v);
+READ_PIXELS(y, u, y);
+READ_PIXELS(v, y, u);
+READ_PIXELS(y, v, y);
+w += 6;
+}
+
 if (w < avctx->width - 1) {
 READ_PIXELS(u, y, v);
 
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
index d64dbca1a8..cb9a6cbd6a 100644
--- a/libavcodec/x86/v210-init.c
+++ b/libavcodec/x86/v210-init.c
@@ -21,9 +21,11 @@
 
 extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, 
uint16_t *y, uint16_t *u, uint16_t *v, int width);
 extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
 
 extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
 extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
 
 av_cold void ff_v210_x86_init(V210DecContext *s)
 {
@@ -36,6 +38,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
 if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
 s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+
+if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
+s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
 }
 else {
 if (cpu_flags & AV_CPU_FLAG_SSSE3)
@@ -43,6 +48,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
 if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
 s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+
+if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
+s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
 }
 #endif
 }
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
index c24c765e5b..706712313d 100644
--- a/libavcodec/x86/v210.asm
+++ b/libavcodec/x86/v210.asm
@@ -22,9 +22,14 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
+
+; for AVX2 version only
+v210_luma_permute: dd 0,1,2,4,5,6,7,7  ; 32-byte alignment required
+v210_chroma_shuf2: db 0,1,2,3,4,5,8,9,10,11,12,13,-1,-1,-1,-1
+v210_luma_shuf_avx2: db 0,1,4,5,6,7,8,9,12,13,14,15,-1,-1,-1,-1
+v210_chroma_shuf_avx2: db 0,1,4,5,10,11,-1,-1,2,3,8,9,12,13,-1,-1
 
-v210_mask: times 4 dd 0x3ff
 v210_mult: dw 64,4,64,4,64,4,64,4
 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
@@ -34,40 +39,65 @@ SECTION .text
 %macro v210_planar_unpack 1
 
 ; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t 
*v, int width)
-cglobal v210_planar_unpack_%1, 5, 5, 7
+cglobal v210_planar_unpack_%1, 5, 5, 8
 movsxdifnidn r4, r4d
 lear1, [r1+2*r4]
 addr2, r4
 addr3, r4
 negr4
 
-mova   m3, [v210_mult]
-mova   m4, [v210_mask]
-mova   m5, [v210_luma_shuf]
-mova   m6, [v210_chroma_shuf]
+VBROADCASTI128   m3, [v210_mult]
+VBROADCASTI128   m5, [v210_chroma_shuf]
+
+%if cpuflag(avx2)
+VBROADCASTI128   m4, [v210_luma_shuf_avx2]
+VBROADCASTI128   m5, [v210_chroma_shuf_avx2]
+mova m6, [v210_luma_permute]
+VBROADCASTI128   m7, [v210_chroma_shuf2]
+%else
+VBROADCASTI128   m4, [v210_luma_shuf]
+VBROADCASTI128   m5, [v210_chroma_shuf]
+%endif
+
 .loop:
 %ifidn %1, unaligned
-movu   m0, [r0]
+movu   m0, [r0]; yB v5 yA  u5 y9 v4  y8 u4 y7  v3 y6 u3  y5 v2 y4  u2 
y3 v1  y2 u1 y1  v0 y0 u0
 %else
 mova   m0, [r0]
 %endif
 
 pmullw m1, m0, m3
-psrld  m0, 10
-psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
-pand   m0, m4 ; y0 __ u1 __ y3 __ v2

[FFmpeg-devel] [PATCH 1/3] avcodec/v210dec: move DSP function setting into dedicated function

2019-04-10 Thread James Darnley

Prepare for checkasm test.
---
 libavcodec/v210dec.c | 16 ++--
 libavcodec/v210dec.h |  1 +
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ddc5dbe8be..fd8a6b0d78 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -50,6 +50,13 @@ static void v210_planar_unpack_c(const uint32_t *src, 
uint16_t *y, uint16_t *u,
 }
 }
 
+av_cold void ff_v210dec_init(V210DecContext *s)
+{
+s->unpack_frame = v210_planar_unpack_c;
+if (ARCH_X86)
+ff_v210_x86_init(s);
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
 V210DecContext *s = avctx->priv_data;
@@ -57,10 +64,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
 avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
 avctx->bits_per_raw_sample = 10;
 
-s->unpack_frame= v210_planar_unpack_c;
-
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+s->aligned_input = 0;
+ff_v210dec_init(s);
 
 return 0;
 }
@@ -102,8 +107,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
 if (aligned_input != s->aligned_input) {
 s->aligned_input = aligned_input;
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 }
 
 if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index 533afc435c..cfdb29da09 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -31,6 +31,7 @@ typedef struct {
 void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
 } V210DecContext;
 
+void ff_v210dec_init(V210DecContext *s);
 void ff_v210_x86_init(V210DecContext *s);
 
 #endif /* AVCODEC_V210DEC_H */
-- 
2.21.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] libavcodec Adding ff_v210_planar_unpack AVX2

2019-04-10 Thread James Darnley

On 2019-04-10 14:47, James Darnley wrote:
> From: Michael Stoner 

Screw you mailing list or git, whichever one of you managed to screw up
the author's address.  I will correct that, if I can.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 0/3] v210dec checkasm test and avx2 function

2019-04-18 Thread James Darnley

On 2019-04-10 14:47, James Darnley wrote:
> I am resending this my patches because I am not sure if I sent this version in
> the past.  I split my changes into two patches because they do separate 
> things.
> 
> I also changed some tabs to spaces in Mike's AVX2 patch.
> 
> James Darnley (2):
>   avcodec/v210dec: move DSP function setting into dedicated function
>   checkasm: add test for v210dec
> 
> Michael Stoner (1):
>   libavcodec Adding ff_v210_planar_unpack AVX2
> 
>  libavcodec/v210dec.c   | 26 +
>  libavcodec/v210dec.h   |  1 +
>  libavcodec/x86/v210-init.c |  8 
>  libavcodec/x86/v210.asm| 72 +++
>  tests/checkasm/Makefile|  1 +
>  tests/checkasm/checkasm.c  |  3 ++
>  tests/checkasm/checkasm.h  |  1 +
>  tests/checkasm/v210dec.c   | 77 ++
>  8 files changed, 166 insertions(+), 23 deletions(-)
>  create mode 100644 tests/checkasm/v210dec.c
> 

Any objections to this patchset?  I have corrected the address of
Michael's patch to the address I Cced.  I hope that is the right one.


___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec/v210dec: Fix alignment check for AVX2

2019-05-18 Thread James Darnley

On 2019-05-18 09:39, Michael Niedermayer wrote:
> Fixes: "null pointer dereference"
> Fixes: 
> 14551/clusterfuzz-testcase-minimized-ffmpeg_AV_CODEC_ID_V210_fuzzer-5088609952071680
> 
> Found-by: continuous fuzzing process 
> https://github.com/google/oss-fuzz/tree/master/projects/ffmpeg
> Signed-off-by: Michael Niedermayer 
> ---
>  libavcodec/v210dec.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index bc1e1d34ff..5a33d8c089 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -104,7 +104,7 @@ static int decode_frame(AVCodecContext *avctx, void 
> *data, int *got_frame,
>  && avpkt->size - 64 >= stride * avctx->height)
>  psrc += 64;
>  
> -aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
> +aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
>  if (aligned_input != s->aligned_input) {
>  s->aligned_input = aligned_input;
>  ff_v210dec_init(s);
> 

Ah yes, that'll be needed after the recent addition of avx2.  LGTM and
sorry.

I object to the commit message though because it isn't a "null pointer
dereference" but if that is the error as reported by the tool then keep
it as is.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec/v210dec: Fix alignment check for AVX2

2019-05-18 Thread James Darnley

On 2019-05-18 12:15, Michael Niedermayer wrote:
> On Sat, May 18, 2019 at 12:02:55PM +0200, James Darnley wrote:
>> I object to the commit message though because it isn't a "null pointer
>> dereference" but if that is the error as reported by the tool then keep
>> it as is.
> 
> yes, the tool(s) say things like "Null-dereference READ", "SEGV on unknown 
> address 0x"
> 

Hm.  It is almost certainly an aligned move on an unaligned address.

I don't care that much about the rest of the commit message; the subject
is correct which is good enough.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/7] libavfilter/vf_overlay.c: change the commands style for the macro defined function

2019-05-24 Thread James Darnley

On 2019-05-24 11:36, lance.lmw...@gmail.com wrote:
> From: Limin Wang 
> 
> ...

Why?  And these are "comments" not "commands".

signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/7] libavfilter/vf_overlay.c: change the commands style for the macro defined function

2019-05-24 Thread James Darnley

On 2019-05-24 12:06, James Darnley wrote:
> On 2019-05-24 11:36, lance.lmw...@gmail.com wrote:
>> From: Limin Wang 
>>
>> ...
> 
> Why?

I see why: so you don't screw-up the macros you create later.

signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec: Add librav1e encoder

2019-05-28 Thread James Darnley

On 2019-05-28 22:00, Derek Buitenhuis wrote:
> On 28/05/2019 20:58, James Almer wrote:
>> I think x26* and vpx/aom call it crf? It's not in option_tables.h in any
>> case.
> 
> They do not. This is a constant quantizer mode, not constant rate factor.

IIRC either qp or cqp




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/5] lavu/pixfmt: add Y210/AYUV/Y410 pixel formats

2019-06-27 Thread James Darnley

On 2019-06-28 04:26, Linjie Fu wrote:
> Previously, media driver provided planar format(like 420 8 bit), but
> for HEVC Range Extension (422/444 8/10 bit), the decoded image is
> produced in packed format.
> 
> Y210/AYUV/Y410 are packed formats which are needed in HEVC Rext decoding
> for both VAAPI and QSV:
> - Y210: 422 10 BIT
> - AYUV: 444  8 BIT
> - Y410: 444 10 BIT
> 

Why am I suspicious that at least one of those is a re-ordered v210?  I
seem to recall that we rejected adding v210 to this list.  Either they
don't belong in this list or they don't belong because libavcodec has a
proper decoder (at least for v210).

This might be the thread I was remembering but March seems too recent
> https://ffmpeg.org/pipermail/ffmpeg-devel/2019-March/241549.html

No real conclusion was reached there.

Do bit-packed formats belong in an AVPixelFormat?

signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/5] lavu/pixfmt: add Y210/AYUV/Y410 pixel formats

2019-06-28 Thread James Darnley

On 2019-06-28 03:03, Hendrik Leppkes wrote:
> On Fri, Jun 28, 2019 at 1:26 AM James Darnley  wrote:
>>
>> On 2019-06-28 04:26, Linjie Fu wrote:
>>> Previously, media driver provided planar format(like 420 8 bit), but
>>> for HEVC Range Extension (422/444 8/10 bit), the decoded image is
>>> produced in packed format.
>>>
>>> Y210/AYUV/Y410 are packed formats which are needed in HEVC Rext decoding
>>> for both VAAPI and QSV:
>>> - Y210: 422 10 BIT
>>> - AYUV: 444  8 BIT
>>> - Y410: 444 10 BIT
>>>
>>
>>
>> Why am I suspicious that at least one of those is a re-ordered v210?  I
>> seem to recall that we rejected adding v210 to this list.  Either they
>> don't belong in this list or they don't belong because libavcodec has a
>> proper decoder (at least for v210).
>>
> 
> They are not quite as bad as v210 (and not related).
> 
> Microsoft documents them here as the recommended formats to be used on 
> Windows:
> https://docs.microsoft.com/en-us/windows/desktop/medfound/recommended-8-bit-yuv-formats-for-video-rendering#444-formats-32-bits-per-pixel
> https://docs.microsoft.com/en-us/windows/desktop/medfound/10-bit-and-16-bit-yuv-video-formats
> 
> - Hendrik

Okay y410 and y210 use the highest 10 bits in each 16-bit word.  I
apologise for jumping to that conclusion.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Issues while encoding a ts file to m3u8

2019-08-02 Thread James Darnley

On 2019-08-02 15:55, Ramana Jajula wrote:
> Hi,
> 
> I am trying to encode my ts file m3u8 using my customised ffmpeg of version
> 4.1. I used below command to do encoding.
> 
> ffmpeg -re -threads 8 -i /videos/input.ts -vcodec libx264 -s 320x240 -b:v
> 512000 -maxrate 512000 -acodec libfdk_aac -b:a 32000 -ac 2 -ar 48000
> -force_key_frames 'expr:gte(t,n_forced*3)' -hls_flags single_file
> -hls_list_size 0 -hls_time 3 -fsize 400x222 -frames /frames/my_frames/
> -index /mpegindex/my_index.idx  -y /encoded/test/output.m3u8
> 
> My encoding was bad. The output printed to console is
>   libavutil  56. 22.100 / 56. 22.100
>   libavcodec 58. 35.100 / 58. 35.100
>   libavformat58. 20.100 / 58. 20.100
>   libavdevice58.  5.100 / 58.  5.100
>   libavfilter 7. 40.101 /  7. 40.101
>   libavresample   4.  0.  0 /  4.  0.  0
>   libswscale  5.  3.100 /  5.  3.100
>   libswresample   3.  3.100 /  3.  3.100
>   libpostproc55.  3.100 / 55.  3.100
> /videos/input.ts FPS 25.00 0
> Input #0, mpegts, from '/videos/.input.ts':
>   Duration: 00:04:05.97, start: 85837.091689, bitrate: 1769 kb/s
>   Program 1
> Stream #0:0[0x105]: Video: h264 (Main) ([27][0][0][0] / 0x001B),
> yuv420p(top first), 1920x1080 [SAR 1:1 DAR 16:9], 25 fps, 25 tbr, 90k tbn,
> 50 tbc
> Stream #0:1[0x106]: Audio: ac3 ([129][0][0][0] / 0x0081), 48000 Hz,
> stereo, fltp, 128 kb/s
> [libx264 @ 0x564a2f7cc480] VBV maxrate specified, but no bufsize, ignored
> [libx264 @ 0x564a2f7cc480] using SAR=4/3
> [libx264 @ 0x564a2f7cc480] using cpu capabilities: MMX2 SSE2Fast SSSE3
> SSE4.2
> [libx264 @ 0x564a2f7cc480] profile High, level 2.0
> [libx264 @ 0x564a2f7cc480] 264 - core 148 r2748 97eaef2 - H.264/MPEG-4 AVC
> codec - Copyleft 2003-2016 - http://www.videolan.org/x264.html - options:
> cabac=1 ref=3 debloc
> k=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1
> me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11
> fast_pskip=1 chroma_qp_offset
> =-2 threads=3 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1
> interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2
> b_adapt=1 b_bias=0 direct=1 wei
> ghtb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40
> intra_refresh=0 rc_lookahead=40 rc=abr mbtree=1 bitrate=512 ratetol=1.0
> qcomp=0.60 qpmin=0 qpmax=69 qpst
> ep=4 ip_ratio=1.40 aq=1:1.00
> [hls @ 0x564a2f7ccc40] Using AVStream.codec to pass codec parameters to
> muxers is deprecated, use AVStream.codecpar instead.
> Last message repeated 1 times
> [hls @ 0x564a2f7ccc40] Opening '/encodedt/input.ts' for writing
> Output #0, hls, to '/encoded/output.m3u8':
>   Metadata:
> encoder : Lavf58.20.100
> Stream #0:0: Video: h264 (libx264), yuv420p, 320x240 [SAR 4:3 DAR
> 16:9], q=-1--1, 512 kb/s, 25 fps, 90k tbn, 25 tbc
> Metadata:
>   encoder : Lavc58.35.100 libx264
> Side data:
>   cpb: bitrate max/min/avg: 512000/0/512000 buffer size: 0 vbv_delay: -1
> Stream #0:1: Audio: aac (libfdk_aac), 48000 Hz, stereo, s16, 32 kb/s
> Metadata:
>   encoder : Lavc58.35.100 libfdk_aac
> Stream mapping:
>   Stream #0:0 -> #0:0 (h264 (native) -> h264 (libx264))
>   Stream #0:1 -> #0:1 (ac3 (native) -> aac (libfdk_aac))
> Press [q] to stop, [?] for help
> frame=   34 fps=0.1 q=0.0 size=N/A time=00:05:02.11 bitrate=N/A dup=29
> drop=0 speed=0.567x
> [hls @ 0x564a2f7ccc40] Packets poorly interleaved, failed to avoid negative
> timestamp -3360 in stream 0.0.567x
> Try -max_interleave_delta 0 as a possible workaround.
> 
> Since the encoding speed is too slow I had to cancel the encoding process.
> I killed it,
> 
> What is the reason for this slow encoding process?
> 
> PS: My input file is of 1 hour duration.
> 

1 - Wrong mailing list.  This should probably be on ffmpeg-user.

2 - What configure options did you use for ffmpeg?  Why did you remove them?

3 - What "modifications" have you made?

4 - What CPU do you have?  One without AVX is either old, or limited
(like Celerons and Pentiums)

5 - Why are you using an x264 from 2016?  Have you "modified" it too?

Next time just press 'q' to end encoding so we can see some stats.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/7] x86inc: Fix VEX -> EVEX instruction conversion

2019-08-05 Thread James Darnley

From: Henrik Gramner 

There's an edge case that wasn't properly handled.
---
 libavutil/x86/x86inc.asm | 5 +
 1 file changed, 5 insertions(+)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 5044ee86f0..bc370a6186 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1662,6 +1662,11 @@ FMA4_INSTR fnmsub,   pd, ps, sd, ss
 %assign %%evex_required 1
 %endif
 %endif
+%ifnum regnumof%3
+%if regnumof%3 >= 16 || sizeof%3 > 32
+%assign %%evex_required 1
+%endif
+%endif
 %if %%evex_required
 %6 %%args
 %else
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 4/7] x86inc: Turn 'movsxd' into 'movifnidn' on x86-32

2019-08-05 Thread James Darnley

From: Henrik Gramner 

---
 libavutil/x86/x86inc.asm | 4 
 1 file changed, 4 insertions(+)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 10b7711637..04dbb6b785 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -293,6 +293,10 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %endif
 %endmacro
 
+%if ARCH_X86_64 == 0
+%define movsxd movifnidn
+%endif
+
 %macro movsxdifnidn 2
 %ifnidn %1, %2
 movsxd %1, %2
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 5/7] x86inc: Make 'non-adjacent' default in the TAIL_CALL macro

2019-08-05 Thread James Darnley

From: Henrik Gramner 

---
 libavutil/x86/x86inc.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 04dbb6b785..af35fe1e4d 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -685,7 +685,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 
 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, 
jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
 
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
 %if has_epilogue
 call %1
 RET
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 6/7] x86inc: Improve warnings for use of unsupported instructions

2019-08-05 Thread James Darnley

From: Henrik Gramner 

Warn when the following are used without the appropriate cpuflag:
 * YMM and ZMM registers
 * 'pextrw' with a memory operand
 * GPR instruction set extensions
---
 libavutil/x86/x86inc.asm | 120 +++
 1 file changed, 83 insertions(+), 37 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index af35fe1e4d..d1b4c982fc 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1216,8 +1216,22 @@ INIT_XMM
 %ifdef cpuname
 %if notcpuflag(%2)
 %error use of ``%1'' %2 instruction in cpuname function: 
current_function
-%elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && 
__sizeofreg > 8
+%elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
 %error use of ``%1'' sse2 instruction in cpuname function: 
current_function
+%elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+%error use of ``%1'' avx2 instruction in cpuname function: 
current_function
+%elif __sizeofreg == 16 && notcpuflag(sse)
+%error use of ``%1'' sse instruction in cpuname function: 
current_function
+%elif __sizeofreg == 32 && notcpuflag(avx)
+%error use of ``%1'' avx instruction in cpuname function: 
current_function
+%elif __sizeofreg == 64 && notcpuflag(avx512)
+%error use of ``%1'' avx512 instruction in cpuname function: 
current_function
+%elifidn %1, pextrw ; special case because the base instruction is 
mmx2,
+%ifnid %6   ; but sse4 is required for memory operands
+%if notcpuflag(sse4)
+%error use of ``%1'' sse4 instruction in cpuname 
function: current_function
+%endif
+%endif
 %endif
 %endif
 %endif
@@ -1379,38 +1393,38 @@ AVX_INSTR cmpunordpd, sse2, 1, 0, 1
 AVX_INSTR cmpunordps, sse, 1, 0, 1
 AVX_INSTR cmpunordsd, sse2, 1, 0, 0
 AVX_INSTR cmpunordss, sse, 1, 0, 0
-AVX_INSTR comisd, sse2
-AVX_INSTR comiss, sse
-AVX_INSTR cvtdq2pd, sse2
-AVX_INSTR cvtdq2ps, sse2
-AVX_INSTR cvtpd2dq, sse2
-AVX_INSTR cvtpd2ps, sse2
-AVX_INSTR cvtps2dq, sse2
-AVX_INSTR cvtps2pd, sse2
-AVX_INSTR cvtsd2si, sse2
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
 AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
 AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
 AVX_INSTR cvtsi2ss, sse, 1, 0, 0
 AVX_INSTR cvtss2sd, sse2, 1, 0, 0
-AVX_INSTR cvtss2si, sse
-AVX_INSTR cvttpd2dq, sse2
-AVX_INSTR cvttps2dq, sse2
-AVX_INSTR cvttsd2si, sse2
-AVX_INSTR cvttss2si, sse
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
 AVX_INSTR divpd, sse2, 1, 0, 0
 AVX_INSTR divps, sse, 1, 0, 0
 AVX_INSTR divsd, sse2, 1, 0, 0
 AVX_INSTR divss, sse, 1, 0, 0
 AVX_INSTR dppd, sse4, 1, 1, 0
 AVX_INSTR dpps, sse4, 1, 1, 0
-AVX_INSTR extractps, sse4
+AVX_INSTR extractps, sse4, 1
 AVX_INSTR haddpd, sse3, 1, 0, 0
 AVX_INSTR haddps, sse3, 1, 0, 0
 AVX_INSTR hsubpd, sse3, 1, 0, 0
 AVX_INSTR hsubps, sse3, 1, 0, 0
 AVX_INSTR insertps, sse4, 1, 1, 0
 AVX_INSTR lddqu, sse3
-AVX_INSTR ldmxcsr, sse
+AVX_INSTR ldmxcsr, sse, 1
 AVX_INSTR maskmovdqu, sse2
 AVX_INSTR maxpd, sse2, 1, 0, 1
 AVX_INSTR maxps, sse, 1, 0, 1
@@ -1420,10 +1434,10 @@ AVX_INSTR minpd, sse2, 1, 0, 1
 AVX_INSTR minps, sse, 1, 0, 1
 AVX_INSTR minsd, sse2, 1, 0, 0
 AVX_INSTR minss, sse, 1, 0, 0
-AVX_INSTR movapd, sse2
-AVX_INSTR movaps, sse
+AVX_INSTR movapd, sse2, 1
+AVX_INSTR movaps, sse, 1
 AVX_INSTR movd, mmx
-AVX_INSTR movddup, sse3
+AVX_INSTR movddup, sse3, 1
 AVX_INSTR movdqa, sse2
 AVX_INSTR movdqu, sse2
 AVX_INSTR movhlps, sse, 1, 0, 0
@@ -1432,19 +1446,19 @@ AVX_INSTR movhps, sse, 1, 0, 0
 AVX_INSTR movlhps, sse, 1, 0, 0
 AVX_INSTR movlpd, sse2, 1, 0, 0
 AVX_INSTR movlps, sse, 1, 0, 0
-AVX_INSTR movmskpd, sse2
-AVX_INSTR movmskps, sse
+AVX_INSTR movmskpd, sse2, 1
+AVX_INSTR movmskps, sse, 1
 AVX_INSTR movntdq, sse2
 AVX_INSTR movntdqa, sse4
-AVX_INSTR movntpd, sse2
-AVX_INSTR movntps, sse
+AVX_INSTR movntpd, sse2, 1
+AVX_INSTR movntps, sse, 1
 AVX_INSTR movq, mmx
 AVX_INSTR movsd, sse2, 1, 0, 0
-AVX_INSTR movshdup, sse3
-AVX_INSTR movsldup, sse3
+AVX_INSTR movshdup, sse3, 1
+AVX_INSTR movsldup, sse3, 1
 AVX_INSTR movss, sse, 1, 0, 0
-AVX_INSTR movupd, sse2
-AVX_INSTR movups, sse
+AVX_INSTR movupd, sse2, 1
+AVX_INSTR movups, sse, 1
 AVX_INSTR mpsadbw, sse4, 0, 1, 0
 AVX_INSTR mulpd, sse2, 1, 0, 1
 AVX_INSTR mulps, sse, 1, 0, 1
@@ -1577,27 +1591,27 @@ AVX_INSTR punpcklwd, mmx, 0, 0, 0
 AVX_INSTR punpckldq, mmx, 0, 0, 0
 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
 AVX_INSTR pxor, mmx, 0, 0, 1
-AVX_INSTR rcpps, sse
+AVX_INSTR rcpps, sse, 1
 AVX_INST

[FFmpeg-devel] [PATCH 0/7] Import some x264asm patches from x264

2019-08-05 Thread James Darnley

Here are a few easy-to-import patches from x264.  These are all after x264
commit 4a158b00 "x86inc: Correctly set mmreg variables" which FFmpeg already
has (commit eb5f063e7c).

It does not include the following commits:
* 82721eae "x86inc: Add x86-32 PIC support macros"
* 101bd27d "x86inc: Support N_PEXT bit on Mach-O"

They would not apply cleanly because of existing differences between x264 and
FFmpeg.  The PIC one has a change to configure which would need remaking.

Henrik Gramner (7):
  x86inc: Fix VEX -> EVEX instruction conversion
  x86inc: Optimize VEX instruction encoding
  x86inc: Improve SAVE/LOAD_MM_PERMUTATION macros
  x86inc: Turn 'movsxd' into 'movifnidn' on x86-32
  x86inc: Make 'non-adjacent' default in the TAIL_CALL macro
  x86inc: Improve warnings for use of unsupported instructions
  x86inc: Add support for GFNI instructions

 libavutil/x86/x86inc.asm | 219 ---
 1 file changed, 161 insertions(+), 58 deletions(-)

-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/7] x86inc: Optimize VEX instruction encoding

2019-08-05 Thread James Darnley

From: Henrik Gramner 

Most VEX-encoded instructions require an additional byte to encode when src2
is a high register (e.g. x|ymm8..15). If the instruction is commutative we
can swap src1 and src2 when doing so reduces the instruction length, e.g.

vpaddw xmm0, xmm0, xmm8 -> vpaddw xmm0, xmm8, xmm0
---
 libavutil/x86/x86inc.asm | 35 +--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index bc370a6186..39cba5db09 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1244,9 +1244,40 @@ INIT_XMM
 %elif %0 >= 9
 __instr %6, %7, %8, %9
 %elif %0 == 8
-__instr %6, %7, %8
+%if avx_enabled && %5
+%xdefine __src1 %7
+%xdefine __src2 %8
+%ifnum regnumof%7
+%ifnum regnumof%8
+%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 
&& sizeof%8 <= 32
+; Most VEX-encoded instructions require an additional 
byte to encode when
+; src2 is a high register (e.g. m8..15). If the 
instruction is commutative
+; we can swap src1 and src2 when doing so reduces the 
instruction length.
+%xdefine __src1 %8
+%xdefine __src2 %7
+%endif
+%endif
+%endif
+__instr %6, __src1, __src2
+%else
+__instr %6, %7, %8
+%endif
 %elif %0 == 7
-__instr %6, %7
+%if avx_enabled && %5
+%xdefine __src1 %6
+%xdefine __src2 %7
+%ifnum regnumof%6
+%ifnum regnumof%7
+%if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 
&& sizeof%7 <= 32
+%xdefine __src1 %7
+%xdefine __src2 %6
+%endif
+%endif
+%endif
+__instr %6, __src1, __src2
+%else
+__instr %6, %7
+%endif
 %else
 __instr %6
 %endif
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 7/7] x86inc: Add support for GFNI instructions

2019-08-05 Thread James Darnley

From: Henrik Gramner 

---
 libavutil/x86/x86inc.asm | 30 +-
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index d1b4c982fc..8c8cc97e0c 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -820,19 +820,20 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, 
jge, jng, jnge, ja, jae,
 %assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
 %assign cpuflags_sse42(1<<11)| cpuflags_sse4
 %assign cpuflags_aesni(1<<12)| cpuflags_sse42
-%assign cpuflags_avx  (1<<13)| cpuflags_sse42
-%assign cpuflags_xop  (1<<14)| cpuflags_avx
-%assign cpuflags_fma4 (1<<15)| cpuflags_avx
-%assign cpuflags_fma3 (1<<16)| cpuflags_avx
-%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1
-%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2
-%assign cpuflags_avx512   (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
-
-%assign cpuflags_cache32  (1<<21)
-%assign cpuflags_cache64  (1<<22)
-%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<24)
+%assign cpuflags_gfni (1<<13)| cpuflags_sse42
+%assign cpuflags_avx  (1<<14)| cpuflags_sse42
+%assign cpuflags_xop  (1<<15)| cpuflags_avx
+%assign cpuflags_fma4 (1<<16)| cpuflags_avx
+%assign cpuflags_fma3 (1<<17)| cpuflags_avx
+%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1
+%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
+
+%assign cpuflags_cache32  (1<<22)
+%assign cpuflags_cache64  (1<<23)
+%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<25)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is 
enabled.
 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -1418,6 +1419,9 @@ AVX_INSTR divss, sse, 1, 0, 0
 AVX_INSTR dppd, sse4, 1, 1, 0
 AVX_INSTR dpps, sse4, 1, 1, 0
 AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
 AVX_INSTR haddpd, sse3, 1, 0, 0
 AVX_INSTR haddps, sse3, 1, 0, 0
 AVX_INSTR hsubpd, sse3, 1, 0, 0
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/7] x86inc: Improve SAVE/LOAD_MM_PERMUTATION macros

2019-08-05 Thread James Darnley

From: Henrik Gramner 

Use register numbers instead of copying the full register names. This makes it
possible to change register widths in the middle of a function and keep the
mmreg permutations intact which can be useful for code that only needs larger
vectors for parts of the function in combination with macros etc.

Also change the LOAD_MM_PERMUTATION macro to use the same default name as the
SAVE macro. This simplifies swapping from ymm to xmm registers or vice versa:

SAVE_MM_PERMUTATION
INIT_XMM 
LOAD_MM_PERMUTATION
---
 libavutil/x86/x86inc.asm | 23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 39cba5db09..10b7711637 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1081,19 +1081,32 @@ INIT_XMM
 %endif
 %assign %%i 0
 %rep num_mmregs
-CAT_XDEFINE %%f, %%i, m %+ %%i
+%xdefine %%tmp m %+ %%i
+CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
 %assign %%i %%i+1
 %endrep
 %endmacro
 
-%macro LOAD_MM_PERMUTATION 1 ; name to load from
-%ifdef %1_m0
+%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
+%if %0
+%xdefine %%f %1_m
+%else
+%xdefine %%f current_function %+ _m
+%endif
+%xdefine %%tmp %%f %+ 0
+%ifnum %%tmp
+RESET_MM_PERMUTATION
 %assign %%i 0
 %rep num_mmregs
-CAT_XDEFINE m, %%i, %1_m %+ %%i
-CAT_XDEFINE nn, m %+ %%i, %%i
+%xdefine %%tmp %%f %+ %%i
+CAT_XDEFINE %%m, %%i, m %+ %%tmp
 %assign %%i %%i+1
 %endrep
+%rep num_mmregs
+%assign %%i %%i-1
+CAT_XDEFINE m, %%i, %%m %+ %%i
+CAT_XDEFINE nn, m %+ %%i, %%i
+%endrep
 %endif
 %endmacro
 
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] frame: Simplify the video allocation

2018-09-03 Thread James Darnley

On 2018-09-03 15:29, James Almer wrote:
> pass 32 - 1 to both av_image_fill_pointers() calls directly?

Please do not add a magic number where nobody will find it.  Use one of
the 3 already existing methods for knowing the alignment necessary for
assembly.

If this is unrelated, my apologies.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] avformat/matroskaenc: add reserve free space option

2018-09-05 Thread James Darnley

On 2018-09-05 22:52, Sigríður Regína Sigurþórsdóttir wrote:
> +{"reserve_free_space", "Reserve a given amount of space at the
> beginning og the file for unspecified purpose."

I added the "metadata_header_padding" global option many years ago.  Can
you not reuse it for this purpose?  Is it not likely to be "metadata"
that another software might fill this with?

Also there is a typo in the bit I quoted.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] avformat/matroskaenc: add reserve free space option

2018-09-06 Thread James Darnley

On 2018-09-06 19:39, Sigríður Regína Sigurþórsdóttir wrote:
> +if (s->metadata_header_padding) {
> +if (s->metadata_header_padding == 1)
> +s->metadata_header_padding++;
> +put_ebml_void(pb, s->metadata_header_padding);
> +}

Unfortunately I was forced to make the default -1 so you want to check
that the value is greater than 0 rather than just true.

Furthermore I think you will still want to add to Changelog making a
note that the matroska muxer will now listen to metadata_header_padding.
 That may also want a micro version bump so that library users can check.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [FFmpeg-cvslog] pthread_frame: merge the functionality for normal decoder init and init_thread_copy

2020-06-03 Thread James Darnley

On 2020-04-10 16:53, Anton Khirnov wrote:
> ffmpeg | branch: master | Anton Khirnov  | Mon Jan  9 
> 18:04:42 2017 +0100| [1f4cf92cfbd3accbae582ac63126ed5570ddfd37] | committer: 
> Anton Khirnov
> 
> pthread_frame: merge the functionality for normal decoder init and 
> init_thread_copy
> 
> The current design, where
> - proper init is called for the first per-thread context
> - first thread's private data is copied into private data for all the
>   other threads
> - a "fixup" function is called for all the other threads to e.g.
>   allocate dynamically allocated data
> is very fragile and hard to follow, so it is abandoned. Instead, the
> same init function is used to init each per-thread context. Where
> necessary, AVCodecInternal.is_copy can be used to differentiate between
> the first thread and the other ones (e.g. for decoding the extradata
> just once).
> 
>> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1f4cf92cfbd3accbae582ac63126ed5570ddfd37

This commit has caused unexpected behavior in one use of the API that I
encountered.

The AVCodecContexts that are used for get_buffer2 calls have different
delay values in them.  Setting 2 threads I see the value alternating
between 0 and 1 for every call.

That constant changing value, from the point of view of the thing
reading it, is what is causing the unexpected behavior.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/3] avcodec/bitpacked: ,

2020-06-03 Thread James Darnley

On 2020-06-04 01:19, Michael Niedermayer wrote:
> Fixes: array end overread
> Fixes: 
> 22395/clusterfuzz-testcase-minimized-ffmpeg_AV_CODEC_ID_BITPACKED_fuzzer-5760940300828672
> 
> Found-by: continuous fuzzing process 
> https://github.com/google/oss-fuzz/tree/master/projects/ffmpeg
> Signed-off-by: Michael Niedermayer 
> ---
>  libavcodec/bitpacked.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/libavcodec/bitpacked.c b/libavcodec/bitpacked.c
> index be7d1e3629..952ba73a32 100644
> --- a/libavcodec/bitpacked.c
> +++ b/libavcodec/bitpacked.c
> @@ -147,7 +147,7 @@ AVCodec ff_bitpacked_decoder = {
>  .decode = bitpacked_decode,
>  .capabilities = AV_CODEC_CAP_EXPERIMENTAL,
>  .codec_tags = (const uint32_t []){
> -MKTAG('U', 'Y', 'V', 'Y')
> +MKTAG('U', 'Y', 'V', 'Y'),
>  FF_CODEC_TAGS_END,
>  },
>  };
> 

I think you should add to the commit title.  Something like "add missing
comma to codec tags".

Other than that this looks fine.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 0/2] WIP: h264, slice threads, draw_horiz_band

2019-09-02 Thread James Darnley

Trying a combination of sliced threads, chunk decoding, and draw_horiz_band we
found that it didn't work with the current master code.  Modifying the
api-h264-slice fate test showed obvious errors with grey and green blocks and
more subtle ones that looked like misplaced macroblocks.

Kieran identified the cause and coded this quick fix.  He said that essentially
the code would give a region to draw_horiz_band which could include the previous
slice even if it hadn't been finished yet.

This corrects that problem and lets us decode exactly.  However it does cause
errors decoding B-frames in chunked mode.

Needs more work.

James Darnley (1):
  avcodec/h264: enable draw_horiz_band

Kieran Kunhya (1):
  avcodec/h264: fix draw_horiz_band with slice threads

 libavcodec/h264_slice.c | 29 +++--
 libavcodec/h264dec.c|  2 +-
 2 files changed, 24 insertions(+), 7 deletions(-)

-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] avcodec/h264: enable draw_horiz_band

2019-09-02 Thread James Darnley

---
 libavcodec/h264dec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c
index 8d1bd16a8e..b9f304936c 100644
--- a/libavcodec/h264dec.c
+++ b/libavcodec/h264dec.c
@@ -1056,7 +1056,7 @@ AVCodec ff_h264_decoder = {
 .init  = h264_decode_init,
 .close = h264_decode_end,
 .decode= h264_decode_frame,
-.capabilities  = /*AV_CODEC_CAP_DRAW_HORIZ_BAND |*/ 
AV_CODEC_CAP_DR1 |
+.capabilities  = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
  AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS |
  AV_CODEC_CAP_FRAME_THREADS,
 .hw_configs= (const AVCodecHWConfigInternal*[]) {
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] avcodec/h264: fix draw_horiz_band with slice threads

2019-09-02 Thread James Darnley

From: Kieran Kunhya 

---
 libavcodec/h264_slice.c | 29 +++--
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c
index 5ceee107a0..fe2aa01ceb 100644
--- a/libavcodec/h264_slice.c
+++ b/libavcodec/h264_slice.c
@@ -2527,18 +2527,33 @@ static void predict_field_decoding_flag(const 
H264Context *h, H264SliceContext *
 /**
  * Draw edges and report progress for the last MB row.
  */
-static void decode_finish_row(const H264Context *h, H264SliceContext *sl)
+static void decode_finish_row(const H264Context *h, H264SliceContext *sl, int 
slice_end)
 {
 int top= 16 * (sl->mb_y  >> FIELD_PICTURE(h));
 int pic_height = 16 *  h->mb_height >> FIELD_PICTURE(h);
 int height =  16  << FRAME_MBAFF(h);
 int deblock_border = (16 + 4) << FRAME_MBAFF(h);
 
-if (sl->deblocking_filter) {
+/* Slice-threaded draw_horiz_band not useful in this situation */
+if (sl->deblocking_filter == 1) {
 if ((top + height) >= pic_height)
 height += deblock_border;
 top -= deblock_border;
 }
+else if (sl->deblocking_filter == 2) {
+int first_mb_y = sl->first_mb_addr / h->mb_width;
+
+/* Draw the whole slice if it's possible:
+ * - If the beginning of the slice is at the start of a row
+ * - If we are at the end of the slice
+ * Previous slice is guaranteed not to be included. */
+if (!(sl->first_mb_addr % h->mb_width)) {
+if (slice_end) {
+top = 16 * (first_mb_y >> FIELD_PICTURE(h));
+height = (16 << FRAME_MBAFF(h)) * ((sl->mb_y+1) - first_mb_y);
+}
+}
+}
 
 if (top >= pic_height || (top + height) < 0)
 return;
@@ -2549,7 +2564,8 @@ static void decode_finish_row(const H264Context *h, 
H264SliceContext *sl)
 top= 0;
 }
 
-ff_h264_draw_horiz_band(h, sl, top, height);
+if (slice_end)
+ff_h264_draw_horiz_band(h, sl, top, height);
 
 if (h->droppable || sl->h264->slice_ctx[0].er.error_occurred)
 return;
@@ -2622,7 +2638,7 @@ static int decode_slice(struct AVCodecContext *avctx, 
void *arg)
 
 for (;;) {
 // START_TIMER
-int ret, eos;
+int ret, eos, slice_end;
 if (sl->mb_x + sl->mb_y * h->mb_width >= sl->next_slice_idx) {
 av_log(h->avctx, AV_LOG_ERROR, "Slice overlaps with next at 
%d\n",
sl->next_slice_idx);
@@ -2669,10 +2685,11 @@ static int decode_slice(struct AVCodecContext *avctx, 
void *arg)
 return AVERROR_INVALIDDATA;
 }
 
+slice_end = eos || sl->mb_y >= h->mb_height;
 if (++sl->mb_x >= h->mb_width) {
 loop_filter(h, sl, lf_x_start, sl->mb_x);
 sl->mb_x = lf_x_start = 0;
-decode_finish_row(h, sl);
+decode_finish_row(h, sl, slice_end);
 ++sl->mb_y;
 if (FIELD_OR_MBAFF_PICTURE(h)) {
 ++sl->mb_y;
@@ -2729,7 +2746,7 @@ static int decode_slice(struct AVCodecContext *avctx, 
void *arg)
 if (++sl->mb_x >= h->mb_width) {
 loop_filter(h, sl, lf_x_start, sl->mb_x);
 sl->mb_x = lf_x_start = 0;
-decode_finish_row(h, sl);
+decode_finish_row(h, sl, 0);
 ++sl->mb_y;
 if (FIELD_OR_MBAFF_PICTURE(h)) {
 ++sl->mb_y;
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avutil/eval: add sgn()

2019-10-12 Thread James Darnley

On 2019-10-11 21:45, Paul B Mahol wrote:

> diff --git a/doc/utils.texi b/doc/utils.texi
> index d55dd315c3..4e2e713505 100644
> --- a/doc/utils.texi
> +++ b/doc/utils.texi
> @@ -920,6 +920,9 @@ corresponding input value will be returned.
>  @item round(expr)
>  Round the value of expression @var{expr} to the nearest integer. For 
> example, "round(1.5)" is "2.0".
>  
> +@item sgn(x)
> +Compute sign of @var{x}.
> +
>  @item sin(x)
>  Compute sine of @var{x}.
>  

Too late now but, since we have round() just above it which is 5 chars,
couldn't you have made this sign()?

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [Contract Request] for FFmpeg libmp3lame multi-threaded feature implementation

2019-11-25 Thread James Darnley

On 2019-11-25 13:52, Chandra Nakka wrote:
> Dear FFmpeg developers,
> 
> I'm very happy to have found your details on FFmpeg website for requesting
> FFmpeg feature implementation.
> 
> Currently I'm using FFmpeg command line tool on my linux servers to process
> media files into instant mp3 audio files by using FFmpeg piping feature.
> But, currently libmp3lame encoder support single thread only for encoding
> audio stream to mp3 file. This is the great drawback for my project.
> 
> I have more than 100+ linux servers for processing audio streams to mp3
> files. Each server has 8 physical CPU cores. But, due to libmp3lame single
> thread limitation my project mp3 conversion speed becomes too lazy
> and remaining cores on servers are becomes useless.
> 
> Actually I'm a web developer. I have no idea on FFmpeg tools tech
> languages.  So, I'm looking for FFmpeg developer who can implement
> libmp3lame multi-threaded feature on FFmpeg. I'm ready to pay for this
> feature.
> 
> Looking forward to hearing from you.
> 
> Thank you,
> Chandra N.

https://www.gnu.org/software/parallel/

That'll be $1, thank you.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH, v3, 1/7] lavu/pixfmt: add new pixel format 0yuv/y210/y410

2019-12-05 Thread James Darnley

On 2019-12-04 15:43, Linjie Fu wrote:
> Previously, media driver provided planar format(like 420 8 bit),
> but for HEVC Range Extension (422/444 8/10 bit), the decoded image
> is produced in packed format because Windows expects it.
> 
> Add some packed pixel formats for hardware decode support in VAAPI
> and QSV:
> 
> 4:2:2 10 bit: Y210
> 4:4:4  8 bit: 0YUV
> 4:4:4 10 bit: Y410
> 

> +[AV_PIX_FMT_Y410LE] = {
> +.name = "y410le",
> +.nb_components = 4,
> +.log2_chroma_w = 0,
> +.log2_chroma_h = 0,
> +.comp = {
> +{ 0, 32, 10, 0, 10, 31, 9, 11 },/* Y */
> +{ 0, 32,  0, 0, 10, 31, 9,  1 },/* U */
> +{ 0, 32, 20, 0, 10, 31, 9, 21 },/* V */
> +{ 0, 32, 30, 0,  2, 31, 1, 31 },/* A */
> +},
> +.flags = AV_PIX_FMT_FLAG_ALPHA | AV_PIX_FMT_FLAG_BITSTREAM,
> +},



> diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
> index d78e863..a163350 100644
> --- a/libavutil/pixfmt.h
> +++ b/libavutil/pixfmt.h
> @@ -348,6 +348,12 @@ enum AVPixelFormat {
>  AV_PIX_FMT_NV24,  ///< planar YUV 4:4:4, 24bpp, 1 plane for Y and 1 
> plane for the UV components, which are interleaved (first byte U and the 
> following byte V)
>  AV_PIX_FMT_NV42,  ///< as above, but U and V bytes are swapped
>  
> +AV_PIX_FMT_Y210BE,///< packed YUV 4:2:2, 32bpp, Y0 Cb Y1 Cr, 
> big-endian
> +AV_PIX_FMT_Y210LE,///< packed YUV 4:2:2, 32bpp, Y0 Cb Y1 Cr, 
> little-endian
> +AV_PIX_FMT_0YUV,  ///< packed YUV 4:4:4, 32bpp,  X  Y Cb Cr, 
> X=unused/undefined
> +AV_PIX_FMT_Y410LE,///< packed YUV 4:4:4, 32bpp, Cr  Y Cb  A, 
> little-endian
> +AV_PIX_FMT_Y410BE,///< packed YUV 4:4:4, 32bpp, Cr  Y Cb  A, 
> big-endian
> +
>  AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if 
> you want to link with shared libav* because the number of formats might 
> differ between versions
>  };
>  

I will ask again.  From
> http://ffmpeg.org/pipermail/ffmpeg-devel/2019-June/245929.html

> Why am I suspicious that at least one of those is a re-ordered v210?  I
> seem to recall that we rejected adding v210 to this list.  Either they
> don't belong in this list or they don't belong because libavcodec has a
> proper decoder (at least for v210).
> 
> This might be the thread I was remembering but March seems too recent
>> https://ffmpeg.org/pipermail/ffmpeg-devel/2019-March/241549.html
> 
> No real conclusion was reached there.
> 
> Do bit-packed formats belong in an AVPixelFormat?

Despite what was said last time I do believe this is packed.  I have
taken a little time to actually understand these magic number structs.

y410 is clearly packed like v210.  Look at those offsets: 0, 10, 20,
30.  Packed into a 32-bit word.  Flagged with AV_PIX_FMT_FLAG_BITSTREAM.

How is that any different to v210?  Can you address a single sample in
that 1 plane format without using shifts and bit-wise ands?  Isn't that
the definition of packed?  I do not mean interleaved.

Okay, y410 is a little better in that it is 444 so the sample order does
not change through a 6 word cycle.  Is that the key difference?



Do bit-packed formats belong in an AVPixelFormat?

If yes then I do not object to this patch or any others like this.

If no then why is this not rejected?



Does the AV_PIX_FMT_FLAG_BITSTREAM flag mean they do belong?  I admit I
haven't seen this before so maybe I should shut up and not send this email.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [IMPORTANT] FOSDEM meeting

2020-02-01 Thread James Darnley

On 28/01/2020, Liu Steven  wrote:
>
>
>> 在 2020年1月27日，下午3:29，Jean-Baptiste Kempf  写道：
>> It will be joinable through some VideoConf tool.
> Can we join by IRC or other things on internet?
> Because these days are Spring Festival (Chinese New Year, Important
> festivals that have lasted for thousands of years),
> The more important reason is New infectious virus epidemic areas here. :(

Since I don't think it was said yet: yes, there will be participation
on IRC.  At the very least I plan to be there and will relay things
to<->from #ffmpeg-meeting on freenode.

Other people are responsible for other solutions.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] What new instructions would you like?

2020-02-01 Thread James Darnley

On 30/12/2019, Lauri Kasanen  wrote:
> Hi,
>
> For the Libre RISC-V project, I'm going to research the popular codecs
> and design new instructions to help speed them up. With ffmpeg being
> home to lots of asm folks for many platforms, I also want to ask your
> opinion.
>
> What new instructions would you like? Anything particular you find
> missing in existing ISAs, slow, or cumbersome?

Do you mean SIMD instructions?  I have no idea what exists in RISC-V
already or what capabilities or limitations it has, and I am going to
use x86 language and terms such as byte, word, dword, qword.

Things I have found missing in old(er) x86 instruction sets are
missing word size and signed/unsigned variants for existing
operations.  Some operations may have byte and word variants but dword
and qword might be missing, or there might be a signed version but not
an unsigned version (and vice versa).  A couple of things I had to
emulate:
* packed absolute value of dwords
* packed maximum unsigned words
* packed max and min signed dwords (I might have really wanted
unsigned for this)
* arithmetic right shift of qwords
* pack dwords to words with unsigned saturation

Shuffle instructions.  pshufb is very useful and I think I read on IRC
that arm/aarch64/neon does not have an equivalent.  (Or was that other
shuffles?)  It allows for arbitrary reordering of bytes and setting
bytes to 0.  On x86 it takes the shuffle pattern from another SIMD
register but I usually use it with a constant pattern that gets loaded
from memory.  An interesting improvement would be if you can encode 17
* 16 (or however long your vectors might be) values in an immediate
value so it doesn't require another register.

Good documentation.  The Intel instruction manual has pretty good
explanations of what the instructions do.  The old instructions from
around the time of MMX and SSE had excellent diagrams, these might
have been mostly for shuffle operations.  I need to look and jog my
memory.  I think punpcklbw is an example of what I mean.  The entry in
the manual for it has a good diagram IMO.  (At least the version I am
currently looking at)

No stupid lane stuff.  AVX2 brought us a SIMD vector length extension
from 16 to 32 bytes.  Good except for the stupid lanes they were split
into making it hard to "mix" data from the low 0-15 bytes and the high
16-31 bytes.

I forgot about this email for a month.  Sorry about that.  Seeing
RISC-V in the schedule at FOSDEM reminded me about this.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Followup: FOSDEM meeting

2020-02-22 Thread James Darnley

On 2020-02-22 11:11, Thilo Borgmann wrote:
> Please someone put an IRC log from the meeting there, too. James Darnley?
> Also the audio was streamed, somebody might remember where too exactly. 
> Michael?

I can post my log from the day, probably email attachment.  Should I
remove any of the lines from it, particularly after the meeting
concluded?  There was a little chat afterwards and into the early evening.

I didn't record the audio but it was broadcast on Google hangouts.  I
don't know whether it records.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Followup: FOSDEM meeting

2020-02-22 Thread James Darnley

On 2020-02-22 13:25, Paul B Mahol wrote:
> On 2/22/20, James Darnley  wrote:
>> On 2020-02-22 11:11, Thilo Borgmann wrote:
>>> Please someone put an IRC log from the meeting there, too. James Darnley?
>>> Also the audio was streamed, somebody might remember where too exactly.
>>> Michael?
>>
>> I can post my log from the day, probably email attachment.  Should I
>> remove any of the lines from it, particularly after the meeting
>> concluded?  There was a little chat afterwards and into the early evening.
>>
> 
> Consor my entries.

> [Sat 22 18:00] <@durandal_1707> J_Darnley: no censoring allowed
That is more clear

Attached is the log for the entire day.  I don't think anything needs
removing so it is complete.

[14:10:59]  hello
[14:11:01]  https://hangouts.google.com/call/jYaO0pADYZELBBfsntHgAEEI
[14:11:13]  hullo
[14:11:40]  I can't invite, need op
[14:13:07]  ugh google wants me phone#
[14:13:12]  my
[14:14:23]  just use talky.io
[14:14:47]  I hope I'm showing up as muted since this UI isn't making me sure if I am or not (I should be)
[14:15:03]  Do you people hear us?
[14:15:20]  no audio so far
[14:15:34]  no
[14:15:35]  No 
[14:15:51]  I'm just following irc, not the hangout unfortunately
[14:16:10]  ok, james's video feed picked up
[14:16:24]  JEEB: with sound ?
[14:16:42]  neat
[14:16:56]  no sound still but I can just attempt to re-join
[14:17:19]  nope
[14:17:27]  ok, audio
[14:17:28]  yes
[14:17:31]  yeah
[14:17:32]  have audio
[14:20:13]  I'm in. idling with mic off
[14:26:55]  usually what you do is have a nomination committee that asks people in advance and then present the nominees
[14:27:53]  Can everybody hear?
[14:28:14]  I can hear
[14:28:20]  voting 1: 3d, vote 2: a week, so seems like the conn is working here :)
[14:28:21]  I can too
[14:28:23]  Atm we don't copy into irc what is said
[14:29:08]  (v1 was IIRC people nominated who might not otherwise show up on voting list, v2 was committees, right?)
[14:29:15]  git log --since="last 36 months" --author="name" --oneline | wc -l
[14:29:16]  yes
[14:29:18]  Jeeb: Please write short summaries about what you hear
[14:29:24]  the hangout in the topic is empty btw
[14:29:31]  (mobile phone here)
[14:29:36]  BBB: https://hangouts.google.com/call/jYaO0pADYZELBBfsntHgAEEI
[14:30:16]  cehoyos: will attempt.
[14:30:22]  git log --no-merges  --since=2020-01-25T00:00:00Z --until 2020-02-01T00:00:00Z --pretty=fuller | grep '^Author:' | sed 's/<.*//' |sort | uniq -c | sort -nr
[14:31:03]  Ty
[14:31:06]  j-b noting - CoC more like a values list as opposed to specific rules. there will be a suggestion which would then be voted on
[14:33:08]  Lynne noting - various audio decoders do checks already done avcodec common utils
[14:33:17]  (if I acught that right)
[14:33:41]  i have some difficulty understanding lynne with my headphones
[14:35:24]  michaelni: the sample rate and other checks in audio decoders that are now checked internally by the API so they should be removed
[14:35:39]  you added them, I pinged you on IRC and you didn't remove them
[14:36:07]  Lynne, i dont remember abouzt the ping but yes if there are redundant checks i should remove them
[14:36:15]  ping me again until i react!
[14:36:42]  for new joiners: since the topic is out of date if you want to join muted the URL is https://hangouts.google.com/call/jYaO0pADYZELBBfsntHgAEEI
[14:36:59]  patches would not be "lost" if we move to gitlab, for example
[14:37:32]  gitlab move: I guess main part being discussed atm being merge requests
[14:37:44]  if patches are handled by say gitlab, is it possible to subscribe via rss/atom?
[14:38:01]  I think yes, you can cehck with videolan's gitlab instance
[14:38:45]  couldn't find RSS/atom right away, but they have JSON https://code.videolan.org/videolan/x264/merge_requests.json
[14:38:50]  ugh
[14:38:56]  (just giving x264 as an example)
[14:39:02]  I keep track of mxf issues over rss
[14:39:11]  which is really handy
[14:39:21]  thardin: there are atom feeds for project activity, not sure if there's one *specific* to MRs
[14:39:27]  ah
[14:39:33]  haasn: that might be enough
[14:39:46]  rss readers typically haev filters
[14:39:57]  i dont see the problem with the existing infrastructure, so i dont see why we should move to gitlab
[14:40:05]  e.g. https://code.videolan.org/videolan/dav1d.atom
[14:40:47]  I run a gitlab instance at uni, and one thing I've found with gitlab is that it's.. a big thing. like it sometimes breaks for seemingly random reasons
[14:42:16]  yes, it's a very large ruby on rails thing, which is why I would hopefully share the system with another project, like videolan
[14:42:31]  that sounds like a good idea
[14:43:01]  I upgraded our instance when the last ubuntu lts came out, which was a bit of a chore but now I don&#

Re: [FFmpeg-devel] [PATCH] swscale/x86/yuv2rgb: Fix build without SSSE3

2020-02-23 Thread James Darnley

On 2020-02-23 13:22, Michael Niedermayer wrote:
> From: Parker Ernest <@>
> 
> commit fc6a5883d6af8cae0e96af84dda0ad74b360a084 breaks build on
> x86_64 CPUs which do not have SSSE3, e.g. AMD Phenom-II
> 
> Signed-off-by: Michael Niedermayer 
> ---
>  libswscale/x86/yuv2rgb.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
> index c12e88cbb5..4791e5b93a 100644
> --- a/libswscale/x86/yuv2rgb.c
> +++ b/libswscale/x86/yuv2rgb.c
> @@ -83,6 +83,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
>  #if HAVE_X86ASM
>  int cpu_flags = av_get_cpu_flags();
>  
> +#if HAVE_SSSE3
>  if (EXTERNAL_SSSE3(cpu_flags)) {
>  switch (c->dstFormat) {
>  case AV_PIX_FMT_RGB32:
> @@ -111,6 +112,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
>  return yuv420_rgb15_ssse3;
>  }
>  }
> +#endif
>  
>  if (EXTERNAL_MMXEXT(cpu_flags)) {
>  switch (c->dstFormat) {
> 

What?  Why doesn't the EXTERNAL_SSSE3 macro stop the code from
entering that branch?  The #if would only stop the section from being
compiled with --disable-ssse3.  A normal build would still enter that
branch on that CPU.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] Add .mailmap

2020-02-23 Thread James Darnley

On 2020-02-23 15:12, Jean-Baptiste Kempf wrote:
> Yo,
> 
> On Sat, Feb 22, 2020, at 22:18, Josh de Kock wrote:
>> This allows for easy shortlog/log parsing, useful in determining
>> eligible members of the general assembly for the new FFmpeg voting
>> system.
> 
> I think this is a good idea.
> But are you sure all of those are in the right order? (aka preferred email is 
> shown)
> 

What is "preferred email" when you have 2 roles?  My commits on the job
get obe.tv (or are supposed to) and ones made in my own time get
gmail.com (or are supposed to).

Is it: when you screw up what email should you be shouted at on?

I guess since I probably send more discussion email from gmail.com,
maybe it is that one.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] swscale/x86/yuv2rgb: Fix build without SSSE3

2020-02-23 Thread James Darnley

On 2020-02-23 18:58, Michael Niedermayer wrote:
> On Sun, Feb 23, 2020 at 05:03:36PM +0100, Carl Eugen Hoyos wrote:
>> Am So., 23. Feb. 2020 um 13:30 Uhr schrieb Michael Niedermayer
>> :
>>>
>>> From: Parker Ernest <@>
>>>
>>> commit fc6a5883d6af8cae0e96af84dda0ad74b360a084 breaks build on
>>> x86_64 CPUs which do not have SSSE3, e.g. AMD Phenom-II
>>
>> Does the commit break build on specific CPUs or specific toolchains?
> 
> I dont know what the testcase was the author encountered, i just posted
> this here as the author wanted me to post it for him.
> but a simple
> make distclean ; ./configure --disable-ssse3 && make -j32
> replicates the build failure here (see below for the errors)

Okay, it breaks the build when you do --disable-ssse3.  I see that too.

It is okay to fix that any way you want.  This patch is fine by me but
please don't imply that it fixes a run time error in the commit message,
which is what I first thought.

I see a discussion has sprung up on the best way to fix it so I guess
that has to be resolved first.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Lossy GIF encoding

2019-02-15 Thread James Darnley

On 2019-02-15 10:01, Kornel wrote:
> libavcodec/gif.c in ff_gif_encoder.pix_fmts seems to passively declare types 
> of pixel formats it accepts.

If you want to experiment you can change that so it accepts rgb (also or
only).  Then you can implement and test what you want, then you can ask
about submitting it.

You can make your fancy encoding only available with rgb, or with some
option and return an error when given pal8.

signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] Added ff_v210_planar_unpack_aligned_avx2

2019-03-04 Thread James Darnley

On 2019-03-03 15:44, Martin Vignali wrote:
> Hello,
> 
> ...
> 
> Not directly related to this patch, but it can be interesting for testing
> purpose to write a checkasm test for the v210 func decoding.
> So it's more easy to check the perf for "each" cpu flags, and be sure, the
> various width cases works as expected.

I can probably do that.  I have one for v210 unpacking in a knock-off
checkasm for another project.

I will look over/review the submitted patch first.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] Added ff_v210_planar_unpack_aligned_avx2

2019-03-04 Thread James Darnley

On 2019-03-01 18:41, Michael Stoner wrote:
> The AVX2 code leverages VPERMD to process 12 pixels/iteration.  This is my 
> first patch submission so any comments are greatly appreciated.
> 
> -Mike
> 
> Tested on Skylake (Win32 & Win64)
> 1920x1080 input frame
> =
> C code - 440 fps
> SSSE3  - 920 fps
> AVX- 930 fps
> AVX2   - 1040 fps
> 
> Regression tested at 1920x1080, 1280x720, and 352x288

>  .loop:
>  %ifidn %1, unaligned
> -movu   m0, [r0]
> +movu   m0, [r0]; yB v5 yA  u5 y9 v4  y8 u4 y7  v3 y6 
> u3  y5 v2 y4  u2 y3 v1  y2 u1 y1  v0 y0 u0
>  %else
>  mova   m0, [r0]
>  %endif

At first I didn't understand why you do so much seemingly unnecessary
work.  You don't change how the data is loaded into the register.  After more
in-depth reading I see now that you shuffle data around just so you can
store the data with a single move for each plane.  The chroma is below.

> +%if cpuflag(avx2)
> +vpermd m1, m6, m1  ; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 
> 00 u2 u1 u0
> +pshufb m1, m7  ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 
> u3 u2 u1 u0
> +movu   [r2+r4], xm1
> +vextracti128 [r3+r4], m1, 1
> +%else
>  movq   [r2+r4], m1
>  movhps [r3+r4], m1
> +%endif

Sounds commendable but I doubt the use of this many more shuffles gets
you much over a naive AVX2 version (where you treat the high half of ymm
like an unroll).

> +; for AVX2 version only
> +v210_luma_permute: dd 0,1,2,4,5,6,7,7
> +v210_chroma_permute: dd 0,1,4,5,2,3,6,7

Are you sure these can't be replaced with vpermq and its immediate
operand?  It really looks like the second could be.  It'll save you a
register.

> -mova   m3, [v210_mult]
> -mova   m4, [v210_mask]
> -mova   m5, [v210_luma_shuf]
> -mova   m6, [v210_chroma_shuf]
> +mova   m3, [v210_luma_shuf]
> +mova   m4, [v210_chroma_shuf1]
> +
> +%if cpuflag(avx2)
> +mova   m5, [v210_luma_permute]  ; VPERMD constant must be in a 
> register
> +mova   m6, [v210_chroma_permute]; VPERMD constant must be in a 
> register
> +mova   m7, [v210_chroma_shuf2]
> +%endif
> +
> +%if ARCH_X86_64
> +mova   m8, [v210_mult]
> +mova   m9, [v210_mask]
> +%endif
> +

It would let you clean this up a bit.

My suggestion is to make the diff minimal by keeping the existing uses
and if you still need more than 8 registers for avx2 then make it
available for x86-64 only.

Compare yours with the one I committed here
https://github.com/Upipe/upipe/blob/master/lib/upipe-v210/v210dec.asm#L45
which is just FFmpeg's cleaned up a little plus avx2.  I'm surprised
it's not already in FFmpeg.

You should do whatever is faster.

signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 1/2] avcodec/v210dec: move DSP function setting into dedicated function

2019-03-04 Thread James Darnley

Prepare for checkasm test.
---
 libavcodec/v210dec.c | 13 +
 libavcodec/v210dec.h |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ddc5dbe8be..28cf00d320 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -50,6 +50,14 @@ static void v210_planar_unpack_c(const uint32_t *src, 
uint16_t *y, uint16_t *u,
 }
 }
 
+av_cold void ff_v210dec_init(V210DecContext *s)
+{
+s->unpack_frame = v210_planar_unpack_c;
+s->aligned_input = 0;
+if (ARCH_X86)
+ff_v210_x86_init(s);
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
 V210DecContext *s = avctx->priv_data;
@@ -57,10 +65,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
 avctx->bits_per_raw_sample = 10;
 
-s->unpack_frame= v210_planar_unpack_c;
-
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 
 return 0;
 }
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index 533afc435c..cfdb29da09 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -31,6 +31,7 @@ typedef struct {
 void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
 } V210DecContext;
 
+void ff_v210dec_init(V210DecContext *s);
 void ff_v210_x86_init(V210DecContext *s);
 
 #endif /* AVCODEC_V210DEC_H */
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 2/2] checkasm: add test for v210dec

2019-03-04 Thread James Darnley

---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/v210dec.c  | 76 +++
 4 files changed, 81 insertions(+)
 create mode 100644 tests/checkasm/v210dec.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 47b7b06d28..70abc1a407 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
+AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9eec41e3c4..bf51e00eab 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -136,6 +136,9 @@ static const struct {
 #if CONFIG_UTVIDEO_DECODER
 { "utvideodsp", checkasm_check_utvideodsp },
 #endif
+#if CONFIG_V210_DECODER
+{ "v210dec", checkasm_check_v210dec },
+#endif
 #if CONFIG_V210_ENCODER
 { "v210enc", checkasm_check_v210enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 9e8e879fd3..9b8d2f5419 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_utvideodsp(void);
+void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
new file mode 100644
index 00..7320ed5e37
--- /dev/null
+++ b/tests/checkasm/v210dec.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+#include "checkasm.h"
+#include "libavcodec/v210dec.h"
+
+static uint32_t get_v210(void)
+{
+uint32_t t0 = rnd() & 0x3ff,
+ t1 = rnd() & 0x3ff,
+ t2 = rnd() & 0x3ff;
+uint32_t value =  t0
+   | (t1 << 10)
+   | (t2 << 20);
+return value;
+}
+
+#define NUM_SAMPLES 2048
+
+static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len)
+{
+for (int i = 0; i < len; i++) {
+uint32_t value = get_v210();
+src0[i] = value;
+src1[i] = value;
+}
+}
+
+void checkasm_check_v210dec(void)
+{
+V210DecContext h;
+
+ff_v210dec_init(&h);
+
+if (check_func(h.unpack_frame, "v210_unpack")) {
+uint32_t src0[NUM_SAMPLES/3];
+uint32_t src1[NUM_SAMPLES/3];
+uint16_t y0[NUM_SAMPLES/2];
+uint16_t y1[NUM_SAMPLES/2];
+uint16_t u0[NUM_SAMPLES/4];
+uint16_t u1[NUM_SAMPLES/4];
+uint16_t v0[NUM_SAMPLES/4];
+uint16_t v1[NUM_SAMPLES/4];
+declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
+const int pixels = NUM_SAMPLES / 2 / 6 * 6;
+
+randomize_buffers(src0, src1, NUM_SAMPLES/3);
+call_ref(src0, y0, u0, v0, pixels);
+call_new(src1, y1, u1, v1, pixels);
+if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0])
+|| memcmp(y0, y1, pixels * sizeof y0[0])
+|| memcmp(u0, u1, pixels/2 * sizeof u0[0])
+|| memcmp(v0, v1, pixels/2 * sizeof v0[0]))
+fail();
+bench_new(src1, y1, u1, v1, pixels);
+}
+report("v210_unpack");
+}
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 1/2] avcodec/v210dec: move DSP function setting into dedicated function

2019-03-06 Thread James Darnley

On 2019-03-06 10:11, Paul B Mahol wrote:
> On 3/6/19, Carl Eugen Hoyos  wrote:
>> 2019-03-04 23:58 GMT+01:00, James Darnley :
>>> Prepare for checkasm test.
>>> ---
>>>  libavcodec/v210dec.c | 13 +
>>>  libavcodec/v210dec.h |  1 +
>>>  2 files changed, 10 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
>>> index ddc5dbe8be..28cf00d320 100644
>>> --- a/libavcodec/v210dec.c
>>> +++ b/libavcodec/v210dec.c
>>> @@ -50,6 +50,14 @@ static void v210_planar_unpack_c(const uint32_t *src,
>>> uint16_t *y, uint16_t *u,
>>>  }
>>>  }
>>>
>>> +av_cold void ff_v210dec_init(V210DecContext *s)
>>> +{
>>> +s->unpack_frame = v210_planar_unpack_c;
>>
>>> +s->aligned_input = 0;
>>
>> Isn't this an unrelated change or do I misunderstand?
> 
> You misunderstand.

Maybe.

I need to initialize that member before it is used in the x86 function.
I expect valgrind or similar would catch the use.

It doesn't matter for normal use because it will be set correctly based
on the input data alignment for each frame.  Now that you mention it I
realize I forgot to change that to call the new function so I will send
a v2 later.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH] checkasm: add test for v210dec

2019-03-06 Thread James Darnley

---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/v210dec.c  | 76 +++
 4 files changed, 81 insertions(+)
 create mode 100644 tests/checkasm/v210dec.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 47b7b06d28..70abc1a407 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
+AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9eec41e3c4..bf51e00eab 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -136,6 +136,9 @@ static const struct {
 #if CONFIG_UTVIDEO_DECODER
 { "utvideodsp", checkasm_check_utvideodsp },
 #endif
+#if CONFIG_V210_DECODER
+{ "v210dec", checkasm_check_v210dec },
+#endif
 #if CONFIG_V210_ENCODER
 { "v210enc", checkasm_check_v210enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 9e8e879fd3..9b8d2f5419 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_utvideodsp(void);
+void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
new file mode 100644
index 00..7320ed5e37
--- /dev/null
+++ b/tests/checkasm/v210dec.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+#include "checkasm.h"
+#include "libavcodec/v210dec.h"
+
+static uint32_t get_v210(void)
+{
+uint32_t t0 = rnd() & 0x3ff,
+ t1 = rnd() & 0x3ff,
+ t2 = rnd() & 0x3ff;
+uint32_t value =  t0
+   | (t1 << 10)
+   | (t2 << 20);
+return value;
+}
+
+#define NUM_SAMPLES 2048
+
+static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len)
+{
+for (int i = 0; i < len; i++) {
+uint32_t value = get_v210();
+src0[i] = value;
+src1[i] = value;
+}
+}
+
+void checkasm_check_v210dec(void)
+{
+V210DecContext h;
+
+ff_v210dec_init(&h);
+
+if (check_func(h.unpack_frame, "v210_unpack")) {
+uint32_t src0[NUM_SAMPLES/3];
+uint32_t src1[NUM_SAMPLES/3];
+uint16_t y0[NUM_SAMPLES/2];
+uint16_t y1[NUM_SAMPLES/2];
+uint16_t u0[NUM_SAMPLES/4];
+uint16_t u1[NUM_SAMPLES/4];
+uint16_t v0[NUM_SAMPLES/4];
+uint16_t v1[NUM_SAMPLES/4];
+declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
+const int pixels = NUM_SAMPLES / 2 / 6 * 6;
+
+randomize_buffers(src0, src1, NUM_SAMPLES/3);
+call_ref(src0, y0, u0, v0, pixels);
+call_new(src1, y1, u1, v1, pixels);
+if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0])
+|| memcmp(y0, y1, pixels * sizeof y0[0])
+|| memcmp(u0, u1, pixels/2 * sizeof u0[0])
+|| memcmp(v0, v1, pixels/2 * sizeof v0[0]))
+fail();
+bench_new(src1, y1, u1, v1, pixels);
+}
+report("v210_unpack");
+}
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] checkasm: add test for v210dec

2019-03-06 Thread James Darnley

On 2019-03-06 20:31, James Darnley wrote:
> ...

Wrong patch and wrong reference.  Please ignore this.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH] avcodec/v210dec: move DSP function setting into dedicated function

2019-03-06 Thread James Darnley

Prepare for checkasm test.
---
 libavcodec/v210dec.c | 16 ++--
 libavcodec/v210dec.h |  1 +
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ddc5dbe8be..6db662538e 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -50,6 +50,14 @@ static void v210_planar_unpack_c(const uint32_t *src, 
uint16_t *y, uint16_t *u,
 }
 }
 
+av_cold void ff_v210dec_init(V210DecContext *s)
+{
+s->unpack_frame = v210_planar_unpack_c;
+s->aligned_input = 0;
+if (ARCH_X86)
+ff_v210_x86_init(s);
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
 V210DecContext *s = avctx->priv_data;
@@ -57,10 +65,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
 avctx->bits_per_raw_sample = 10;
 
-s->unpack_frame= v210_planar_unpack_c;
-
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 
 return 0;
 }
@@ -102,8 +107,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
 if (aligned_input != s->aligned_input) {
 s->aligned_input = aligned_input;
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 }
 
 if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index 533afc435c..cfdb29da09 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -31,6 +31,7 @@ typedef struct {
 void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
 } V210DecContext;
 
+void ff_v210dec_init(V210DecContext *s);
 void ff_v210_x86_init(V210DecContext *s);
 
 #endif /* AVCODEC_V210DEC_H */
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 2/2] checkasm: add test for v210dec

2019-03-06 Thread James Darnley

---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/v210dec.c  | 77 +++
 4 files changed, 82 insertions(+)
 create mode 100644 tests/checkasm/v210dec.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 47b7b06d28..70abc1a407 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
+AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9eec41e3c4..bf51e00eab 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -136,6 +136,9 @@ static const struct {
 #if CONFIG_UTVIDEO_DECODER
 { "utvideodsp", checkasm_check_utvideodsp },
 #endif
+#if CONFIG_V210_DECODER
+{ "v210dec", checkasm_check_v210dec },
+#endif
 #if CONFIG_V210_ENCODER
 { "v210enc", checkasm_check_v210enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 9e8e879fd3..9b8d2f5419 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_utvideodsp(void);
+void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
new file mode 100644
index 00..7dd50a8271
--- /dev/null
+++ b/tests/checkasm/v210dec.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+#include "checkasm.h"
+#include "libavcodec/v210dec.h"
+
+static uint32_t get_v210(void)
+{
+uint32_t t0 = rnd() & 0x3ff,
+ t1 = rnd() & 0x3ff,
+ t2 = rnd() & 0x3ff;
+uint32_t value =  t0
+   | (t1 << 10)
+   | (t2 << 20);
+return value;
+}
+
+#define NUM_SAMPLES 2048
+
+static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len)
+{
+for (int i = 0; i < len; i++) {
+uint32_t value = get_v210();
+src0[i] = value;
+src1[i] = value;
+}
+}
+
+void checkasm_check_v210dec(void)
+{
+V210DecContext h;
+
+h.aligned_input = 0;
+ff_v210dec_init(&h);
+
+if (check_func(h.unpack_frame, "v210_unpack")) {
+uint32_t src0[NUM_SAMPLES/3];
+uint32_t src1[NUM_SAMPLES/3];
+uint16_t y0[NUM_SAMPLES/2];
+uint16_t y1[NUM_SAMPLES/2];
+uint16_t u0[NUM_SAMPLES/4];
+uint16_t u1[NUM_SAMPLES/4];
+uint16_t v0[NUM_SAMPLES/4];
+uint16_t v1[NUM_SAMPLES/4];
+declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
+const int pixels = NUM_SAMPLES / 2 / 6 * 6;
+
+randomize_buffers(src0, src1, NUM_SAMPLES/3);
+call_ref(src0, y0, u0, v0, pixels);
+call_new(src1, y1, u1, v1, pixels);
+if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0])
+|| memcmp(y0, y1, pixels * sizeof y0[0])
+|| memcmp(u0, u1, pixels/2 * sizeof u0[0])
+|| memcmp(v0, v1, pixels/2 * sizeof v0[0]))
+fail();
+bench_new(src1, y1, u1, v1, pixels);
+}
+report("v210_unpack");
+}
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 1/2] avcodec/v210dec: move DSP function setting into dedicated function

2019-03-06 Thread James Darnley

Prepare for checkasm test.
---
 libavcodec/v210dec.c | 16 ++--
 libavcodec/v210dec.h |  1 +
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ddc5dbe8be..fd8a6b0d78 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -50,6 +50,13 @@ static void v210_planar_unpack_c(const uint32_t *src, 
uint16_t *y, uint16_t *u,
 }
 }
 
+av_cold void ff_v210dec_init(V210DecContext *s)
+{
+s->unpack_frame = v210_planar_unpack_c;
+if (ARCH_X86)
+ff_v210_x86_init(s);
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
 V210DecContext *s = avctx->priv_data;
@@ -57,10 +64,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
 avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
 avctx->bits_per_raw_sample = 10;
 
-s->unpack_frame= v210_planar_unpack_c;
-
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+s->aligned_input = 0;
+ff_v210dec_init(s);
 
 return 0;
 }
@@ -102,8 +107,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
 if (aligned_input != s->aligned_input) {
 s->aligned_input = aligned_input;
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 }
 
 if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index 533afc435c..cfdb29da09 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -31,6 +31,7 @@ typedef struct {
 void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
 } V210DecContext;
 
+void ff_v210dec_init(V210DecContext *s);
 void ff_v210_x86_init(V210DecContext *s);
 
 #endif /* AVCODEC_V210DEC_H */
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 0/5 v2] AVX functions for 8-bit H.264 IDCT

2017-04-04 Thread James Darnley

After better testing I have decided to only submit these two functions.  The
others did not provide a speedup better than the deviation in testing.  Those
patches remain in the list archive should someone wish to try them.

James Darnley (5):
  avcodec/h264: change RETs into REP_RETs where appropriate
  avcodec/h264: change some labels to be macro-local
  avcodec/h264: use some 3 operand forms
  avcodec/h264: add avx 8-bit h264_idct_add
  avcodec/h264: add avx 8-bit h264_idct_dc_add

 libavcodec/x86/h264_idct.asm  | 110 ++
 libavcodec/x86/h264dsp_init.c |   5 ++
 2 files changed, 84 insertions(+), 31 deletions(-)

-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 5/5] avcodec/h264: add avx 8-bit h264_idct_dc_add

2017-04-04 Thread James Darnley

Haswell:
 - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext

Skylake-U:
 - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext
---
 libavcodec/x86/h264_idct.asm  | 20 
 libavcodec/x86/h264dsp_init.c |  2 ++
 2 files changed, 22 insertions(+)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 24fb4d2..7fd57d3 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1158,7 +1158,27 @@ INIT_XMM avx
 movd  [%7+%8], %4
 %endmacro
 
+%macro DC_ADD_INIT 1
+add  %1d, 32
+sar  %1d, 6
+movd m0, %1d
+SPLATW   m0, m0, 0
+lea  %1, [3*stride_q]
+pxor m1, m1
+psubwm1, m0
+packuswb m0, m0
+packuswb m1, m1
+%endmacro
+
 cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
 movsxdifnidn stride_q, stride_d
 IDCT4_ADDdst_q, block_q, stride_q
 RET
+
+cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_
+movsxdifnidn stride_q, stride_d
+movsx r3d, word [block_q]
+mov   dword [block_q], 0
+DC_ADD_INIT r3
+DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 8ba085f..bf74937 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx)
 IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 8, avx)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
 IDCT_ADD_FUNC(8_dc, 8, mmxext)
 IDCT_ADD_FUNC(8_dc, 10, sse2)
@@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const 
int bit_depth,
 }
 
 c->h264_idct_add= ff_h264_idct_add_8_avx;
+c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
 }
 } else if (bit_depth == 10) {
 if (EXTERNAL_MMXEXT(cpu_flags)) {
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 1/5] avcodec/h264: change RETs into REP_RETs where appropriate

2017-04-04 Thread James Darnley

---
 libavcodec/x86/h264_idct.asm | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index c36fea5..878ff02 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -695,7 +695,7 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, 
block_offset, block, stride,
 addr0mp, gprsize
 %endif
 call h264_idct_add8_mmx_plane
-RET
+RET ; TODO: check rep ret after a function call
 
 cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, 
stride, nnzc, cntr, coeff, dst2, picreg
 ; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
@@ -727,7 +727,7 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, 
block_offset, block, str
 add r5, 4
 call h264_idct_add8_mmx_plane
 
-RET
+RET ; TODO: check rep ret after a function call
 
 h264_idct_add8_mmxext_plane:
 movsxdifnidn r3, r3d
@@ -795,7 +795,7 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, 
block_offset, block, stride,
 addr0mp, gprsize
 %endif
 call h264_idct_add8_mmxext_plane
-RET
+RET ; TODO: check rep ret after a function call
 
 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
 h264_idct_dc_add8_mmxext:
@@ -878,7 +878,7 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
 add16_sse2_cycle 5, 0x24
 add16_sse2_cycle 6, 0x1e
 add16_sse2_cycle 7, 0x26
-RET
+REP_RET
 
 %macro add16intra_sse2_cycle 2
 movzx   r0, word [r4+%2]
@@ -925,7 +925,7 @@ cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
 add16intra_sse2_cycle 5, 0x24
 add16intra_sse2_cycle 6, 0x1e
 add16intra_sse2_cycle 7, 0x26
-RET
+REP_RET
 
 %macro add8_sse2_cycle 2
 movzx   r0, word [r4+%2]
@@ -980,7 +980,7 @@ cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
 %endif
 add8_sse2_cycle 2, 0x5c
 add8_sse2_cycle 3, 0x64
-RET
+REP_RET
 
 ;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int 
qmul)
 
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 3/5] avcodec/h264: use some 3 operand forms

2017-04-04 Thread James Darnley

---
 libavcodec/x86/h264_idct.asm | 21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index dde40e9..bc4dce4 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -87,10 +87,9 @@ cglobal h264_idct_add_8, 3, 3, 0
 RET
 
 %macro IDCT8_1D 2
-mova m0, m1
-psrawm1, 1
-mova m4, m5
-psrawm4, 1
+psrawm0, m1, 1
+SWAP 0, 1
+psrawm4, m5, 1
 paddwm4, m5
 paddwm1, m0
 paddwm4, m7
@@ -107,10 +106,9 @@ cglobal h264_idct_add_8, 3, 3, 0
 psubwm0, m3
 psubwm5, m7
 
-mova m7, m1
-psrawm1, 2
-mova m3, m4
-psrawm3, 2
+psrawm7, m1, 2
+SWAP 7,1
+psrawm3, m4, 2
 paddwm3, m0
 psrawm0, 2
 paddwm1, m5
@@ -118,10 +116,9 @@ cglobal h264_idct_add_8, 3, 3, 0
 psubwm0, m4
 psubwm7, m5
 
-mova m5, m6
-psrawm6, 1
-mova m4, m2
-psrawm4, 1
+psrawm5, m6, 1
+SWAP 5,6
+psrawm4, m2, 1
 paddwm6, m2
 psubwm4, m5
 
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 4/5] avcodec/h264: add avx 8-bit h264_idct_add

2017-04-04 Thread James Darnley

Haswell:
 - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext

Skylake-U:
 - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext
---
 libavcodec/x86/h264_idct.asm  | 33 -
 libavcodec/x86/h264dsp_init.c |  3 +++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index bc4dce4..24fb4d2 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -65,7 +65,15 @@ SECTION .text
 
 IDCT4_1D  w, 0, 1, 2, 3, 4, 5
 mova m6, [pw_32]
-TRANSPOSE4x4W 0, 1, 2, 3, 4
+%if mmsize == 8
+TRANSPOSE4x4W 0, 1, 2, 3, 4
+%else
+punpcklwd m0, m1
+punpcklwd m2, m3
+SBUTTERFLY dq, 0, 2, 4
+MOVHL m1, m0
+MOVHL m3, m2
+%endif
 paddwm0, m6
 IDCT4_1D  w, 0, 1, 2, 3, 4, 5
 pxor m7, m7
@@ -1131,3 +1139,26 @@ INIT_MMX mmx
 IDCT_DC_DEQUANT 0
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
+
+INIT_XMM avx
+
+; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't 
have this yet
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+movd   %3, [%7]
+movd   %4, [%7+%8]
+psraw  %1, %6
+psraw  %2, %6
+punpcklbw  %3, %5
+punpcklbw  %4, %5
+paddw  %3, %1
+paddw  %4, %2
+packuswb   %3, %5
+packuswb   %4, %5
+movd [%7], %3
+movd  [%7+%8], %4
+%endmacro
+
+cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
+movsxdifnidn stride_q, stride_d
+IDCT4_ADDdst_q, block_q, stride_q
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 0643b37..8ba085f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## 
OPT(uint8_t *dst,\
int stride);
 
 IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
@@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const 
int bit_depth,
 c->h264_h_loop_filter_chroma   = 
ff_deblock_h_chroma422_8_avx;
 c->h264_h_loop_filter_chroma_intra = 
ff_deblock_h_chroma422_intra_8_avx;
 }
+
+c->h264_idct_add= ff_h264_idct_add_8_avx;
 }
 } else if (bit_depth == 10) {
 if (EXTERNAL_MMXEXT(cpu_flags)) {
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 2/5] avcodec/h264: change some labels to be macro-local

2017-04-04 Thread James Darnley

The labels get stripped leading to (slightly) nicer disassembly from
objdump.
---
 libavcodec/x86/h264_idct.asm | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 878ff02..dde40e9 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -846,7 +846,7 @@ h264_add8x4_idct_sse2:
 %macro add16_sse2_cycle 2
 movzx   r0, word [r4+%2]
 testr0, r0
-jz .cycle%1end
+jz %%skip
 movr0d, dword [r1+%1*8]
 %if ARCH_X86_64
 add r0, r5
@@ -854,7 +854,7 @@ h264_add8x4_idct_sse2:
 add r0, r0m
 %endif
 callh264_add8x4_idct_sse2
-.cycle%1end:
+%%skip:
 %if %1 < 7
 add r2, 64
 %endif
@@ -883,7 +883,7 @@ REP_RET
 %macro add16intra_sse2_cycle 2
 movzx   r0, word [r4+%2]
 testr0, r0
-jz .try%1dc
+jz %%trydc
 movr0d, dword [r1+%1*8]
 %if ARCH_X86_64
 add r0, r7
@@ -891,11 +891,11 @@ REP_RET
 add r0, r0m
 %endif
 callh264_add8x4_idct_sse2
-jmp .cycle%1end
-.try%1dc:
+jmp %%skip
+%%trydc:
 movsx   r0, word [r2   ]
 or r0w, word [r2+32]
-jz .cycle%1end
+jz %%skip
 movr0d, dword [r1+%1*8]
 %if ARCH_X86_64
 add r0, r7
@@ -903,7 +903,7 @@ REP_RET
 add r0, r0m
 %endif
 callh264_idct_dc_add8_mmxext
-.cycle%1end:
+%%skip:
 %if %1 < 7
 add r2, 64
 %endif
@@ -930,7 +930,7 @@ REP_RET
 %macro add8_sse2_cycle 2
 movzx   r0, word [r4+%2]
 testr0, r0
-jz .try%1dc
+jz %%trydc
 %if ARCH_X86_64
 movr0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 add r0, [r7]
@@ -940,11 +940,11 @@ REP_RET
 add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
 callh264_add8x4_idct_sse2
-jmp .cycle%1end
-.try%1dc:
+jmp %%cycle_end
+%%trydc:
 movsx   r0, word [r2   ]
 or r0w, word [r2+32]
-jz .cycle%1end
+jz %%cycle_end
 %if ARCH_X86_64
 movr0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 add r0, [r7]
@@ -954,7 +954,7 @@ REP_RET
 add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
 callh264_idct_dc_add8_mmxext
-.cycle%1end:
+%%cycle_end:
 %if %1 == 1
 add r2, 384+64
 %elif %1 < 3
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 1/5] avcodec/h264: change RETs into REP_RETs where appropriate

2017-04-05 Thread James Darnley

On 2017-04-05 05:33, James Almer wrote:
> On 4/4/2017 10:53 PM, James Darnley wrote:
>> ---
>>  libavcodec/x86/h264_idct.asm | 12 ++--
>>  1 file changed, 6 insertions(+), 6 deletions(-)
>>
>> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
>> index c36fea5..878ff02 100644
>> --- a/libavcodec/x86/h264_idct.asm
>> +++ b/libavcodec/x86/h264_idct.asm
>> @@ -695,7 +695,7 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, 
>> block_offset, block, stride,
>>  addr0mp, gprsize
>>  %endif
>>  call h264_idct_add8_mmx_plane
>> -RET
>> +RET ; TODO: check rep ret after a function call
>>  
>>  cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, 
>> block, stride, nnzc, cntr, coeff, dst2, picreg
>>  ; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
>> @@ -727,7 +727,7 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, 
>> block_offset, block, str
>>  add r5, 4
>>  call h264_idct_add8_mmx_plane
>>  
>> -RET
>> +RET ; TODO: check rep ret after a function call
>>  
>>  h264_idct_add8_mmxext_plane:
>>  movsxdifnidn r3, r3d
>> @@ -795,7 +795,7 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, 
>> block_offset, block, stride,
>>  addr0mp, gprsize
>>  %endif
>>  call h264_idct_add8_mmxext_plane
>> -RET
>> +RET ; TODO: check rep ret after a function call
>>  
>>  ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
>>  h264_idct_dc_add8_mmxext:
>> @@ -878,7 +878,7 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
>>  add16_sse2_cycle 5, 0x24
>>  add16_sse2_cycle 6, 0x1e
>>  add16_sse2_cycle 7, 0x26
>> -RET
>> +REP_RET
>>  
>>  %macro add16intra_sse2_cycle 2
>>  movzx   r0, word [r4+%2]
>> @@ -925,7 +925,7 @@ cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
>>  add16intra_sse2_cycle 5, 0x24
>>  add16intra_sse2_cycle 6, 0x1e
>>  add16intra_sse2_cycle 7, 0x26
>> -RET
>> +REP_RET
>>  
>>  %macro add8_sse2_cycle 2
>>  movzx   r0, word [r4+%2]
>> @@ -980,7 +980,7 @@ cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
>>  %endif
>>  add8_sse2_cycle 2, 0x5c
>>  add8_sse2_cycle 3, 0x64
>> -RET
>> +REP_RET
>>  
>>  ;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int 
>> qmul)
> 
> This is not necessary. Look at the RET macro in x86inc, It calls the 
> AUTO_REP_RET
> macro.

As I said last time, the macro only knows when the previous instruction
was a jump, not when ret is a branch target.  These macros contain a
jump to a label at the end which means we should use REP_RET.

The relevant doc comment:
> ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
> ; a branch or a branch target. So switch to a 2-byte form of ret in that case.
> ; We can automatically detect "follows a branch", but not a branch target.
> ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this 
> problem.)

Has x86inc gained more magic?

I know people don't care much about old CPUs, I hardly do either.  That
said I do plan to resurrect an old K8 (if I ever tidy this place) just
so I have an sse2slow machine available.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 2/5] avcodec/h264: change some labels to be macro-local

2017-04-06 Thread James Darnley

On 2017-04-05 13:41, Ronald S. Bultje wrote:
> Hi,
> 
> On Tue, Apr 4, 2017 at 9:53 PM, James Darnley  wrote:
> 
>> The labels get stripped leading to (slightly) nicer disassembly from
>> objdump.
>>
> [..]
> 
>> -jz .cycle%1end
>> +jz %%skip
> 
> 
> Can you preserve the leading dot? I don't mind the %%skip, but please make
> it .%%skip.

That makes the patch pointless because those symbols don't get stripped.
 If you want the leading dot then I will drop this patch.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 4/5] avcodec/h264: add avx 8-bit h264_idct_add

2017-04-06 Thread James Darnley

On 2017-04-05 05:44, James Almer wrote:
> On 4/4/2017 10:53 PM, James Darnley wrote:
>> Haswell:
>>  - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext
>>
>> Skylake-U:
>>  - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext
> 
> Again, you should add an SSE2 version first, then an AVX one if it's
> measurably faster than the SSE2 one.

On a Yorkfield sse2 is barely faster: 1.02x faster (728±2.1 vs. 710±3.9
decicycles).  So 1 or 2 cycles.

On a Skylake-U sse2 is most of the speedup: 1.15x faster (661±2.2 vs
573±1.9).  Then avx gains a mere 3 cycles: 547±0.5

On a Haswell sse2 provides only half the speedup:
 - sse2: 1.06x faster (525±2.5 vs 497±1.0 decicycles)
 - avx:  1.06x faster (497±1.0 vs 468±1.2 decicycles)

(All on 64-bit Linux)

On Nehalem and 64-bit Windows sse2 is slower:  0.92x faster (597±3.0 vs.
650±9.3 decicycles)

And on that note I should probably recheck the deblock patches I pushed
a little while ago.

So...  SSE2 for this function, yay or nay?

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 5/5] avcodec/h264: add avx 8-bit h264_idct_dc_add

2017-04-06 Thread James Darnley

On 2017-04-05 06:05, James Almer wrote:
> On 4/4/2017 10:53 PM, James Darnley wrote:
>> Haswell:
>>  - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext
>>
>> Skylake-U:
>>  - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext
>> ---
>>  libavcodec/x86/h264_idct.asm  | 20 
>>  libavcodec/x86/h264dsp_init.c |  2 ++
>>  2 files changed, 22 insertions(+)
>>
>> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
>> index 24fb4d2..7fd57d3 100644
>> --- a/libavcodec/x86/h264_idct.asm
>> +++ b/libavcodec/x86/h264_idct.asm
>> @@ -1158,7 +1158,27 @@ INIT_XMM avx
>>  movd  [%7+%8], %4
>>  %endmacro
>>  
>> +%macro DC_ADD_INIT 1
>> +add  %1d, 32
>> +sar  %1d, 6
>> +movd m0, %1d
>> +SPLATW   m0, m0, 0
> 
> Considering DC_ADD_MMXEXT_OP works with dwords, a single pshuflw should be
> enough. This macro calls two instructions to fill the entire XMM register,
> and there's no need for that.

Noted, I made that change but it doesn't seem to change much in terms of
performance.

> You could for that matter try to optimize DC_ADD_MMXEXT_OP a bit, combining
> said dwords with punpk* into fewer registers to reduce the amount of padd*
> and psub* needed afterwards. See ADD_RES_MMX_4_8 in hevc_add_res.asm

Noted.  Maybe in the future.

> And again, SSE2 first, AVX only if measurably faster. But since you're not
> making use of the wider XMM regs here at all, the only chips that will see
> any real speed up are those slow in mmx (like Skylake seems to be).

Yorkfield gets no benefit from sse2 (575±0.4 vs. 574±0.3 decicycles).
Haswell gets most of its benefit from sse2 (404±0.6 vs. 390±0.3 vs.
388±0.3).
Skylake-U gets all of its speedup from sse2 (533±3.0 vs 488±2.0 vs 497±1.4).

Nehalem and 64-bit also gets no benefit from sse2.

Again: SSE2 yay or nay?  Maybe I should just drop this; I'm not sure 5
cycles is worth it.

(I will now go and modify my script to divide the recorded decicycle
count by 10.)

>> +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_
  ^
Fixed this bug.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 1/5] avcodec/h264: change RETs into REP_RETs where appropriate

2017-04-14 Thread James Darnley

On 2017-04-05 22:26, Henrik Gramner wrote:
> On Wed, Apr 5, 2017 at 3:53 AM, James Darnley  wrote:
>>  call h264_idct_add8_mmx_plane
>> -RET
>> +RET ; TODO: check rep ret after a function call
> 
> call followed by RET should be replaced by the TAIL_CALL macro instead
> which outputs a jmp instruction if there's no function epilogue.

Do you want me to change this patch to add that?

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 4/5] avcodec/h264: add avx 8-bit h264_idct_add

2017-04-14 Thread James Darnley

On 2017-04-06 18:06, James Almer wrote:
> Your numbers are really confusing. Could you post the actual numbers for
> each function instead of doing comparisons?

These figures are the actual numbers!

Using the figures from Haswell above:
> ff_h264_idct_add_8_mmx  = 52 cycles
> ff_h264_idct_add_8_sse2 = 49 cycles
> ff_h264_idct_add_8_avx  = 46 cycles

Coming back to this draft I saved, I removed a fair bit of ranting and
cut it down to the essential point.

Also, I forgot about the Pentium I tested previous patches on.  I added
SSE2.  From that commit message:
> Kaby Lake Pentium:
>  - ff_h264_idct_add_8_sse2:~1.18x faster than mmxext
>  - ff_h264_idct_dc_add_8_sse2: ~1.07x faster than mmxext
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 0/6 v3] AVX functions for 8-bit H.264 IDCT

2017-04-14 Thread James Darnley

Changes:
 - Added sse2 functions
 - Fixed an incorrect xmm register count

I did not make the change suggested by Gramner about TAIL_CALL and I did leave
the TODOs there.

If there are no further objections I will push by Monday at the latest.  I want
to get this out the door.

James Darnley (6):
  avcodec/h264: change RETs into REP_RETs where appropriate
  avcodec/h264: change some labels to be macro-local
  avcodec/h264: use some 3 operand forms
  avcodec/h264: add avx 8-bit h264_idct_add
  avcodec/h264: add avx 8-bit h264_idct_dc_add
  avcodec/h264: add sse2 versions of previous idct functions

 libavcodec/x86/h264_idct.asm  | 117 +++---
 libavcodec/x86/h264dsp_init.c |  10 
 2 files changed, 96 insertions(+), 31 deletions(-)

-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 1/6] avcodec/h264: change RETs into REP_RETs where appropriate

2017-04-14 Thread James Darnley

---
 libavcodec/x86/h264_idct.asm | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index c36fea5..878ff02 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -695,7 +695,7 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, 
block_offset, block, stride,
 addr0mp, gprsize
 %endif
 call h264_idct_add8_mmx_plane
-RET
+RET ; TODO: check rep ret after a function call
 
 cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, 
stride, nnzc, cntr, coeff, dst2, picreg
 ; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
@@ -727,7 +727,7 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, 
block_offset, block, str
 add r5, 4
 call h264_idct_add8_mmx_plane
 
-RET
+RET ; TODO: check rep ret after a function call
 
 h264_idct_add8_mmxext_plane:
 movsxdifnidn r3, r3d
@@ -795,7 +795,7 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, 
block_offset, block, stride,
 addr0mp, gprsize
 %endif
 call h264_idct_add8_mmxext_plane
-RET
+RET ; TODO: check rep ret after a function call
 
 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
 h264_idct_dc_add8_mmxext:
@@ -878,7 +878,7 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
 add16_sse2_cycle 5, 0x24
 add16_sse2_cycle 6, 0x1e
 add16_sse2_cycle 7, 0x26
-RET
+REP_RET
 
 %macro add16intra_sse2_cycle 2
 movzx   r0, word [r4+%2]
@@ -925,7 +925,7 @@ cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
 add16intra_sse2_cycle 5, 0x24
 add16intra_sse2_cycle 6, 0x1e
 add16intra_sse2_cycle 7, 0x26
-RET
+REP_RET
 
 %macro add8_sse2_cycle 2
 movzx   r0, word [r4+%2]
@@ -980,7 +980,7 @@ cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
 %endif
 add8_sse2_cycle 2, 0x5c
 add8_sse2_cycle 3, 0x64
-RET
+REP_RET
 
 ;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int 
qmul)
 
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 3/6] avcodec/h264: use some 3 operand forms

2017-04-14 Thread James Darnley

---
 libavcodec/x86/h264_idct.asm | 21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index dde40e9..bc4dce4 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -87,10 +87,9 @@ cglobal h264_idct_add_8, 3, 3, 0
 RET
 
 %macro IDCT8_1D 2
-mova m0, m1
-psrawm1, 1
-mova m4, m5
-psrawm4, 1
+psrawm0, m1, 1
+SWAP 0, 1
+psrawm4, m5, 1
 paddwm4, m5
 paddwm1, m0
 paddwm4, m7
@@ -107,10 +106,9 @@ cglobal h264_idct_add_8, 3, 3, 0
 psubwm0, m3
 psubwm5, m7
 
-mova m7, m1
-psrawm1, 2
-mova m3, m4
-psrawm3, 2
+psrawm7, m1, 2
+SWAP 7,1
+psrawm3, m4, 2
 paddwm3, m0
 psrawm0, 2
 paddwm1, m5
@@ -118,10 +116,9 @@ cglobal h264_idct_add_8, 3, 3, 0
 psubwm0, m4
 psubwm7, m5
 
-mova m5, m6
-psrawm6, 1
-mova m4, m2
-psrawm4, 1
+psrawm5, m6, 1
+SWAP 5,6
+psrawm4, m2, 1
 paddwm6, m2
 psubwm4, m5
 
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 4/6] avcodec/h264: add avx 8-bit h264_idct_add

2017-04-14 Thread James Darnley

Haswell:
 - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext

Skylake-U:
 - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext
---
 libavcodec/x86/h264_idct.asm  | 33 -
 libavcodec/x86/h264dsp_init.c |  3 +++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index bc4dce4..24fb4d2 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -65,7 +65,15 @@ SECTION .text
 
 IDCT4_1D  w, 0, 1, 2, 3, 4, 5
 mova m6, [pw_32]
-TRANSPOSE4x4W 0, 1, 2, 3, 4
+%if mmsize == 8
+TRANSPOSE4x4W 0, 1, 2, 3, 4
+%else
+punpcklwd m0, m1
+punpcklwd m2, m3
+SBUTTERFLY dq, 0, 2, 4
+MOVHL m1, m0
+MOVHL m3, m2
+%endif
 paddwm0, m6
 IDCT4_1D  w, 0, 1, 2, 3, 4, 5
 pxor m7, m7
@@ -1131,3 +1139,26 @@ INIT_MMX mmx
 IDCT_DC_DEQUANT 0
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
+
+INIT_XMM avx
+
+; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't 
have this yet
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+movd   %3, [%7]
+movd   %4, [%7+%8]
+psraw  %1, %6
+psraw  %2, %6
+punpcklbw  %3, %5
+punpcklbw  %4, %5
+paddw  %3, %1
+paddw  %4, %2
+packuswb   %3, %5
+packuswb   %4, %5
+movd [%7], %3
+movd  [%7+%8], %4
+%endmacro
+
+cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
+movsxdifnidn stride_q, stride_d
+IDCT4_ADDdst_q, block_q, stride_q
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 0643b37..8ba085f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## 
OPT(uint8_t *dst,\
int stride);
 
 IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
@@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const 
int bit_depth,
 c->h264_h_loop_filter_chroma   = 
ff_deblock_h_chroma422_8_avx;
 c->h264_h_loop_filter_chroma_intra = 
ff_deblock_h_chroma422_intra_8_avx;
 }
+
+c->h264_idct_add= ff_h264_idct_add_8_avx;
 }
 } else if (bit_depth == 10) {
 if (EXTERNAL_MMXEXT(cpu_flags)) {
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 6/6] avcodec/h264: add sse2 versions of previous idct functions

2017-04-14 Thread James Darnley

Kaby Lake Pentium:
 - ff_h264_idct_add_8_sse2:~1.18x faster than mmxext
 - ff_h264_idct_dc_add_8_sse2: ~1.07x faster than mmxext
---
 libavcodec/x86/h264_idct.asm  | 11 +--
 libavcodec/x86/h264dsp_init.c |  5 +
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 43f7791..5d83d91 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1140,8 +1140,6 @@ IDCT_DC_DEQUANT 0
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
 
-INIT_XMM avx
-
 ; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't 
have this yet
 %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
 movd   %3, [%7]
@@ -1170,6 +1168,10 @@ INIT_XMM avx
 packuswb m1, m1
 %endmacro
 
+%macro IDCT_XMM 1
+
+INIT_XMM %1
+
 cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
 movsxdifnidn stride_q, stride_d
 IDCT4_ADDdst_q, block_q, stride_q
@@ -1182,3 +1184,8 @@ cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
 DC_ADD_INIT r3
 DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
 RET
+
+%endmacro
+
+IDCT_XMM sse2
+IDCT_XMM avx
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index bf74937..ce7179f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -32,9 +32,11 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## 
OPT(uint8_t *dst,\
int stride);
 
 IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 8, sse2)
 IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 8, sse2)
 IDCT_ADD_FUNC(_dc, 8, avx)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
 IDCT_ADD_FUNC(8_dc, 8, mmxext)
@@ -316,6 +318,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const 
int bit_depth,
 c->h264_h_loop_filter_chroma   = 
ff_deblock_h_chroma422_8_sse2;
 c->h264_h_loop_filter_chroma_intra = 
ff_deblock_h_chroma422_intra_8_sse2;
 }
+
+c->h264_idct_add= ff_h264_idct_add_8_sse2;
+c->h264_idct_dc_add = ff_h264_idct_dc_add_8_sse2;
 }
 if (EXTERNAL_SSSE3(cpu_flags)) {
 c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 2/6] avcodec/h264: change some labels to be macro-local

2017-04-14 Thread James Darnley

The labels get stripped leading to (slightly) nicer disassembly from
objdump.
---
 libavcodec/x86/h264_idct.asm | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 878ff02..dde40e9 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -846,7 +846,7 @@ h264_add8x4_idct_sse2:
 %macro add16_sse2_cycle 2
 movzx   r0, word [r4+%2]
 testr0, r0
-jz .cycle%1end
+jz %%skip
 movr0d, dword [r1+%1*8]
 %if ARCH_X86_64
 add r0, r5
@@ -854,7 +854,7 @@ h264_add8x4_idct_sse2:
 add r0, r0m
 %endif
 callh264_add8x4_idct_sse2
-.cycle%1end:
+%%skip:
 %if %1 < 7
 add r2, 64
 %endif
@@ -883,7 +883,7 @@ REP_RET
 %macro add16intra_sse2_cycle 2
 movzx   r0, word [r4+%2]
 testr0, r0
-jz .try%1dc
+jz %%trydc
 movr0d, dword [r1+%1*8]
 %if ARCH_X86_64
 add r0, r7
@@ -891,11 +891,11 @@ REP_RET
 add r0, r0m
 %endif
 callh264_add8x4_idct_sse2
-jmp .cycle%1end
-.try%1dc:
+jmp %%skip
+%%trydc:
 movsx   r0, word [r2   ]
 or r0w, word [r2+32]
-jz .cycle%1end
+jz %%skip
 movr0d, dword [r1+%1*8]
 %if ARCH_X86_64
 add r0, r7
@@ -903,7 +903,7 @@ REP_RET
 add r0, r0m
 %endif
 callh264_idct_dc_add8_mmxext
-.cycle%1end:
+%%skip:
 %if %1 < 7
 add r2, 64
 %endif
@@ -930,7 +930,7 @@ REP_RET
 %macro add8_sse2_cycle 2
 movzx   r0, word [r4+%2]
 testr0, r0
-jz .try%1dc
+jz %%trydc
 %if ARCH_X86_64
 movr0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 add r0, [r7]
@@ -940,11 +940,11 @@ REP_RET
 add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
 callh264_add8x4_idct_sse2
-jmp .cycle%1end
-.try%1dc:
+jmp %%cycle_end
+%%trydc:
 movsx   r0, word [r2   ]
 or r0w, word [r2+32]
-jz .cycle%1end
+jz %%cycle_end
 %if ARCH_X86_64
 movr0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 add r0, [r7]
@@ -954,7 +954,7 @@ REP_RET
 add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
 callh264_idct_dc_add8_mmxext
-.cycle%1end:
+%%cycle_end:
 %if %1 == 1
 add r2, 384+64
 %elif %1 < 3
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 5/6] avcodec/h264: add avx 8-bit h264_idct_dc_add

2017-04-14 Thread James Darnley

Haswell:
 - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext

Skylake-U:
 - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext
---
 libavcodec/x86/h264_idct.asm  | 20 
 libavcodec/x86/h264dsp_init.c |  2 ++
 2 files changed, 22 insertions(+)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 24fb4d2..43f7791 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1158,7 +1158,27 @@ INIT_XMM avx
 movd  [%7+%8], %4
 %endmacro
 
+%macro DC_ADD_INIT 1
+add  %1d, 32
+sar  %1d, 6
+movd m0, %1d
+pshuflw  m0, m0, 0
+lea  %1, [3*stride_q]
+pxor m1, m1
+psubwm1, m0
+packuswb m0, m0
+packuswb m1, m1
+%endmacro
+
 cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
 movsxdifnidn stride_q, stride_d
 IDCT4_ADDdst_q, block_q, stride_q
 RET
+
+cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
+movsxdifnidn stride_q, stride_d
+movsx r3d, word [block_q]
+mov   dword [block_q], 0
+DC_ADD_INIT r3
+DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 8ba085f..bf74937 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx)
 IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 8, avx)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
 IDCT_ADD_FUNC(8_dc, 8, mmxext)
 IDCT_ADD_FUNC(8_dc, 10, sse2)
@@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const 
int bit_depth,
 }
 
 c->h264_idct_add= ff_h264_idct_add_8_avx;
+c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
 }
 } else if (bit_depth == 10) {
 if (EXTERNAL_MMXEXT(cpu_flags)) {
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH] add Falcom Xanadu demuxer

2017-04-15 Thread James Darnley

---
 libavformat/Makefile |  1 +
 libavformat/allformats.c |  1 +
 libavformat/falcom_xa.c  | 98 
 3 files changed, 100 insertions(+)
 create mode 100644 libavformat/falcom_xa.c

diff --git a/libavformat/Makefile b/libavformat/Makefile
index 6bdfbe6789..06b6b5da57 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -158,6 +158,7 @@ OBJS-$(CONFIG_EA_DEMUXER)+= electronicarts.o
 OBJS-$(CONFIG_EAC3_DEMUXER)  += ac3dec.o rawdec.o
 OBJS-$(CONFIG_EAC3_MUXER)+= rawenc.o
 OBJS-$(CONFIG_EPAF_DEMUXER)  += epafdec.o pcm.o
+OBJS-$(CONFIG_FALCOM_XA_DEMUXER) += falcom_xa.o
 OBJS-$(CONFIG_FFM_DEMUXER)   += ffmdec.o
 OBJS-$(CONFIG_FFM_MUXER) += ffmenc.o
 OBJS-$(CONFIG_FFMETADATA_DEMUXER)+= ffmetadec.o
diff --git a/libavformat/allformats.c b/libavformat/allformats.c
index 09e62c3cfc..0c23ea9df0 100644
--- a/libavformat/allformats.c
+++ b/libavformat/allformats.c
@@ -117,6 +117,7 @@ static void register_all(void)
 REGISTER_MUXDEMUX(EAC3, eac3);
 REGISTER_DEMUXER (EPAF, epaf);
 REGISTER_MUXER   (F4V,  f4v);
+REGISTER_DEMUXER (FALCOM_XA,falcom_xa);
 REGISTER_MUXDEMUX(FFM,  ffm);
 REGISTER_MUXDEMUX(FFMETADATA,   ffmetadata);
 REGISTER_MUXER   (FIFO, fifo);
diff --git a/libavformat/falcom_xa.c b/libavformat/falcom_xa.c
new file mode 100644
index 00..4c5f32a1b6
--- /dev/null
+++ b/libavformat/falcom_xa.c
@@ -0,0 +1,98 @@
+/*
+ * Falcom Xanadu demuxer
+ * Copyright (c) 2016 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avformat.h"
+
+typedef struct FalcomXADemuxContext {
+const AVClass *class;
+unsigned frame_size;
+} FalcomXADemuxContext;
+
+static av_cold
+int falcom_xa_read_header(AVFormatContext *s)
+{
+int sample_rate, bit_depth, channels;
+AVStream *st;
+AVIOContext *pb = s->pb;
+
+avio_seek(pb, 22, SEEK_SET);
+channels = avio_rl16(pb);
+if (channels != 2)
+return AVERROR_INVALIDDATA;
+
+sample_rate = avio_rl32(pb);
+if (sample_rate != 44100)
+return AVERROR_INVALIDDATA;
+
+avio_seek(pb, 34, SEEK_SET);
+bit_depth = avio_rl16(pb);
+if (bit_depth != 16)
+return AVERROR_INVALIDDATA;
+
+avio_seek(pb, 44, SEEK_SET);
+
+st = avformat_new_stream(s, NULL);
+if (!st)
+return AVERROR(ENOMEM);
+
+st->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
+st->codecpar->codec_id = AV_CODEC_ID_ADPCM_MS;
+st->codecpar->sample_rate = sample_rate;
+st->codecpar->channels = channels;
+st->codecpar->block_align = 2048;
+
+return 0;
+}
+
+static
+int falcom_xa_read_packet(AVFormatContext *s, AVPacket *pkt)
+{
+FalcomXADemuxContext *ctx = s->priv_data;
+
+if (ctx->frame_size == 0) {
+unsigned tell = avio_tell(s->pb);
+unsigned size = avio_rl32(s->pb);
+av_log(ctx, AV_LOG_DEBUG, "read size %u (0x%x) at offset %u (0x%x)\n",
+size, size, tell, tell);
+if (size % 2048)
+return AVERROR_INVALIDDATA;
+ctx->frame_size = size;
+avio_skip(s->pb, 4); // unknown bytes
+}
+
+ctx->frame_size -= 2048;
+return av_get_packet(s->pb, pkt, 2048);
+}
+
+static const AVClass falcom_xa_demux_class = {
+.class_name = "Falcom Xanadu demuxer",
+.item_name  = av_default_item_name,
+.version= LIBAVUTIL_VERSION_INT,
+};
+
+AVInputFormat ff_falcom_xa_demuxer = {
+.name   = "falcom_xa",
+.long_name  = NULL_IF_CONFIG_SMALL("Falcom Xanadu demuxer"),
+.priv_data_size = sizeof(FalcomXADemuxContext),
+.read_header= falcom_xa_read_header,
+.read_packet= falcom_xa_read_packet,
+.priv_class = &falcom_xa_demux_class,
+};
-- 
2.12.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 2/6] avcodec/h264: change some labels to be macro-local

2017-04-15 Thread James Darnley

On 2017-04-15 14:29, Ronald S. Bultje wrote:
> Hi,
> 
> On Fri, Apr 14, 2017 at 9:46 PM, James Darnley  wrote:
> 
>> The labels get stripped leading to (slightly) nicer disassembly from
>> objdump.
>> ---
>>  libavcodec/x86/h264_idct.asm | 24 
>>  1 file changed, 12 insertions(+), 12 deletions(-)
>>
>> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
>> index 878ff02..dde40e9 100644
>> --- a/libavcodec/x86/h264_idct.asm
>> +++ b/libavcodec/x86/h264_idct.asm
>> @@ -846,7 +846,7 @@ h264_add8x4_idct_sse2:
>>  %macro add16_sse2_cycle 2
>>  movzx   r0, word [r4+%2]
>>  testr0, r0
>> -jz .cycle%1end
>> +jz %%skip
> 
> 
> So I've thought about it some more. I think I'd first need to understand
> what you're doing here and why.
> 
> It seems to me that the issue you're trying to address is that when you
> look at disassembly (in e.g. a debugger or objdump), it goes from label to
> label (where function entry is also a label), and so every function-local
> label means disassembly is cut off as a block, right? (Each block then
> represents a jump target or loop or something like that.)
> 
> And you don't like that, so you're getting rid of the labels, right?

Yes.  I didn't like that because the function I was looking at had (I
think) 16 labels showing in objdump output.

Strictly speaking, I'm not getting rid of the labels but just changing
them into a format that lets STRIP strip them.  Make will run STRIP to
strip labels that begin ..@ (if configure has determined that your STRIP
supports it).

Usually I don't have a problem with labels representing a loop (or 2
nested ones) because it makes it easy to see where the code jumps back to.

> So, if all of this is correct, then I agree that the output of tools like
> debugger/objdump is irritating. In fact, it has irritated me forever in any
> codec's DSP functions. But it also seems like we're moving away from a de
> facto convention if we don't use dot-labels anymore. If we do it for
> h264_idct, we should do it everywhere (for consistency). Is that what
> people want? Maybe we should follow convention and fix objdump to include
> all dot labels in a block if a CLI option is provided?

... I don't know what to say.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] add Falcom Xanadu demuxer

2017-04-15 Thread James Darnley

On 2017-04-15 15:36, James Darnley wrote:
> add Falcom Xanadu demuxer

I mean Xanadu Next, not the original one.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] add Falcom Xanadu demuxer

2017-04-15 Thread James Darnley

On 2017-04-15 17:56, James Almer wrote:
> On 4/15/2017 10:36 AM, James Darnley wrote:
>> ---
>>  libavformat/Makefile |  1 +
>>  libavformat/allformats.c |  1 +
>>  libavformat/falcom_xa.c  | 98 
>> 
>>  3 files changed, 100 insertions(+)
>>  create mode 100644 libavformat/falcom_xa.c
> 
> Changelog/version.h/etc before you push.

Sure.

>>
>> diff --git a/libavformat/Makefile b/libavformat/Makefile
>> index 6bdfbe6789..06b6b5da57 100644
>> --- a/libavformat/Makefile
>> +++ b/libavformat/Makefile
>> @@ -158,6 +158,7 @@ OBJS-$(CONFIG_EA_DEMUXER)+= 
>> electronicarts.o
>>  OBJS-$(CONFIG_EAC3_DEMUXER)  += ac3dec.o rawdec.o
>>  OBJS-$(CONFIG_EAC3_MUXER)+= rawenc.o
>>  OBJS-$(CONFIG_EPAF_DEMUXER)  += epafdec.o pcm.o
>> +OBJS-$(CONFIG_FALCOM_XA_DEMUXER) += falcom_xa.o
>>  OBJS-$(CONFIG_FFM_DEMUXER)   += ffmdec.o
>>  OBJS-$(CONFIG_FFM_MUXER) += ffmenc.o
>>  OBJS-$(CONFIG_FFMETADATA_DEMUXER)+= ffmetadec.o
>> diff --git a/libavformat/allformats.c b/libavformat/allformats.c
>> index 09e62c3cfc..0c23ea9df0 100644
>> --- a/libavformat/allformats.c
>> +++ b/libavformat/allformats.c
>> @@ -117,6 +117,7 @@ static void register_all(void)
>>  REGISTER_MUXDEMUX(EAC3, eac3);
>>  REGISTER_DEMUXER (EPAF, epaf);
>>  REGISTER_MUXER   (F4V,  f4v);
>> +REGISTER_DEMUXER (FALCOM_XA,falcom_xa);
> 
> Maybe just Xanadu?
> 
>>  REGISTER_MUXDEMUX(FFM,  ffm);
>>  REGISTER_MUXDEMUX(FFMETADATA,   ffmetadata);
>>  REGISTER_MUXER   (FIFO, fifo);
>> diff --git a/libavformat/falcom_xa.c b/libavformat/falcom_xa.c
>> new file mode 100644
>> index 00..4c5f32a1b6
>> --- /dev/null
>> +++ b/libavformat/falcom_xa.c
>> @@ -0,0 +1,98 @@
>> +/*
>> + * Falcom Xanadu demuxer
>> + * Copyright (c) 2016 James Darnley
> 
> 2017?

I wrote it in November 2016 and have only just remembered about it.

>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
>> USA
>> + */
>> +
>> +#include "avformat.h"
>> +
>> +typedef struct FalcomXADemuxContext {
>> +const AVClass *class;
>> +unsigned frame_size;
>> +} FalcomXADemuxContext;
>> +
>> +static av_cold
>> +int falcom_xa_read_header(AVFormatContext *s)
> 
> A probe function should be added if possible.
> 
>> +{
>> +int sample_rate, bit_depth, channels;
>> +AVStream *st;
>> +AVIOContext *pb = s->pb;
>> +
>> +avio_seek(pb, 22, SEEK_SET);
> 
> avio_skip() is more in line with most demuxers.
> 
> What are those first 22 bytes? I assume header magic (which you should
> check in a probe function) and maybe some version byte?
> If the latter, better check it here.

It is a riff or wave header with incorrect data.  I can probe for the
commonalities for all the files but I couldn't be bothered at the time.

>> +channels = avio_rl16(pb);
>> +if (channels != 2)
>> +return AVERROR_INVALIDDATA;
> 
> If there are real world files with more than two channels, you could
> instead return PATCHWELCOME and even ask for a sample.

I wonder.  I only wanted it to decode the 24 files from the game.

>> +
>> +sample_rate = avio_rl32(pb);
>> +if (sample_rate != 44100)
>> +return AVERROR_INVALIDDATA;
> 
> Same.
> 
>> +
>> +avio_seek(pb, 34, SEEK_SET);
> 
> avio_skip().
> 
>> +bit_depth = avio_rl16(pb);
>> +if (bit_depth != 16)
>> +return AVERROR_INVALIDDATA;
>> +
>> +avio_seek(pb, 44, SEEK_SET);
> 
> Same.

All noted.

>> +

[FFmpeg-devel] Why do we use `strip -wN` instead of the more general `strip -x`?

2017-05-11 Thread James Darnley

I want to discuss why we use this and argue that we should be using
`strip -x` all the time anyway.

The man page for strip says that -x removes all non-global symbols.  -wN
is a combination of -w for wildcard matching and -N to remove a given
symbol.

-wN gets ..@* as an argument.  Together they remove the symbols
generated by nasm/yasm and the x264asm layer for branch instructions
(and maybe other things).  Macro local labels generated by the
preprocessor with %%[1] are also of this form and will get removed.

Unfortunately that does not remove the common local labels we use
throughout the assembly code.  In the output object these labels get
prepended by the previous non-local label.[2]  Meaning you get
function1.label1, function1.label2, function2.label1, and function2.label2.

These remaining symbols can cause confusion for both tools and by
extension developers.  I will use the function
ff_h264_idct_add16intra_8_sse2 as an extreme example (from the file
libavcodec/x86/h264_idct.asm).  This function gets many local labels
because it uses the preprocessor to create them in a macro call.

Objdump is a tool which shows the issue quite easily.

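> 2b20 <ff_h264_idct_add16intra_8_sse2>:
> 2b3c <ff_h264_idct_add16intra_8_sse2.try0dc>:
> 2b50 <ff_h264_idct_add16intra_8_sse2.cycle0end>:
> 2b6b <ff_h264_idct_add16intra_8_sse2.try1dc>:
> 2b80 <ff_h264_idct_add16intra_8_sse2.cycle1end>:
> 2b9b <ff_h264_idct_add16intra_8_sse2.try2dc>:
> 2bb0 <ff_h264_idct_add16intra_8_sse2.cycle2end>:
> 2bcb <ff_h264_idct_add16intra_8_sse2.try3dc>:
> 2be0 <ff_h264_idct_add16intra_8_sse2.cycle3end>:
> 2bfb <ff_h264_idct_add16intra_8_sse2.try4dc>:
> 2c10 <ff_h264_idct_add16intra_8_sse2.cycle4end>:
> 2c2b <ff_h264_idct_add16intra_8_sse2.try5dc>:
> 2c40 <ff_h264_idct_add16intra_8_sse2.cycle5end>:
> 2c5b <ff_h264_idct_add16intra_8_sse2.try6dc>:
> 2c70 <ff_h264_idct_add16intra_8_sse2.cycle6end>:
> 2c8b <ff_h264_idct_add16intra_8_sse2.try7dc>:
> 2ca0 <ff_h264_idct_add16intra_8_sse2.cycle7end>: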

The disassembled function gets split between all these labels.

Perf is another tool which shows the issue.  Its recording of time spent
in a function gets split among the labels.  Useful to see at a glance if
a particular loop is exceptionally slow (if the labels are used for a
loop).  Less useful if the labels split the time into several pieces
making each one look "quick".

The worst is what I heard on IRC from, I think, Ronald.  He told me that
when he tries to disassemble a function in gdb it will only print the
top of the function until it reaches the next label.  I guess it thinks
that is another function.

It is for these reasons that I think we should be using -x all the time.  Do
we gain anything from leaving these symbols in the assembly objects?  Do
we lose something if we use -x?

I welcome your comments.

[1] http://www.nasm.us/xdoc/2.13.01/html/nasmdoc4.html#section-4.3.2
[2] http://www.nasm.us/xdoc/2.13.01/html/nasmdoc3.html#section-3.9



signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH/WIP] avcodec/x86: move simple_idct to external assembly

2017-05-12 Thread James Darnley

---

For initial review and comments.

I plan to drop the '2' from the filename before pushing.  I haven't done it yet
because I am still working on the file.  I didn't make any changes with speedup
in mind so I haven't done any benchmarking yet.

 libavcodec/x86/Makefile |   4 +-
 libavcodec/x86/simple_idct.c| 929 
 libavcodec/x86/simple_idct2.asm | 892 ++
 3 files changed, 894 insertions(+), 931 deletions(-)
 delete mode 100644 libavcodec/x86/simple_idct.c
 create mode 100644 libavcodec/x86/simple_idct2.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index d4cb27fa13..af3b50f17a 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -79,7 +79,6 @@ OBJS-$(CONFIG_WEBP_DECODER)+= x86/vp8dsp_init.o
 # GCC inline assembly optimizations
 # subsystems
 MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
-MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o
 MMX-OBJS-$(CONFIG_VC1DSP)  += x86/vc1dsp_mmx.o
 
 # decoders/encoders
@@ -128,7 +127,8 @@ YASM-OBJS-$(CONFIG_QPELDSP)+= x86/qpeldsp.o 
\
 YASM-OBJS-$(CONFIG_RV34DSP)+= x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o   \
   x86/vc1dsp_mc.o
-YASM-OBJS-$(CONFIG_IDCTDSP)+= x86/simple_idct10.o
+YASM-OBJS-$(CONFIG_IDCTDSP)+= x86/simple_idct10.o   \
+  x86/simple_idct2.o
 YASM-OBJS-$(CONFIG_VIDEODSP)   += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o  \
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
deleted file mode 100644
index 1155920ae8..00
--- a/libavcodec/x86/simple_idct.c
+++ /dev/null
@@ -1,929 +0,0 @@
-/*
- * Simple IDCT MMX
- *
- * Copyright (c) 2001, 2002 Michael Niedermayer 
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-
-#include "libavcodec/idctdsp.h"
-#include "libavcodec/x86/idctdsp.h"
-
-#include "idctdsp.h"
-#include "simple_idct.h"
-
-#if HAVE_INLINE_ASM
-
-/*
-23170.475006
-22725.260826
-21406.727617
-19265.545870
-16384.00
-12872.826198
-8866.956905
-4520.335430
-*/
-#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
-#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C6 8867  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C7 4520  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-
-#define ROW_SHIFT 11
-#define COL_SHIFT 20 // 6
-
-DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
-DECLARE_ASM_CONST(8, uint64_t, d4)= 0x0004ULL;
-
-DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
-1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
-//1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
-//0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
-1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
-// the 1 = ((1<<(COL_SHIFT-1))/C4)<
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public
+; License along with FFmpeg; if not, write to the Free Software
+; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;/
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+
+wm1010: dw 0, 0xffff, 0, 0xffff
+d4: dw 0, 4, 0, 0
+
+; 2

Re: [FFmpeg-devel] [PATCH] configure: use -x instead of -wN ..@ to strip assembly files

2017-05-16 Thread James Darnley

On 2017-05-16 13:08, Rostislav Pehlivanov wrote:
> Reduces the amount of debugging information of external asm from
> uselessly verbose to informative enough.
> ---
>  configure | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/configure b/configure
> index e4862f6a35..df849df14f 100755
> --- a/configure
> +++ b/configure
> @@ -6185,8 +6185,7 @@ enabled rpath && add_ldlibflags -Wl,-rpath,$libdir
>  test_ldflags -Wl,-Bsymbolic && append SHFLAGS -Wl,-Bsymbolic
>  
>  # add some strip flags
> -# -wN '..@*' is more selective than -x, but not available everywhere.
> -check_stripflags -wN \'..@*\' || check_stripflags -x
> +check_stripflags -x
>  
>  enabled neon_clobber_test &&
>  check_ldflags -Wl,--wrap,avcodec_open2  \
> 

Looks good.  I'm in favour of this change.  It does exactly what I was
(partly) advocating for in my email on Friday.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH/WIP] avcodec/x86: move simple_idct to external assembly

2017-05-23 Thread James Darnley

On 2017-05-18 19:13, Ronald S. Bultje wrote:
> - do you think a checkasm test makes sense? That would also make
> performance measuring easier.

The (I)DCT code seems to have its own test program in the fate-idct8x8
test.  That is built from libavcodec/tests/dct.c.  It even includes its
own benchmarking.

>> +; Simple IDCT MMX
>> +;
>> +; Copyright (c) 2001, 2002 Michael Niedermayer 
>>
> 
> Please add a line that you converted it from inline asm to x264asm syntax,
> we've done that in other places also.

Done.  I found a line in 2 other files and copied their wording.

>> +%macro DC_COND_IDCT 7
>> +movq   mm0, [blockq + %1] ; R4 R0  r4  r0
>> +movq   mm1, [blockq + %2] ; R6 R2  r6  r2
>> +movq   mm2, [blockq + %3] ; R3 R1  r3  r1
>> +movq   mm3, [blockq + %4] ; R7 R5  r7  r5
>>
> 
> Please use 4-space indentation.

I will change this along with fixing the alignment in places.

>> +%%9:
>> +; :: "r" (block), "r" (temp), "r" (coeffs)
>> +;NAMED_CONSTRAINTS_ADD(wm1010,d4)
>> +; : "%eax"
>> +%endmacro
>>
> 
> The inline asm bits (middle 3 lines) can be removed (yay!).

Thanks for reminding me about this left-over from writing.

> Rest is fine. I am assuming that the binary size will grow slightly because
> the idct is now inlined in the put/add (and duplicated between mmx/sse2),
> but no performance implication (or possibly slight improvement because of
> the inline). If that's correct, I'm OK with this.

About the size.  It shouldn't change that much.  The idct() function was
marked as inline so it should have been duplicated into the 5 other
functions.  I have copied the {ADD,PUT}_PIXELS_CLAMPED macros whereas
they used to call the functions in another asm file.  These are fairly
small, about 20 instructions or so, each.

> Are you planning to write an actual SSE2 version of the IDCT? (No pressure,
> just wondering.)

I do have that plan.  I first wrote a simple and naive (and slightly
pointless) sse2 version which just uses half the width of the xmm
registers.  Similar to what I've done elsewhere in h264.  A quick
benchmark by the program mentioned above did show that it was faster
despite needing a few pshufd to put things in the right places.  I'll
show that alongside an updated version of this patch, Coming Soon(TM).

However I am trying to work on a better sse2 version using the full
width of xmm registers (and maybe all 16 regs on x86-64 if I need to).
It is slow work and I'll probably have still more in-depth questions for
you (or anyone who can answer them) later on IRC.

P.S.  My apologies for sending this directly to you Ronald.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 2/2] reindent

2017-05-29 Thread James Darnley

---
 libavcodec/x86/idctdsp_init.c | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 1f308cc079..f1c915aa00 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -68,16 +68,16 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 c->put_pixels_clamped= ff_put_pixels_clamped_mmx;
 c->add_pixels_clamped= ff_add_pixels_clamped_mmx;
 
-if (!high_bit_depth &&
-avctx->lowres == 0 &&
-(avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
-c->idct_put  = ff_simple_idct_put_mmx;
-c->idct_add  = ff_simple_idct_add_mmx;
-c->idct  = ff_simple_idct_mmx;
-c->perm_type = FF_IDCT_PERM_SIMPLE;
-}
+if (!high_bit_depth &&
+avctx->lowres == 0 &&
+(avctx->idct_algo == FF_IDCT_AUTO ||
+avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+c->idct_put  = ff_simple_idct_put_mmx;
+c->idct_add  = ff_simple_idct_add_mmx;
+c->idct  = ff_simple_idct_mmx;
+c->perm_type = FF_IDCT_PERM_SIMPLE;
+}
 }
 
 if (EXTERNAL_SSE2(cpu_flags)) {
@@ -85,15 +85,15 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 c->put_pixels_clamped= ff_put_pixels_clamped_sse2;
 c->add_pixels_clamped= ff_add_pixels_clamped_sse2;
 
-if (!high_bit_depth &&
-avctx->lowres == 0 &&
-(avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
-c->idct_put  = ff_simple_idct_put_sse2;
-c->idct_add  = ff_simple_idct_add_sse2;
-c->perm_type = FF_IDCT_PERM_SIMPLE;
-}
+if (!high_bit_depth &&
+avctx->lowres == 0 &&
+(avctx->idct_algo == FF_IDCT_AUTO ||
+avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+c->idct_put  = ff_simple_idct_put_sse2;
+c->idct_add  = ff_simple_idct_add_sse2;
+c->perm_type = FF_IDCT_PERM_SIMPLE;
+}
 }
 
 if (ARCH_X86_64 && avctx->lowres == 0) {
-- 
2.12.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 1/2] avcodec/x86: move simple_idct to external assembly

2017-05-29 Thread James Darnley

---
Changes:
 - Changed type of d4 constant to dwords because it gets used as dwords.
 - Changed or removed HAVE_MMX_INLINE preprocessor guards.
 - Added note about conversion from inline.
 - New file no longer has "2" suffix.
 - Whitespace (indentation and alignment).

 libavcodec/tests/x86/dct.c |   2 +-
 libavcodec/x86/Makefile|   4 +-
 libavcodec/x86/idctdsp_init.c  |   4 -
 libavcodec/x86/simple_idct.asm | 889 +++
 libavcodec/x86/simple_idct.c   | 929 -
 5 files changed, 892 insertions(+), 936 deletions(-)
 create mode 100644 libavcodec/x86/simple_idct.asm
 delete mode 100644 libavcodec/x86/simple_idct.c

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index b6cdfb346c..34f5b8767b 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -67,7 +67,7 @@ static const struct algo fdct_tab_arch[] = {
 };
 
 static const struct algo idct_tab_arch[] = {
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_EXTERNAL
 { "SIMPLE-MMX",  ff_simple_idct_mmx,  FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_MMX 
},
 #endif
 #if CONFIG_MPEG4_DECODER && HAVE_YASM
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index d4cb27fa13..710e48b15f 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -79,7 +79,6 @@ OBJS-$(CONFIG_WEBP_DECODER)+= x86/vp8dsp_init.o
 # GCC inline assembly optimizations
 # subsystems
 MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
-MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o
 MMX-OBJS-$(CONFIG_VC1DSP)  += x86/vc1dsp_mmx.o
 
 # decoders/encoders
@@ -128,7 +127,8 @@ YASM-OBJS-$(CONFIG_QPELDSP)+= x86/qpeldsp.o 
\
 YASM-OBJS-$(CONFIG_RV34DSP)+= x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o   \
   x86/vc1dsp_mc.o
-YASM-OBJS-$(CONFIG_IDCTDSP)+= x86/simple_idct10.o
+YASM-OBJS-$(CONFIG_IDCTDSP)+= x86/simple_idct10.o   \
+  x86/simple_idct.o
 YASM-OBJS-$(CONFIG_VIDEODSP)   += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o  \
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index fd5ef3ff18..1f308cc079 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -68,7 +68,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 c->put_pixels_clamped= ff_put_pixels_clamped_mmx;
 c->add_pixels_clamped= ff_add_pixels_clamped_mmx;
 
-if (INLINE_MMX(cpu_flags)) {
 if (!high_bit_depth &&
 avctx->lowres == 0 &&
 (avctx->idct_algo == FF_IDCT_AUTO ||
@@ -79,7 +78,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 c->idct  = ff_simple_idct_mmx;
 c->perm_type = FF_IDCT_PERM_SIMPLE;
 }
-}
 }
 
 if (EXTERNAL_SSE2(cpu_flags)) {
@@ -87,7 +85,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 c->put_pixels_clamped= ff_put_pixels_clamped_sse2;
 c->add_pixels_clamped= ff_add_pixels_clamped_sse2;
 
-if (INLINE_SSE2(cpu_flags)) {
 if (!high_bit_depth &&
 avctx->lowres == 0 &&
 (avctx->idct_algo == FF_IDCT_AUTO ||
@@ -97,7 +94,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 c->idct_add  = ff_simple_idct_add_sse2;
 c->perm_type = FF_IDCT_PERM_SIMPLE;
 }
-}
 }
 
 if (ARCH_X86_64 && avctx->lowres == 0) {
diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
new file mode 100644
index 00..6fedbb5784
--- /dev/null
+++ b/libavcodec/x86/simple_idct.asm
@@ -0,0 +1,889 @@
+;
+; Simple IDCT MMX
+;
+; Copyright (c) 2001, 2002 Michael Niedermayer 
+;
+; Conversion from gcc syntax to x264asm syntax with minimal modifications
+; by James Darnley .
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public
+; License along with

Re: [FFmpeg-devel] [PATCH 1/2] avcodec/x86: move simple_idct to external assembly

2017-05-29 Thread James Darnley

On 2017-05-29 16:51, James Darnley wrote:
> ---
> Changes:
>  - Changed type of d4 constant to dwords because it gets used as dwords.
>  - Changed or removed HAVE_MMX_INLINE preprocessor guards.
>  - Added note about conversion from inline.
>  - New file no longer has "2" suffix.
>  - Whitespace (indentation and alignment).
> 
>  libavcodec/tests/x86/dct.c |   2 +-
>  libavcodec/x86/Makefile|   4 +-
>  libavcodec/x86/idctdsp_init.c  |   4 -
>  libavcodec/x86/simple_idct.asm | 889 +++
>  libavcodec/x86/simple_idct.c   | 929 
> -
>  5 files changed, 892 insertions(+), 936 deletions(-)
>  create mode 100644 libavcodec/x86/simple_idct.asm
>  delete mode 100644 libavcodec/x86/simple_idct.c

Ronald queried on IRC about the performance.  The libavcodec/tests/dct
utility reports these numbers

Yorkfield:
 - inline:   IDCT SIMPLE-MMX: 15715.9 kdct/s
 - external: IDCT SIMPLE-MMX: 15699.9 kdct/s

Skylake-U:
 - inline:   IDCT SIMPLE-MMX: 11193.3 kdct/s
 - external: IDCT SIMPLE-MMX: 11189.7 kdct/s

I assume those units are "thousand discrete cosine transforms per second".

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 1/2] avcodec/x86: move simple_idct to external assembly

2017-05-30 Thread James Darnley

On 2017-05-29 23:26, Michael Niedermayer wrote:
> On Mon, May 29, 2017 at 09:40:49PM +0200, James Darnley wrote:
>> On 2017-05-29 16:51, James Darnley wrote:
>>> ---
>>> Changes:
>>>  - Changed type of d4 constant to dwords because it gets used as dwords.
>>>  - Changed or removed HAVE_MMX_INLINE preprocessor guards.
>>>  - Added note about conversion from inline.
>>>  - New file no longer has "2" suffix.
>>>  - Whitespace (indentation and alignment).
>>>
>>>  libavcodec/tests/x86/dct.c |   2 +-
>>>  libavcodec/x86/Makefile|   4 +-
>>>  libavcodec/x86/idctdsp_init.c  |   4 -
>>>  libavcodec/x86/simple_idct.asm | 889 
>>> +++
>>>  libavcodec/x86/simple_idct.c   | 929 
>>> -
>>>  5 files changed, 892 insertions(+), 936 deletions(-)
>>>  create mode 100644 libavcodec/x86/simple_idct.asm
>>>  delete mode 100644 libavcodec/x86/simple_idct.c
>>
>> Ronald queried on IRC about the performance.  The libavcodec/tests/dct
>> utility reports these numbers
>>
>> Yorkfield:
>>  - inline:   IDCT SIMPLE-MMX: 15715.9 kdct/s
>>  - external: IDCT SIMPLE-MMX: 15699.9 kdct/s
>>
>> Skylake-U:
>>  - inline:   IDCT SIMPLE-MMX: 11193.3 kdct/s
>>  - external: IDCT SIMPLE-MMX: 11189.7 kdct/s
> 
> It's better to benchmark by decoding some videos as the sparseness of
> the coeffs affects speed

Ah, quite true.

Decoding a large HD sample over many runs stays close to 220fps and
187s run time both before and after the change.

I will push shortly.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 2/2] reindent

2017-05-30 Thread James Darnley

On 2017-05-29 16:51, James Darnley wrote:
> Commit message: reindent

Is this acceptable?  Should I be more verbose?

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [WIP] [PATCH 0/6] sse2/xmm version of 8-bit simple_idct

2017-06-02 Thread James Darnley

Two ideas here.

The first 3 patches alter the old mmx code so that it can use xmm registers.  It
still only uses half the available width and adds a few shuffles meaning it
isn't an ideal solution.  Though it is exact compared with the mmx version.
Seems to be moderately faster on Skylake despite the shuffles but similar speed
on Yorkfield (like some of my previous work). Possibly useful if anybody still
uses a 32-bit build on these CPUs.

The 4th patch is a bit of cleanup I did while reading and partly redoing the
10-bit simple_idct.  It uses the named registers to remove a little indirection.
Not used everywhere, yet.  It could be applied regardless of any other of these
patches.

The last 2 are an attempt to use the 10- and 12-bit macros.  I don't think it is
correct, perhaps due to rounding or due to a small difference in the
coefficients used.  Changing these causes other errors.

James Darnley (6):
  initial alignment corrections for xmm registers
  change explicit mmx register use to x264asm style
  add and fix xmm version of simple_idct
  avcodec/x86: cleanup simple_idct10
  add x86_64 8-bit simple_idct function
  change coeffs

 libavcodec/tests/x86/dct.c|5 +
 libavcodec/x86/idctdsp_init.c |   11 +
 libavcodec/x86/proresdsp.asm  |2 +-
 libavcodec/x86/simple_idct.asm| 1242 +++--
 libavcodec/x86/simple_idct.h  |4 +
 libavcodec/x86/simple_idct10.asm  |   18 +-
 libavcodec/x86/simple_idct10_template.asm |   64 +-
 7 files changed, 715 insertions(+), 631 deletions(-)

-- 
2.12.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 2/6] change explicit mmx register use to x264asm style

2017-06-02 Thread James Darnley

---
 libavcodec/x86/simple_idct.asm | 1172 
 1 file changed, 586 insertions(+), 586 deletions(-)

Picture s/mm([0-7])/m\1/g here for 1229 lines and 64695 bytes.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 1/6] initial alignment corrections for xmm registers

2017-06-02 Thread James Darnley

---
 libavcodec/x86/simple_idct.asm | 47 ++
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
index 6fedbb5784..b5d05ca653 100644
--- a/libavcodec/x86/simple_idct.asm
+++ b/libavcodec/x86/simple_idct.asm
@@ -29,8 +29,8 @@ SECTION_RODATA
 
 cextern pb_80
 
-wm1010: dw 0, 0x, 0, 0x
 d4: dd 4 << 16, 0
+wm1010: dw 0, 0x, 0, 0x
 
 ; 23170.475006
 ; 22725.260826
@@ -53,30 +53,51 @@ d4: dd 4 << 16, 0
 %define ROW_SHIFT 11
 %define COL_SHIFT 20 ; 6
 
-coeffs:
+rounding:
+dw 1 << (ROW_SHIFT - 1), 0
 dw 1 << (ROW_SHIFT - 1), 0
 dw 1 << (ROW_SHIFT - 1), 0
+dw 1 << (ROW_SHIFT - 1), 0
+
+coeffs:
+dw 1 << (ROW_SHIFT - 1), 1
+dw 1 << (ROW_SHIFT - 1), 0
 dw 1 << (ROW_SHIFT - 1), 1
 dw 1 << (ROW_SHIFT - 1), 0
 
+; coeffs + 16
 dw C4,  C4,  C4,  C4
 dw C4, -C4,  C4, -C4
 
+; coeffs + 32
 dw C2,  C6,  C2,  C6
 dw C6, -C2,  C6, -C2
 
+; coeffs + 48
 dw C1,  C3,  C1,  C3
 dw C5,  C7,  C5,  C7
 
+; coeffs + 64
 dw C3, -C7,  C3, -C7
 dw -C1, -C5, -C1, -C5
 
+; coeffs + 80
 dw C5, -C1,  C5, -C1
 dw C7,  C3,  C7,  C3
 
+; coeffs + 96
 dw C7, -C5,  C7, -C5
 dw C3, -C1,  C3, -C1
 
+; for alignment
+; coeffs + 112
+dw C3, -C1,  C3, -C1
+times 8 db 0
+
+; coeffs + 128
+dw C6, -C2,  C6, -C2
+
+
 SECTION .text
 
 %macro DC_COND_IDCT 7
@@ -103,13 +124,13 @@ SECTION .text
 pmaddwd mm1, mm6; -C2R6+C6R2 -C2r6+C6r2
 movqmm7, [coeffs + 48]  ; C3 C1  C3  C1
 pmaddwd mm7, mm2; C3R3+C1R1  C3r3+C1r1
-paddd   mm4, [coeffs + 8]
+paddd   mm4, [rounding + 16]
 movqmm6, mm4; C4R4+C4R0  C4r4+C4r0
 paddd   mm4, mm5; A0 a0
 psubd   mm6, mm5; A3 a3
 movqmm5, [coeffs + 56]  ; C7 C5  C7  C5
 pmaddwd mm5, mm3; C7R7+C5R5  C7r7+C5r5
-paddd   mm0, [coeffs + 8]
+paddd   mm0, [rounding + 16]
 paddd   mm1, mm0; A1 a1
 paddd   mm0, mm0
 psubd   mm0, mm1; A2 a2
@@ -139,7 +160,7 @@ SECTION .text
 pmaddwd mm1, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
 pmaddwd mm7, mm3; C3R7+C7R5  C3r7+C7r5
 movqmm2, mm0; A2 a2
-pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
+pmaddwd mm3, [coeffs + 112] ; -C1R7+C3R5 -C1r7+C3r5
 paddd   mm4, mm7; B2 b2
 paddd   mm2, mm4; A2+B2  a2+b2
 psubd   mm0, mm4; a2-B2  a2-b2
@@ -191,13 +212,13 @@ SECTION .text
 pmaddwd mm1, mm6; -C2R6+C6R2 -C2r6+C6r2
 movqmm7, [coeffs + 48]  ; C3 C1  C3  C1
 pmaddwd mm7, mm2; C3R3+C1R1  C3r3+C1r1
-paddd   mm4, [coeffs]
+paddd   mm4, [rounding]
 movqmm6, mm4; C4R4+C4R0  C4r4+C4r0
 paddd   mm4, mm5; A0 a0
 psubd   mm6, mm5; A3 a3
 movqmm5, [coeffs + 56]  ; C7 C5  C7  C5
 pmaddwd mm5, mm3; C7R7+C5R5  C7r7+C5r5
-paddd   mm0, [coeffs]
+paddd   mm0, [rounding]
 paddd   mm1, mm0; A1 a1
 paddd   mm0, mm0
 psubd   mm0, mm1; A2 a2
@@ -227,7 +248,7 @@ SECTION .text
 pmaddwd mm1, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
 pmaddwd mm7, mm3; C3R7+C7R5  C3r7+C7r5
 movqmm2, mm0; A2 a2
-pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
+pmaddwd mm3, [coeffs + 112] ; -C1R7+C3R5 -C1r7+C3r5
 paddd   mm4, mm7; B2 b2
 paddd   mm2, mm4; A2+B2  a2+b2
 psubd   mm0, mm4; a2-B2  a2-b2
@@ -298,7 +319,7 @@ SECTION .text
 pmaddwd mm0, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
 pmaddwd mm7, mm3; C3R7+C7R5  C3r7+C7r5
 movqmm2, mm5; A2 a2
-pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
+pmaddwd mm3, [coeffs + 112] ; -C1R7+C3R5 -C1r7+C3r5
 paddd   mm4, mm7; B2 b2
 paddd   mm2, mm4; A2+B2  a2+b2
 psubd   mm5, mm4; a2-B2  a2-b2
@@ -363,7 +384,7 @@ SECTION .text
 movqmm1, [coeffs + 88]  ; C3

[FFmpeg-devel] [PATCH 3/6] add and fix xmm version of simple_idct

2017-06-02 Thread James Darnley

---
 libavcodec/tests/x86/dct.c |  3 +++
 libavcodec/x86/idctdsp_init.c  |  1 +
 libavcodec/x86/simple_idct.asm | 45 ++
 libavcodec/x86/simple_idct.h   |  1 +
 4 files changed, 50 insertions(+)

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 34f5b8767b..97116570f4 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -97,6 +97,9 @@ static const struct algo idct_tab_arch[] = {
 #endif
 #endif
 #endif
+#if HAVE_SSE2_EXTERNAL
+{ "SIMPLE-SSE2",  ff_simple_idct_sse2,  FF_IDCT_PERM_SIMPLE, 
AV_CPU_FLAG_SSE2 },
+#endif
 { 0 }
 };
 
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index f1c915aa00..82530a5cc4 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -92,6 +92,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
 c->idct_put  = ff_simple_idct_put_sse2;
 c->idct_add  = ff_simple_idct_add_sse2;
+c->idct  = ff_simple_idct_sse2;
 c->perm_type = FF_IDCT_PERM_SIMPLE;
 }
 }
diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
index 3b62a4f9d3..a6eb42464b 100644
--- a/libavcodec/x86/simple_idct.asm
+++ b/libavcodec/x86/simple_idct.asm
@@ -151,6 +151,10 @@ SECTION .text
 psradm2, %7
 packssdw m7, m1 ; A1+B1  a1+b1   A0+B0   a0+b0
 packssdw m2, m4 ; A0-B0  a0-b0   A1-B1   a1-b1
+%if mmsize == 16
+pshufd m7, m7, 8
+pshufd m2, m2, 8
+%endif
 movq   [%5], m7
 movq m1, [blockq + %3]  ; R3 R1  r3  r1
 movq m4, [coeffs + 80]  ; -C1C5  -C1 C5
@@ -172,9 +176,15 @@ SECTION .text
 psubdm4, m3 ; a3-B3  a3-b3
 psradm6, %7
 packssdw m2, m6 ; A3+B3  a3+b3   A2+B2   a2+b2
+%if mmsize == 16
+pshufd m2, m2, 8
+%endif
 movq   [8 + %5], m2
 psradm4, %7
 packssdw m4, m0 ; A2-B2  a2-b2   A3-B3   a3-b3
+%if mmsize == 16
+pshufd m4, m4, 8
+%endif
 movq  [16 + %5], m4
 jmp %%2
 %%1:
@@ -182,6 +192,9 @@ SECTION .text
 padddm0, [d4]
 psradm0, 13
 packssdw m0, m0
+%if mmsize == 16
+pshufd m0, m0, 8
+%endif
 movq   [%5], m0
 movq   [8 + %5], m0
 movq  [16 + %5], m0
@@ -239,6 +252,10 @@ SECTION .text
 psradm2, %7
 packssdw m7, m1 ; A1+B1  a1+b1   A0+B0   a0+b0
 packssdw m2, m4 ; A0-B0  a0-b0   A1-B1   a1-b1
+%if mmsize == 16
+pshufd m7, m7, 8
+pshufd m2, m2, 8
+%endif
 movq   [%5], m7
 movq m1, [blockq + %3]  ; R3 R1  r3  r1
 movq m4, [coeffs + 80]  ; -C1C5  -C1 C5
@@ -260,9 +277,15 @@ SECTION .text
 psubdm4, m3 ; a3-B3  a3-b3
 psradm6, %7
 packssdw m2, m6 ; A3+B3  a3+b3   A2+B2   a2+b2
+%if mmsize == 16
+pshufd m2, m2, 8
+%endif
 movq   [8 + %5], m2
 psradm4, %7
 packssdw m4, m0 ; A2-B2  a2-b2   A3-B3   a3-b3
+%if mmsize == 16
+pshufd m4, m4, 8
+%endif
 movq  [16 + %5], m4
 %endmacro
 
@@ -614,9 +637,15 @@ SECTION .text
 psradm7, %6
 psradm3, %6
 packssdw m4, m7 ; A0 a0
+%if mmsize == 16
+pshufd m4, m4, q0020
+%endif
 movq   [%5], m4
 psradm0, %6
 packssdw m0, m3 ; A1 a1
+%if mmsize == 16
+pshufd m0, m0, q0020
+%endif
 movq  [16 + %5], m0
 movq  [96 + %5], m0
 movq [112 + %5], m4
@@ -624,9 +653,15 @@ SECTION .text
 psradm6, %6
 psradm2, %6
 packssdw m5, m2 ; A2-B2  a2-b2
+%if mmsize == 16
+pshufd m5, m5, q0020
+%endif
 movq  [32 + %5], m5
 psradm1, %6
 packssdw m6, m1 ; A3+B3  a3+b3
+%if mmsize == 16
+pshufd m6, m6, q0020
+%endif
 movq  [48 + %5], m6
 movq  [64 + %5], m6
 movq  [80 + %5], m5
@@ -711,9 +746,15 @@ SECTION .text
 movq m7, [coeffs + 32]  ; C6 C2  C6  C2
 psradm1, %6
 packssdw m4, m1 ; A0 a0
+%if mmsize == 16
+pshufd m4, m4, 8
+%endif
 movq   [%5], m4
 psradm2, %6
 packssdw m0, m2 ; A1 a1
+%if mmsize == 16
+pshufd m0, m0, 8
+%endif
 movq  [16 + %5], m0
 movq  [96 + %5], m0
 movq [112 + %5], m4
@@ -889,6 +930,10 @@ RET
 
 INIT_XMM sse2
 
+cglobal simple_idct, 1, 2, 8, 128, block, t0
+IDCT
+RET
+
 cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3,

[FFmpeg-devel] [PATCH 6/6] change coeffs

2017-06-02 Thread James Darnley

---
 libavcodec/x86/simple_idct10.asm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index b4b47afcee..ae848b7faf 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -46,8 +46,8 @@ times 4 dw %2, %3
 
 %define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
 %define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
-%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
-%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
+%define W3sh2 19266 ; W3 = 77062 = 19265<<2 + 2
+%define W4sh2 16383 ; W4 = 65535 = 16384<<2 - 1
 %define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
 %define W6sh2  8867 ; W6 = 35468 =  8867<<2
 %define W7sh2  4520 ; W7 = 18081 =  4520<<2 + 1
-- 
2.12.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 4/6] avcodec/x86: cleanup simple_idct10

2017-06-02 Thread James Darnley

Use named arguments for the functions so we can remove a define.  The
stride/linesize argument is now ptrdiff_t type so we no longer need to
sign extend the register.
---
 libavcodec/x86/proresdsp.asm  |  2 +-
 libavcodec/x86/simple_idct10.asm  |  8 ++--
 libavcodec/x86/simple_idct10_template.asm | 64 ++-
 3 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 16fc262aeb..8318a81c5e 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -51,7 +51,7 @@ cextern w7_min_w5
 SECTION .text
 
 %macro idct_fn 0
-cglobal prores_idct_put_10, 4, 4, 15
+cglobal prores_idct_put_10, 4, 4, 15, pixels, lsize, block, qmat
 IDCT_FNpw_1, 15, pw_88, 18, pw_4, pw_1019, r3
 RET
 %endmacro
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index 5dee533de0..7cfd33eaa3 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -68,21 +68,21 @@ CONST_DEC  w7_min_w5,W7sh2, -W5sh2
 SECTION .text
 
 %macro idct_fn 0
-cglobal simple_idct10, 1, 1, 16
+cglobal simple_idct10, 1, 1, 16, block
 IDCT_FN"", 12, "", 19
 RET
 
-cglobal simple_idct10_put, 3, 3, 16
+cglobal simple_idct10_put, 3, 3, 16, pixels, lsize, block
 IDCT_FN"", 12, "", 19, 0, pw_1023
 RET
 
-cglobal simple_idct12, 1, 1, 16
+cglobal simple_idct12, 1, 1, 16, block
 ; coeffs are already 15bits, adding the offset would cause
 ; overflow in the input
 IDCT_FN"", 15, pw_2, 16
 RET
 
-cglobal simple_idct12_put, 3, 3, 16
+cglobal simple_idct12_put, 3, 3, 16, pixels, lsize, block
 ; range isn't known, so the C simple_idct range is used
 ; Also, using a bias on input overflows, so use the bias
 ; on output of the first butterfly instead
diff --git a/libavcodec/x86/simple_idct10_template.asm 
b/libavcodec/x86/simple_idct10_template.asm
index 9d323d99b3..c0d1637ca2 100644
--- a/libavcodec/x86/simple_idct10_template.asm
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -115,18 +115,18 @@
 psubd   m3,  m9; a1[4-7] intermediate
 
 ; load/store
-mova   [COEFFS+  0], m0
-mova   [COEFFS+ 32], m2
-mova   [COEFFS+ 64], m4
-mova   [COEFFS+ 96], m6
-movam10,[COEFFS+ 16]   ; { row[1] }[0-7]
-movam8, [COEFFS+ 48]   ; { row[3] }[0-7]
-movam13,[COEFFS+ 80]   ; { row[5] }[0-7]
-movam14,[COEFFS+112]   ; { row[7] }[0-7]
-mova   [COEFFS+ 16], m1
-mova   [COEFFS+ 48], m3
-mova   [COEFFS+ 80], m5
-mova   [COEFFS+112], m7
+mova   [blockq+  0], m0
+mova   [blockq+ 32], m2
+mova   [blockq+ 64], m4
+mova   [blockq+ 96], m6
+movam10,[blockq+ 16]   ; { row[1] }[0-7]
+movam8, [blockq+ 48]   ; { row[3] }[0-7]
+movam13,[blockq+ 80]   ; { row[5] }[0-7]
+movam14,[blockq+112]   ; { row[7] }[0-7]
+mova   [blockq+ 16], m1
+mova   [blockq+ 48], m3
+mova   [blockq+ 80], m5
+mova   [blockq+112], m7
 %if %0 == 3
 pmullw  m10,[%3+ 16]
 pmullw  m8, [%3+ 48]
@@ -197,17 +197,17 @@
 ; row[5] = (a2 - b2) >> 15;
 ; row[3] = (a3 + b3) >> 15;
 ; row[4] = (a3 - b3) >> 15;
-movam8, [COEFFS+ 0]; a0[0-3]
-movam9, [COEFFS+16]; a0[4-7]
+movam8, [blockq+ 0]; a0[0-3]
+movam9, [blockq+16]; a0[4-7]
 SUMSUB_SHPK m8,  m9,  m10, m11, m0,  m1,  %2
-movam0, [COEFFS+32]; a1[0-3]
-movam1, [COEFFS+48]; a1[4-7]
+movam0, [blockq+32]; a1[0-3]
+movam1, [blockq+48]; a1[4-7]
 SUMSUB_SHPK m0,  m1,  m9,  m11, m2,  m3,  %2
-movam1, [COEFFS+64]; a2[0-3]
-movam2, [COEFFS+80]; a2[4-7]
+movam1, [blockq+64]; a2[0-3]
+movam2, [blockq+80]; a2[4-7]
 SUMSUB_SHPK m1,  m2,  m11, m3,  m4,  m5,  %2
-movam2, [COEFFS+96]; a3[0-3]
-movam3, [COEFFS+112]   ; a3[4-7]
+movam2, [blockq+96]; a3[0-3]
+movam3, [blockq+112]   ; a3[4-7]
 SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7,  %2
 %endmacro
 
@@ -223,20 +223,12 @@
 ; %7 = qmat (for prores)
 
 %macro IDCT_FN 4-7
-%if %0 == 4
-; No clamping, means pure idct
-%xdefine COEFFS r0
-%else
-movsxd  r1,  r1d
-%xdefine COEFFS r2
-%endif
-
 ; for (i = 0; i < 8; i++)
 ; idctRowCondDC(block + i*8);
-movam10,[COEFFS+ 0]; { row[0] }[0-7]
-movam8, [COEFFS+32]; { row[2] }[0-7]
-movam13,[COEFFS+64]; { row[4] }[0-7]
-movam12,[COEFFS+96]; { row[6] }[0-7]
+movam10,[blockq+ 0]; { row[0] }[0-7]
+movam8, [blockq+32]; { row[2] }[0-7]
+movam13,[blockq+64]; {

[FFmpeg-devel] [PATCH 5/6] add x86_64 8-bit simple_idct function

2017-06-02 Thread James Darnley

---
 libavcodec/tests/x86/dct.c   |  2 ++
 libavcodec/x86/idctdsp_init.c| 10 ++
 libavcodec/x86/simple_idct.h |  3 +++
 libavcodec/x86/simple_idct10.asm |  6 ++
 4 files changed, 21 insertions(+)

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 97116570f4..a9b949f2b1 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -88,10 +88,12 @@ static const struct algo idct_tab_arch[] = {
 #if HAVE_YASM
 #if ARCH_X86_64
 #if HAVE_SSE2_EXTERNAL
+{ "SIMPLE8-SSE2",   ff_simple_idct8_sse2,  FF_IDCT_PERM_TRANSPOSE, 
AV_CPU_FLAG_SSE2},
 { "SIMPLE10-SSE2",  ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, 
AV_CPU_FLAG_SSE2},
 { "SIMPLE12-SSE2",  ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, 
AV_CPU_FLAG_SSE2, 1 },
 #endif
 #if HAVE_AVX_EXTERNAL
+{ "SIMPLE8-AVX",ff_simple_idct8_avx,   FF_IDCT_PERM_TRANSPOSE, 
AV_CPU_FLAG_AVX},
 { "SIMPLE10-AVX",   ff_simple_idct10_avx,  FF_IDCT_PERM_TRANSPOSE, 
AV_CPU_FLAG_AVX},
 { "SIMPLE12-AVX",   ff_simple_idct12_avx,  FF_IDCT_PERM_TRANSPOSE, 
AV_CPU_FLAG_AVX,  1 },
 #endif
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 82530a5cc4..1e30496da0 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -95,6 +95,16 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 c->idct  = ff_simple_idct_sse2;
 c->perm_type = FF_IDCT_PERM_SIMPLE;
 }
+
+if (ARCH_X86_64 &&
+!high_bit_depth &&
+avctx->lowres == 0 &&
+(avctx->idct_algo == FF_IDCT_AUTO ||
+avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+avctx->idct_algo == FF_IDCT_SIMPLE)) {
+c->idct  = ff_simple_idct8_sse2;
+c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+}
 }
 
 if (ARCH_X86_64 && avctx->lowres == 0) {
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index b19e910372..7a26e96b60 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -30,6 +30,9 @@ void ff_simple_idct_sse2(int16_t *block);
 void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t 
*block);
 void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t 
*block);
 
+void ff_simple_idct8_sse2(int16_t *block);
+void ff_simple_idct8_avx(int16_t *block);
+
 void ff_simple_idct10_sse2(int16_t *block);
 void ff_simple_idct10_avx(int16_t *block);
 
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index 7cfd33eaa3..b4b47afcee 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -33,9 +33,11 @@ cextern pw_2
 cextern pw_16
 cextern pw_1023
 cextern pw_4095
+pd_round_11: times 4 dd 1<<(11-1)
 pd_round_12: times 4 dd 1<<(12-1)
 pd_round_15: times 4 dd 1<<(15-1)
 pd_round_19: times 4 dd 1<<(19-1)
+pd_round_20: times 4 dd 1<<(20-1)
 
 %macro CONST_DEC  3
 const %1
@@ -68,6 +70,10 @@ CONST_DEC  w7_min_w5,W7sh2, -W5sh2
 SECTION .text
 
 %macro idct_fn 0
+cglobal simple_idct8, 1, 1, 16, block
+IDCT_FN"", 11, "", 20
+RET
+
 cglobal simple_idct10, 1, 1, 16, block
 IDCT_FN"", 12, "", 19
 RET
-- 
2.12.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [WIP] [PATCH 0/6] sse2/xmm version of 8-bit simple_idct

2017-06-05 Thread James Darnley

To answer the couple of questions that were asked over the weekend.

Rostislav, about the performance.  I can see how to force a particular
IDCT implementation for real world decoding (the -idct option) but the
MPEG2 HD sample I've been working with mostly uses the "idct add"
function which doesn't exist for the functions in simple_idct10.asm.  So
as the next best thing, these are the results from the dct testing
utility over several runs.

> SIMPLE-C:  9124.8 ± 7.52
> SIMPLE-MMX:   11281.9 ± 32.67
> SIMPLE-SSE2:  15453.3 ± 78.86 (the adaption in the first 3 patches)
> SIMPLE8-SSE2: 15684.2 ± 7.52 (from simple_idct10.asm)
> SIMPLE8-AVX:  15398.4 ± 6.36 (simple_idct10.asm again)

I will try to get some real world results, eventually.

Ronald, yes.  I was thinking that the first 3 could be ignored if I can
get the latter patches to work correctly (pass fate that is).

I forgot to mention in my cover letter that although the dct test
passes, fate does not.  As I mentioned on IRC, changing the coefficients
causes errors elsewhere in fate.  I am currently looking into this problem
and I'm sure I will speak to you or others about it.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 0/5] x264asm: take some patches from upstream

2017-06-08 Thread James Darnley

Incorporate some of the recent changes committed to x264.  This is an initial
set with no controversial changes: no nasm requirement, no avx512.

I do want your comments on where I should put the aesni define in the last
patch.  I will make a note on that one too.  I will attempt to upstream that
define so that our differences remain small.

Anton Mitrofanov (1):
  x86inc: Remove argument from WIN64_RESTORE_XMM

Henrik Gramner (4):
  x86inc: Fix call with memory operands
  x86inc: Make REP_RET identical to RET in SSSE3+ functions
  x86inc: Prefer r14/r15 over r12/r13 on x86-64
  x86: Add some additional cpuflag relations

 libavutil/x86/x86inc.asm | 81 +---
 1 file changed, 43 insertions(+), 38 deletions(-)

-- 
2.13.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 2/5] x86inc: Make REP_RET identical to RET in SSSE3+ functions

2017-06-08 Thread James Darnley

From: Henrik Gramner 

There's no point in emitting a rep prefix before ret on modern CPUs.
---
 libavutil/x86/x86inc.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index f2a6a3f1db..44069741cc 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -636,7 +636,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 ; We can automatically detect "follows a branch", but not a branch target.
 ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this 
problem.)
 %macro REP_RET 0
-%if has_epilogue
+%if has_epilogue || cpuflag(ssse3)
 RET
 %else
 rep ret
-- 
2.13.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 5/5] x86: Add some additional cpuflag relations

2017-06-08 Thread James Darnley

From: Henrik Gramner 

Simplifies writing assembly code that depends on available instructions.

LZCNT implies SSE2
BMI1 implies AVX+LZCNT
AVX2 implies BMI2
---
This is the patch I was talking about.  Where should I put the aesni define?
x264 doesn't have it but I will try to get it upstreamed.

 libavutil/x86/x86inc.asm | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 2a13ca957e..acda0e0b4e 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -788,25 +788,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, 
jge, jng, jnge, ja, jae,
 %assign cpuflags_sse  (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2 (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
-%assign cpuflags_ssse3(1<<8) | cpuflags_sse3
-%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
-%assign cpuflags_sse42(1<<10)| cpuflags_sse4
-%assign cpuflags_avx  (1<<11)| cpuflags_sse42
-%assign cpuflags_xop  (1<<12)| cpuflags_avx
-%assign cpuflags_fma4 (1<<13)| cpuflags_avx
-%assign cpuflags_fma3 (1<<14)| cpuflags_avx
-%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
-
-%assign cpuflags_cache32  (1<<16)
-%assign cpuflags_cache64  (1<<17)
-%assign cpuflags_slowctz  (1<<18)
-%assign cpuflags_lzcnt(1<<19)
-%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<21)
-%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
-%assign cpuflags_aesni(1<<24)|cpuflags_sse42
+%assign cpuflags_lzcnt(1<<7) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3(1<<9) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42(1<<11)| cpuflags_sse4
+%assign cpuflags_avx  (1<<12)| cpuflags_sse42
+%assign cpuflags_xop  (1<<13)| cpuflags_avx
+%assign cpuflags_fma4 (1<<14)| cpuflags_avx
+%assign cpuflags_fma3 (1<<15)| cpuflags_avx
+%assign cpuflags_bmi1 (1<<16)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<17)| cpuflags_bmi1
+%assign cpuflags_avx2 (1<<18)| cpuflags_fma3|cpuflags_bmi2
+
+%assign cpuflags_cache32  (1<<19)
+%assign cpuflags_cache64  (1<<20)
+%assign cpuflags_slowctz  (1<<21)
+%assign cpuflags_aligned  (1<<22) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<23)
+%assign cpuflags_aesni(1<<24)| cpuflags_sse42
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is 
enabled.
 %definecpuflag(x) (cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 
1) >> 31) & 1)
-- 
2.13.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 3/5] x86inc: Prefer r14/r15 over r12/r13 on x86-64

2017-06-08 Thread James Darnley

From: Henrik Gramner 

Due to a peculiarity in the ModR/M addressing encoding, the r12 and r13
registers sometimes require an additional byte when used as a base register.

r14 and r15 don't have that issue, so prefer using them.
---
 libavutil/x86/x86inc.asm | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 44069741cc..65853f72cd 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -426,10 +426,10 @@ DECLARE_REG 7,  rdi, 64
 DECLARE_REG 8,  rsi, 72
 DECLARE_REG 9,  rbx, 80
 DECLARE_REG 10, rbp, 88
-DECLARE_REG 11, R12, 96
-DECLARE_REG 12, R13, 104
-DECLARE_REG 13, R14, 112
-DECLARE_REG 14, R15, 120
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
 
 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
 %assign num_args %1
@@ -530,10 +530,10 @@ DECLARE_REG 7,  R10, 16
 DECLARE_REG 8,  R11, 24
 DECLARE_REG 9,  rbx, 32
 DECLARE_REG 10, rbp, 40
-DECLARE_REG 11, R12, 48
-DECLARE_REG 12, R13, 56
-DECLARE_REG 13, R14, 64
-DECLARE_REG 14, R15, 72
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
 
 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
 %assign num_args %1
-- 
2.13.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 1/5] x86inc: Fix call with memory operands

2017-06-08 Thread James Darnley

From: Henrik Gramner 

We overload the `call` instruction with a macro, but it would misbehave when
the macro argument wasn't a valid identifier. Fix it by explicitly checking
if the argument is an identifier.
---
 libavutil/x86/x86inc.asm | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 128ddc1089..f2a6a3f1db 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1037,7 +1037,11 @@ INIT_XMM
 
 ; Append cpuflags to the callee's name iff the appended name is known and the 
plain name isn't
 %macro call 1
-call_internal %1 %+ SUFFIX, %1
+%ifid %1
+call_internal %1 %+ SUFFIX, %1
+%else
+call %1
+%endif
 %endmacro
 %macro call_internal 2
 %xdefine %%i %2
-- 
2.13.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 4/5] x86inc: Remove argument from WIN64_RESTORE_XMM

2017-06-08 Thread James Darnley

From: Anton Mitrofanov 

The use of rsp was pretty much hardcoded there and probably didn't work
otherwise with stack_size > 0.
---
 libavutil/x86/x86inc.asm | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 65853f72cd..2a13ca957e 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -475,41 +475,42 @@ DECLARE_REG 14, R13, 120
 WIN64_PUSH_XMM
 %endmacro
 
-%macro WIN64_RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 0
 %assign %%pad_size 0
 %if xmm_regs_used > 8
 %assign %%i xmm_regs_used
 %rep xmm_regs_used-8
 %assign %%i %%i-1
-movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
 %endrep
 %endif
 %if stack_size_padded > 0
 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
 mov rsp, rstkm
 %else
-add %1, stack_size_padded
+add rsp, stack_size_padded
 %assign %%pad_size stack_size_padded
 %endif
 %endif
 %if xmm_regs_used > 7
-movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
 %endif
 %if xmm_regs_used > 6
-movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
 %endif
 %endmacro
 
-%macro WIN64_RESTORE_XMM 1
-WIN64_RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 0
+WIN64_RESTORE_XMM_INTERNAL
 %assign stack_offset (stack_offset-stack_size_padded)
+%assign stack_size_padded 0
 %assign xmm_regs_used 0
 %endmacro
 
 %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || 
stack_size > 0
 
 %macro RET 0
-WIN64_RESTORE_XMM_INTERNAL rsp
+WIN64_RESTORE_XMM_INTERNAL
 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
 %if mmsize == 32
 vzeroupper
@@ -625,7 +626,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %if WIN64 == 0
 %macro WIN64_SPILL_XMM 1
 %endmacro
-%macro WIN64_RESTORE_XMM 1
+%macro WIN64_RESTORE_XMM 0
 %endmacro
 %macro WIN64_PUSH_XMM 0
 %endmacro
-- 
2.13.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 5/5] x86: Add some additional cpuflag relations

2017-06-09 Thread James Darnley

On 2017-06-09 10:08, Henrik Gramner wrote:
> On Fri, Jun 9, 2017 at 1:05 AM, James Darnley  wrote:
>> Where should I put the aesni define?
> 
> Between sse42 and avx.

Thank you.  I will change this and the first patch to bump the date.
I'll give other people about an hour to make other comments.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 1/1] configure: require NASM version 2.11 or newer for external x86 assembly

2017-06-09 Thread James Darnley

---
 configure | 17 -
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/configure b/configure
index e3941f9dfd..69bbf25bf5 100755
--- a/configure
+++ b/configure
@@ -3258,7 +3258,7 @@ pkg_config_default=pkg-config
 ranlib_default="ranlib"
 strip_default="strip"
 version_script='--version-script'
-yasmexe_default="yasm"
+yasmexe_default="nasm"
 windres_default="windres"
 nvcc_default="nvcc"
 nvccflags_default="-gencode arch=compute_30,code=sm_30 -O2"
@@ -5476,11 +5476,9 @@ EOF
 enabled mmxext && check_inline_asm mmxext_inline '"pmaxub %mm0, %mm1"'
 
 if ! disabled_any asm mmx yasm; then
-if check_cmd $yasmexe --version; then
-enabled x86_64 && yasm_extra="-m amd64"
-yasm_debug="-g dwarf2"
-elif check_cmd nasm -v; then
-yasmexe=nasm
+check_yasm "vdivpd zmm0{k1}{z}, zmm1, zmm3" ||
+die "nasm not found or too old, version 2.11 is now required. Use --disable-yasm for a crippled build."
+
 yasm_debug="-g -F dwarf"
 if enabled x86_64; then
 case "$objformat" in
@@ -5488,7 +5486,6 @@ EOF
 win32) objformat=win64 ;;
 esac
 fi
-fi
 
 YASMFLAGS="-f $objformat $yasm_extra"
 enabled pic   && append YASMFLAGS "-DPIC"
@@ -5497,12 +5494,6 @@ EOF
 elf*) enabled debug && append YASMFLAGS $yasm_debug ;;
 esac
 
-check_yasm "movbe ecx, [5]" && enable yasm ||
-die "yasm/nasm not found or too old. Use --disable-yasm for a crippled build."
-check_yasm "vextracti128 xmm0, ymm0, 0"  || disable avx2_external
-check_yasm "vpmacsdd xmm0, xmm1, xmm2, xmm3" || disable xop_external
-check_yasm "vfmaddps ymm0, ymm1, ymm2, ymm3" || disable fma4_external
-check_yasm "CPU amdnop" || disable cpunop
 fi
 
 case "$cpu" in
-- 
2.13.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

1 2 3 4 5 6 >

1 - 100 of 517 matches

Mail list logo