On 02/02/15 2:11 PM, Christophe Gisquet wrote:
> @@ -87,11 +95,22 @@ QPEL_TABLE 12, 4, w, sse4
>  %elif %1 <= 8
>      movdqa            %3, [%2]        ; load data from source2
>  %elif %1 <= 12
> +%if avx_enabled

If this is meant for avx2, then the correct check is "cpuflag(avx2)" or
"mmsize > 16", because if at some point we add an avx version (just to
take advantage of the non-destructive three-operand format compared to
sse4), every "avx_enabled" check would have to be replaced.
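
As a minimal sketch (not part of the patch, and assuming the %if/%else
structure stays as posted, with %1..%4 being the macro's own arguments),
the guard could read:

    %elif %1 <= 12
    %if cpuflag(avx2)                   ; or: %if mmsize > 16
        mova              %3, [%2]      ; one 32-byte ymm load covers the 24 bytes
    %else                               ; that the sse4 path loads as 16+8 below
        movdqa            %3, [%2]      ; load data from source2
        movq              %4, [%2+16]   ; load data from source2
    %endif

Either condition stays false for a hypothetical INIT_XMM avx version, so
such a version would automatically keep taking the 16+8 byte path.
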
> +    mova              %3, [%2]
> +%else
>      movdqa            %3, [%2]        ; load data from source2
>      movq              %4, [%2+16]     ; load data from source2

[...]

> @@ -589,6 +731,89 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
>              c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
>              if (ARCH_X86_64) {
>                  SAO_BAND_INIT(8, avx2);

Indentation for all the stuff below is wrong.

> +            c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
> +            c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
> +            c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
> +
> +            c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
> +            c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
> +            c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
> +
> +            c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
> +            c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
> +            c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
> +
> +            c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
> +            c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
> +            c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
> +
> +            c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
> +            c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
> +            c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
> +
> +            c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
> +            c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
> +            c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
> +
> +            c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
> +            c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
> +            c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
> +
> +            c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
> +            c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
> +            c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
> +
> +            c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
> +            c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
> +            c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
> +
> +            c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
> +            c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
> +            c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
> +
> +            c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
> +            c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
> +            c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
> +
> +            c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
> +            c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
> +            c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
> +
> +            c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
> +            c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
> +            c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
> +
> +            c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
> +            c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
> +            c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
> +
> +            c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
> +            c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
> +            c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
> +
> +            c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
> +            c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
> +            c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
> +
> +            c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
> +            c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
> +            c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
> +
> +            c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
> +            c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
> +            c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
> +
> +            c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
> +            c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
> +            c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
> +
> +            c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
> +            c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
> +            c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
> +
> +            c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
> +            c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
> +            c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
>              }

Mickaël: How much faster than sse4 is this code? It uses lots of
perms/inserts/extracts that cross ymm reg lanes, which is usually slow.
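
As a purely illustrative example (the register numbers are arbitrary,
not taken from the patch): on Haswell the lane-crossing forms below have
about 3 cycles of latency each, versus 1 cycle for shuffles that stay
inside each 128-bit lane, so a kernel built around them can give back a
good part of the doubled register width:

    vperm2i128   ymm0, ymm1, ymm2, 0x20  ; crosses the two 128-bit lanes (~3c)
    vinserti128  ymm0, ymm0, xmm3, 1     ; lane-crossing as well (~3c)
    vpshufb      ymm0, ymm0, ymm4        ; in-lane, each half shuffled independently (~1c)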