On 6/26/2017 7:15 PM, Paul B Mahol wrote: > diff --git a/libavcodec/x86/utvideodsp.asm b/libavcodec/x86/utvideodsp.asm > new file mode 100644 > index 0000000..2e96f8b > --- /dev/null > +++ b/libavcodec/x86/utvideodsp.asm > @@ -0,0 +1,101 @@ > +;****************************************************************************** > +;* SIMD-optimized UTVideo functions > +;* Copyright (c) 2017 Paul B Mahol > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +pb_128: times 16 db 128 > +pw_512: times 8 dw 512 > +pw_1023: times 8 dw 1023 > + > +SECTION .text > + > +INIT_XMM sse2 > + > +; void restore_rgb_planes(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b, > +; ptrdiff_t linesize_r, ptrdiff_t linesize_g, > ptrdiff_t linesize_b, > +; int width, int height) > +cglobal restore_rgb_planes, 8,9,4, src_r, src_g, src_b, linesize_r, > linesize_g, linesize_b, w, h, x > + movsxdifnidn wq, wd > + add src_rq, wq > + add src_gq, wq > + add src_bq, wq > + neg wq > + mova m3, [pb_128] > +.nextrow: > + mov xq, wq > + > + .loop: > + mova m0, [src_rq + xq] > + mova m1, [src_gq + xq] > + mova m2, 
[src_bq + xq] > + psubb m1, m3 > + paddb m0, m1 > + paddb m2, m1 > + mova [src_rq+xq], m0 > + mova [src_bq+xq], m2 > + add xq, mmsize > + jl .loop > + > + add src_rq, linesize_rq > + add src_gq, linesize_gq > + add src_bq, linesize_bq > + sub hq, 1
sub hd, 1 should be used here instead, for the same reason as for w: h is passed as a 32-bit int, so the high 32 bits of hq may contain garbage. > + jg .nextrow > + REP_RET > + > +cglobal restore_rgb_planes10, 8,9,5, src_r, src_g, src_b, linesize_r, > linesize_g, linesize_b, w, h, x > + movsxd wq, wd > + shl wd, 1 > + shl linesize_rq, 1 > + shl linesize_gq, 1 > + shl linesize_bq, 1 > + add src_rq, wq > + add src_gq, wq > + add src_bq, wq > + mova m3, [pw_512] > + mova m4, [pw_1023] > + neg wq > +.nextrow: > + mov xq, wq > + > + .loop: > + mova m0, [src_rq + xq] > + mova m1, [src_gq + xq] > + mova m2, [src_bq + xq] > + paddw m0, m1 > + paddw m2, m1 > + psubw m0, m3 > + psubw m2, m3 > + pand m0, m4 > + pand m2, m4 > + mova [src_rq+xq], m0 > + mova [src_bq+xq], m2 > + add xq, mmsize > + jl .loop > + > + add src_rq, linesize_rq > + add src_gq, linesize_gq > + add src_bq, linesize_bq > + sub hq, 1 Same here: this should be sub hd, 1, since the high 32 bits of hq may contain garbage. > + jg .nextrow > + REP_RET _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel