I was motivated to send this to the list after seeing Timothy Arceri's "add SSE optimization for glDrawElements" patch. This patch is not very pretty because of its ifdefs, which are needed to let gcc optimize the loops when possible while still avoiding SSE code on targets where at least SSE2 is not available.
The effect is that the clamping inner loop in _mesa_apply_rgba_transfer_ops changes from this (I compiled with -march=native, hence the AVX opcodes here): 0x00007ffff3dbe430 <+128>: vmovss (%rcx),%xmm2 0x00007ffff3dbe434 <+132>: vmovaps %xmm1,%xmm3 0x00007ffff3dbe438 <+136>: vucomiss %xmm2,%xmm0 0x00007ffff3dbe43c <+140>: ja 0x7ffff3dbe442 <_mesa_apply_rgba_transfer_ops+146> 0x00007ffff3dbe43e <+142>: vminss %xmm2,%xmm4,%xmm3 0x00007ffff3dbe442 <+146>: vmovss 0x4(%rcx),%xmm2 0x00007ffff3dbe447 <+151>: vmovss %xmm3,(%rcx) 0x00007ffff3dbe44b <+155>: vmovaps %xmm1,%xmm3 0x00007ffff3dbe44f <+159>: vucomiss %xmm2,%xmm0 0x00007ffff3dbe453 <+163>: ja 0x7ffff3dbe459 <_mesa_apply_rgba_transfer_ops+169> 0x00007ffff3dbe455 <+165>: vminss %xmm2,%xmm7,%xmm3 0x00007ffff3dbe459 <+169>: vmovss 0x8(%rcx),%xmm2 0x00007ffff3dbe45e <+174>: vmovss %xmm3,0x4(%rcx) 0x00007ffff3dbe463 <+179>: vmovaps %xmm1,%xmm3 0x00007ffff3dbe467 <+183>: vucomiss %xmm2,%xmm0 0x00007ffff3dbe46b <+187>: ja 0x7ffff3dbe471 <_mesa_apply_rgba_transfer_ops+193> 0x00007ffff3dbe46d <+189>: vminss %xmm2,%xmm6,%xmm3 0x00007ffff3dbe471 <+193>: vmovss 0xc(%rcx),%xmm2 0x00007ffff3dbe476 <+198>: vmovss %xmm3,0x8(%rcx) 0x00007ffff3dbe47b <+203>: vmovaps %xmm1,%xmm3 0x00007ffff3dbe47f <+207>: vucomiss %xmm2,%xmm0 0x00007ffff3dbe483 <+211>: ja 0x7ffff3dbe489 <_mesa_apply_rgba_transfer_ops+217> 0x00007ffff3dbe485 <+213>: vminss %xmm2,%xmm5,%xmm3 0x00007ffff3dbe489 <+217>: vmovss %xmm3,0xc(%rcx) 0x00007ffff3dbe48e <+222>: add $0x10,%rcx 0x00007ffff3dbe492 <+226>: cmp %rax,%rcx 0x00007ffff3dbe495 <+229>: jne 0x7ffff3dbe430 <_mesa_apply_rgba_transfer_ops+128> into this: 0x00007ffff3dbe4d0 <+288>: vmovups (%rcx),%xmm0 0x00007ffff3dbe4d4 <+292>: add $0x10,%rcx 0x00007ffff3dbe4d8 <+296>: vmaxps %xmm1,%xmm0,%xmm0 0x00007ffff3dbe4dc <+300>: vminps %xmm2,%xmm0,%xmm0 0x00007ffff3dbe4e0 <+304>: vmovups %xmm0,-0x10(%rcx) 0x00007ffff3dbe4e5 <+309>: cmp %rax,%rcx 0x00007ffff3dbe4e8 <+312>: jne 0x7ffff3dbe4d0 
<_mesa_apply_rgba_transfer_ops+288> The two pieces of code do the same thing. One place where this really helps is, for example, the Android home screen. /Juha-Pekka Juha-Pekka Heikkila (1): mesa/main: Clamp rgba with streamed sse src/mesa/main/colormac.h | 20 +++++++++++++++ src/mesa/main/pixeltransfer.c | 59 ++++++++++++++++++++++++++++++++----------- 2 files changed, 64 insertions(+), 15 deletions(-) -- 1.8.5.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev