.gitignore | 2 Makefile.am | 4 configure.ac | 102 + demos/Makefile.am | 13 demos/checkerboard.c | 71 + demos/composite-test.c | 55 - demos/gtk-utils.c | 46 demos/parrot.c | 1079 ++++++++++++++++++++ demos/parrot.jpg |binary demos/quad2quad.c | 2183 +++++++++++++++++++++++++++++++++++++++++ pixman/Makefile.am | 12 pixman/loongson-mmintrin.h | 273 +++++ pixman/pixman-bits-image.c | 10 pixman/pixman-compiler.h | 4 pixman/pixman-cpu.c | 39 pixman/pixman-fast-path.c | 6 pixman/pixman-mips-dspr2-asm.S | 443 ++++++++ pixman/pixman-mips-dspr2-asm.h | 363 ++++++ pixman/pixman-mips-dspr2.c | 22 pixman/pixman-mips-dspr2.h | 42 pixman/pixman-mmx.c | 665 +++++++++--- pixman/pixman-private.h | 9 test/utils.c | 35 test/utils.h | 9 24 files changed, 5211 insertions(+), 276 deletions(-)
New commits: commit 1e1a00e964a1d8ef43d6d75c1c3a0b5d518d1979 Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Tue May 15 13:20:09 2012 -0400 Pre-release version bump to 0.25.6 Note that 0.25.4 was a botched release that doesn't have a tag and doesn't correspond to any commit ID. It was however uploaded and announced, so I'll just use the 0.25.6 version number. diff --git a/configure.ac b/configure.ac index d949839..502815e 100644 --- a/configure.ac +++ b/configure.ac @@ -54,7 +54,7 @@ AC_PREREQ([2.57]) m4_define([pixman_major], 0) m4_define([pixman_minor], 25) -m4_define([pixman_micro], 3) +m4_define([pixman_micro], 6) m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro]) commit b2c16aaadfae64d2573abb537bfedd92c13b8d06 Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Tue May 15 13:19:19 2012 -0400 demos/Makefile.am: Add parrot.c to EXTRA_DIST To get 'make distcheck' to pass. diff --git a/demos/Makefile.am b/demos/Makefile.am index a664d93..8f734cf 100644 --- a/demos/Makefile.am +++ b/demos/Makefile.am @@ -22,6 +22,8 @@ DEMOS = \ quad2quad \ checkerboard +EXTRA_DIST = parrot.c + gradient_test_SOURCES = gradient-test.c $(GTK_UTILS) alpha_test_SOURCES = alpha-test.c $(GTK_UTILS) composite_test_SOURCES = composite-test.c $(GTK_UTILS) commit 50d3088d7882e1054a35e917becb7752662da6f0 Author: Matt Turner <matts...@gmail.com> Date: Fri May 11 21:59:13 2012 -0400 configure.ac: Rename loongson -> loongson-mmi Make it match with the other fast paths, and the PIXMAN_DISABLE value is already loongson-mmi. diff --git a/configure.ac b/configure.ac index 57fd060..d949839 100644 --- a/configure.ac +++ b/configure.ac @@ -278,7 +278,7 @@ if test "x$LS_CFLAGS" = "x" ; then fi have_loongson_mmi=no -AC_MSG_CHECKING(whether to use Loongson MMI) +AC_MSG_CHECKING(whether to use Loongson MMI assembler) xserver_save_CFLAGS=$CFLAGS CFLAGS=" $LS_CFLAGS $CFLAGS -I$srcdir" @@ -301,12 +301,12 @@ int main () { }]])], have_loongson_mmi=yes) CFLAGS=$xserver_save_CFLAGS -AC_ARG_ENABLE(loongson, - [AC_HELP_STRING([--disable-loongson], - [disable Loongson fast paths])], - [enable_loongson=$enableval], [enable_loongson=auto]) +AC_ARG_ENABLE(loongson-mmi, + [AC_HELP_STRING([--disable-loongson-mmi], + [disable Loongson MMI fast paths])], + [enable_loongson_mmi=$enableval], [enable_loongson_mmi=auto]) -if test $enable_loongson = no ; then +if test $enable_loongson_mmi = no ; then have_loongson_mmi=disabled fi @@ -317,7 +317,7 @@ else fi AC_MSG_RESULT($have_loongson_mmi) -if test $enable_loongson = yes && test $have_loongson_mmi = no ; then +if test $enable_loongson_mmi = yes && test $have_loongson_mmi = no ; then AC_MSG_ERROR([Loongson MMI not detected]) fi commit a0a40cb822bec52494c64e6750be50b734dc29df Author: Matt Turner <matts...@gmail.com> Date: Fri May 11 21:49:42 2012 -0400 configure.ac: Fix loongson-mmi out-of-tree builds When building out-of-tree, gcc wasn't able to find loongson-mmintrin.h to compile the test program. Add -I$srcdir to CFLAGS to point gcc to it. diff --git a/configure.ac b/configure.ac index 345bc33..57fd060 100644 --- a/configure.ac +++ b/configure.ac @@ -281,7 +281,7 @@ have_loongson_mmi=no AC_MSG_CHECKING(whether to use Loongson MMI) xserver_save_CFLAGS=$CFLAGS -CFLAGS=" $LS_CFLAGS $CFLAGS" +CFLAGS=" $LS_CFLAGS $CFLAGS -I$srcdir" AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ #ifndef __mips_loongson_vector_rev #error "Loongson Multimedia Instructions are only available on Loongson" commit 618a08e6aa03b38e8dc71ac610f7fdd55e8a8558 Author: Nemanja Lukic <nemanja.lu...@rt-rk.com> Date: Thu May 3 00:03:42 2012 +0200 MIPS: DSPr2: Added over_n_8_8888 and over_n_8_0565 fast paths. Performance numbers before/after on MIPS-74kc @ 1GHz Referent (before): lowlevel-blt-bench: over_n_8_8888 = L1: 10.40 L2: 9.79 M: 8.47 ( 33.62%) HT: 7.64 VT: 7.59 R: 7.48 RT: 5.30 ( 40Kops/s) over_n_8_0565 = L1: 7.40 L2: 7.23 M: 6.78 ( 17.94%) HT: 6.23 VT: 6.17 R: 6.14 RT: 4.62 ( 37Kops/s) Optimized: lowlevel-blt-bench: over_n_8_8888 = L1: 27.25 L2: 26.24 M: 18.15 ( 72.12%) HT: 14.52 VT: 14.31 R: 13.83 RT: 7.57 ( 48Kops/s) over_n_8_0565 = L1: 18.91 L2: 17.59 M: 15.06 ( 39.90%) HT: 12.18 VT: 11.98 R: 11.83 RT: 6.80 ( 46Kops/s) diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S index 6a0fc18..68ad33f 100644 --- a/pixman/pixman-mips-dspr2-asm.S +++ b/pixman/pixman-mips-dspr2-asm.S @@ -527,3 +527,227 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips) nop END(pixman_composite_over_n_8888_0565_ca_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + + SAVE_REGS_ON_STACK 4, s0, s1, s2, s3, s4 + beqz a3, 4f + nop + li t4, 0x00ff00ff + li t5, 0xff + addiu t0, a3, -1 + beqz t0, 3f /* last pixel */ + srl t6, a1, 24 /* t6 = srca */ + not s4, a1 + beq t5, t6, 2f /* if (srca == 0xff) */ + srl s4, s4, 24 +1: + /* a1 = src */ + lbu t0, 0(a2) /* t0 = mask */ + lbu t1, 1(a2) /* t1 = mask */ + or t2, t0, t1 + beqz t2, 111f /* if (t0 == 0) && (t1 == 0) */ + addiu a2, a2, 2 + and t3, t0, t1 + + lw t2, 0(a0) /* t2 = dst */ + beq t3, t5, 11f /* if (t0 == 0xff) && (t1 == 0xff) */ + lw t3, 4(a0) /* t3 = dst */ + + MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s0, s1, t4, t6, t7, t8, t9, s2, s3 + not s2, s0 + not s3, s1 + srl s2, s2, 24 + srl s3, s3, 24 + MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s2, s3, t2, t3, t4, t0, t1, t6, t7, t8, t9 + addu_s.qb s2, t2, s0 + addu_s.qb s3, t3, s1 + sw s2, 0(a0) + b 111f + sw s3, 4(a0) +11: + MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s4, s4, t2, t3, t4, t0, t1, t6, t7, t8, t9 + addu_s.qb s2, t2, a1 + addu_s.qb s3, t3, a1 + sw s2, 0(a0) + sw s3, 4(a0) + +111: + addiu a3, a3, -2 + addiu t0, a3, -1 + bgtz t0, 1b + addiu a0, a0, 8 + b 3f + nop +2: + /* a1 = src */ + lbu t0, 0(a2) /* t0 = mask */ + lbu t1, 1(a2) /* t1 = mask */ + or t2, t0, t1 + beqz t2, 222f /* if (t0 == 0) && (t1 == 0) */ + addiu a2, a2, 2 + and t3, t0, t1 + beq t3, t5, 22f /* if (t0 == 0xff) && (t1 == 0xff) */ + nop + lw t2, 0(a0) /* t2 = dst */ + lw t3, 4(a0) /* t3 = dst */ + + OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, t2, t3, \ + t6, t7, t4, t8, t9, s0, s1, s2, s3 + sw t6, 0(a0) + b 222f + sw t7, 4(a0) +22: + sw a1, 0(a0) + sw a1, 4(a0) +222: + addiu a3, a3, -2 + addiu t0, a3, -1 + bgtz t0, 2b + addiu a0, a0, 8 +3: + blez a3, 4f + nop + /* a1 = src */ + lbu t0, 0(a2) /* t0 = mask */ + beqz t0, 4f /* if (t0 == 0) */ + addiu a2, a2, 1 + move t3, a1 + beq t0, t5, 31f /* if (t0 == 0xff) */ + lw t1, 0(a0) /* t1 = dst */ + + MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t6, t7, t8 +31: + not t2, t3 + srl t2, t2, 24 + MIPS_UN8x4_MUL_UN8 t1, t2, t1, t4, t6, t7, t8 + addu_s.qb t2, t1, t3 + sw t2, 0(a0) +4: + RESTORE_REGS_FROM_STACK 4, s0, s1, s2, s3, s4 + j ra + nop + +END(pixman_composite_over_n_8_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8 + beqz a3, 4f + nop + li t4, 0x00ff00ff + li t5, 0xff + li t6, 0xf800f800 + li t7, 0x07e007e0 + li t8, 0x001F001F + addiu t1, a3, -1 + beqz t1, 3f /* last pixel */ + srl t0, a1, 24 /* t0 = srca */ + not v0, a1 + beq t0, t5, 2f /* if (srca == 0xff) */ + srl v0, v0, 24 +1: + /* a1 = src */ + lbu t0, 0(a2) /* t0 = mask */ + lbu t1, 1(a2) /* t1 = mask */ + or t2, t0, t1 + beqz t2, 111f /* if (t0 == 0) && (t1 == 0) */ + addiu a2, a2, 2 + lhu t2, 0(a0) /* t2 = dst */ + lhu t3, 2(a0) /* t3 = dst */ + CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t7, t8, t9, s2, s3, s4 + and t9, t0, t1 + beq t9, t5, 11f /* if (t0 == 0xff) && (t1 == 0xff) */ + nop + + MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s2, s3, t4, t9, s4, s5, s6, s7, s8 + not s4, s2 + not s5, s3 + srl s4, s4, 24 + srl s5, s5, 24 + MIPS_2xUN8x4_MUL_2xUN8 s0, s1, s4, s5, s0, s1, t4, t9, t0, t1, s6, s7, s8 + addu_s.qb s4, s2, s0 + addu_s.qb s5, s3, s1 + CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1 + sh t2, 0(a0) + b 111f + sh t3, 2(a0) +11: + MIPS_2xUN8x4_MUL_2xUN8 s0, s1, v0, v0, s0, s1, t4, t9, t0, t1, s6, s7, s8 + addu_s.qb s4, a1, s0 + addu_s.qb s5, a1, s1 + CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1 + sh t2, 0(a0) + sh t3, 2(a0) +111: + addiu a3, a3, -2 + addiu t0, a3, -1 + bgtz t0, 1b + addiu a0, a0, 4 + b 3f + nop +2: + CONVERT_1x8888_TO_1x0565 a1, s0, s1, s2 +21: + /* a1 = src */ + lbu t0, 0(a2) /* t0 = mask */ + lbu t1, 1(a2) /* t1 = mask */ + or t2, t0, t1 + beqz t2, 222f /* if (t0 == 0) && (t1 == 0) */ + addiu a2, a2, 2 + and t9, t0, t1 + move s2, s0 + beq t9, t5, 22f /* if (t0 == 0xff) && (t2 == 0xff) */ + move s3, s0 + lhu t2, 0(a0) /* t2 = dst */ + lhu t3, 2(a0) /* t3 = dst */ + + CONVERT_2x0565_TO_2x8888 t2, t3, s2, s3, t7, t8, s4, s5, s6, s7 + OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, s2, s3, \ + t2, t3, t4, t9, s4, s5, s6, s7, s8 + CONVERT_2x8888_TO_2x0565 t2, t3, s2, s3, t6, t7, t8, s4, s5 +22: + sh s2, 0(a0) + sh s3, 2(a0) +222: + addiu a3, a3, -2 + addiu t0, a3, -1 + bgtz t0, 21b + addiu a0, a0, 4 +3: + blez a3, 4f + nop + /* a1 = src */ + lbu t0, 0(a2) /* t0 = mask */ + beqz t0, 4f /* if (t0 == 0) */ + nop + lhu t1, 0(a0) /* t1 = dst */ + CONVERT_1x0565_TO_1x8888 t1, t2, t3, t7 + beq t0, t5, 31f /* if (t0 == 0xff) */ + move t3, a1 + + MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t7, t8, t9 +31: + not t6, t3 + srl t6, t6, 24 + MIPS_UN8x4_MUL_UN8 t2, t6, t2, t4, t7, t8, t9 + addu_s.qb t1, t2, t3 + CONVERT_1x8888_TO_1x0565 t1, t2, t3, t7 + sh t2, 0(a0) +4: + RESTORE_REGS_FROM_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8 + j ra + nop + +END(pixman_composite_over_n_8_0565_asm_mips) diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h index 12ff42c..8383060 100644 --- a/pixman/pixman-mips-dspr2-asm.h +++ b/pixman/pixman-mips-dspr2-asm.h @@ -499,4 +499,71 @@ LEAF_MIPS32R2(symbol) \ precr.qb.ph \d2_8888, \scratch5, \scratch6 .endm +/* + * OVER operation on single a8r8g8b8 source pixel (s_8888) and single a8r8g8b8 + * destination pixel (d_8888) using a8 mask (m_8). It also requires maskLSR + * needed for rounding process. maskLSR must have following value: + * li maskLSR, 0x00ff00ff + */ +.macro OVER_8888_8_8888 s_8888, \ + m_8, \ + d_8888, \ + out_8888, \ + maskLSR, \ + scratch1, scratch2, scratch3, scratch4 + MIPS_UN8x4_MUL_UN8 \s_8888, \m_8, \ + \scratch1, \maskLSR, \ + \scratch2, \scratch3, \scratch4 + + not \scratch2, \scratch1 + srl \scratch2, \scratch2, 24 + + MIPS_UN8x4_MUL_UN8 \d_8888, \scratch2, \ + \d_8888, \maskLSR, \ + \scratch3, \scratch4, \out_8888 + + addu_s.qb \out_8888, \d_8888, \scratch1 +.endm + +/* + * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two + * a8r8g8b8 destination pixels (d1_8888 and d2_8888) using a8 masks (m1_8 and + * m2_8). It also requires maskLSR needed for rounding process. maskLSR must + * have following value: + * li maskLSR, 0x00ff00ff + */ +.macro OVER_2x8888_2x8_2x8888 s1_8888, \ + s2_8888, \ + m1_8, \ + m2_8, \ + d1_8888, \ + d2_8888, \ + out1_8888, \ + out2_8888, \ + maskLSR, \ + scratch1, scratch2, scratch3, \ + scratch4, scratch5, scratch6 + MIPS_2xUN8x4_MUL_2xUN8 \s1_8888, \s2_8888, \ + \m1_8, \m2_8, \ + \scratch1, \scratch2, \ + \maskLSR, \ + \scratch3, \scratch4, \out1_8888, \ + \out2_8888, \scratch5, \scratch6 + + not \scratch3, \scratch1 + srl \scratch3, \scratch3, 24 + not \scratch4, \scratch2 + srl \scratch4, \scratch4, 24 + + MIPS_2xUN8x4_MUL_2xUN8 \d1_8888, \d2_8888, \ + \scratch3, \scratch4, \ + \d1_8888, \d2_8888, \ + \maskLSR, \ + \scratch5, \scratch6, \out1_8888, \ + \out2_8888, \scratch3, \scratch4 + + addu_s.qb \out1_8888, \d1_8888, \scratch1 + addu_s.qb \out2_8888, \d2_8888, \scratch2 +.endm + #endif //PIXMAN_MIPS_DSPR2_ASM_H diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c index 018770a..7081734 100644 --- a/pixman/pixman-mips-dspr2.c +++ b/pixman/pixman-mips-dspr2.c @@ -53,6 +53,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca, uint32_t, 1, uint32_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca, uint32_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888, + uint8_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565, + uint8_t, 1, uint16_t, 1) static pixman_bool_t pixman_fill_mips (uint32_t *bits, @@ -195,6 +199,12 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] = PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mips_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mips_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mips_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mips_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mips_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mips_composite_over_n_8_0565), { PIXMAN_OP_NONE }, }; commit 7d4beedc612a32b73d7673bbf6447de0f3fca298 Author: Matt Turner <matts...@gmail.com> Date: Wed May 9 19:20:55 2012 -0400 mmx: add and use pack_4x565 function The pack_4x565 makes use of the pack_4xpacked565 function which uses pmadd. Some of the speed up is probably attributable to removing the artificial serialization imposed by the vdest = pack_565 (..., vdest, 0); vdest = pack_565 (..., vdest, 1); ... pattern. Loongson: over_n_0565 = L1: 16.44 L2: 16.42 M: 13.83 ( 9.85%) HT: 12.83 VT: 12.61 R: 12.34 RT: 8.90 ( 93Kops/s) over_n_0565 = L1: 42.48 L2: 42.53 M: 29.83 ( 21.20%) HT: 23.39 VT: 23.72 R: 21.80 RT: 11.60 ( 113Kops/s) over_8888_0565 = L1: 15.61 L2: 15.42 M: 12.11 ( 25.79%) HT: 11.07 VT: 10.70 R: 10.37 RT: 7.25 ( 82Kops/s) over_8888_0565 = L1: 35.01 L2: 35.20 M: 21.42 ( 45.57%) HT: 18.12 VT: 17.61 R: 16.09 RT: 9.01 ( 97Kops/s) over_n_8_0565 = L1: 15.17 L2: 14.94 M: 12.57 ( 17.86%) HT: 11.96 VT: 11.52 R: 10.79 RT: 7.31 ( 79Kops/s) over_n_8_0565 = L1: 29.83 L2: 29.79 M: 21.85 ( 30.94%) HT: 18.82 VT: 18.25 R: 16.15 RT: 8.72 ( 91Kops/s) over_n_8888_0565_ca = L1: 15.25 L2: 15.02 M: 11.64 ( 41.39%) HT: 11.08 VT: 10.72 R: 10.02 RT: 7.00 ( 77Kops/s) over_n_8888_0565_ca = L1: 30.12 L2: 29.99 M: 19.47 ( 68.99%) HT: 17.05 VT: 16.55 R: 14.67 RT: 8.38 ( 88Kops/s) ARM/iwMMXt: over_n_0565 = L1: 19.29 L2: 19.88 M: 17.38 ( 10.54%) HT: 15.53 VT: 16.11 R: 13.69 RT: 11.00 ( 96Kops/s) over_n_0565 = L1: 36.02 L2: 34.85 M: 28.04 ( 16.97%) HT: 22.12 VT: 24.21 R: 22.36 RT: 12.22 ( 103Kops/s) over_8888_0565 = L1: 18.38 L2: 16.59 M: 12.34 ( 22.29%) HT: 11.67 VT: 11.71 R: 11.02 RT: 6.89 ( 72Kops/s) over_8888_0565 = L1: 24.96 L2: 22.17 M: 15.11 ( 26.81%) HT: 14.14 VT: 13.71 R: 13.18 RT: 8.13 ( 78Kops/s) over_n_8_0565 = L1: 14.65 L2: 12.44 M: 11.56 ( 14.50%) HT: 10.93 VT: 10.39 R: 10.06 RT: 7.05 ( 70Kops/s) over_n_8_0565 = L1: 18.37 L2: 14.98 M: 13.97 ( 16.51%) HT: 12.67 VT: 10.35 R: 11.80 RT: 8.14 ( 74Kops/s) over_n_8888_0565_ca = L1: 14.27 L2: 12.93 M: 10.52 ( 33.23%) HT: 9.70 VT: 9.90 R: 9.31 RT: 6.34 ( 65Kops/s) over_n_8888_0565_ca = L1: 19.69 L2: 17.58 M: 13.40 ( 42.35%) HT: 11.75 VT: 11.33 R: 11.17 RT: 7.49 ( 73Kops/s) diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index b14201a..01a2bc9 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -598,6 +598,12 @@ pack_4xpacked565 (__m64 a, __m64 b) #endif } +static force_inline __m64 +pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3) +{ + return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)); +} + #ifndef _MSC_VER static force_inline __m64 @@ -1396,16 +1402,14 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp, while (w >= 4) { - __m64 vdest; + __m64 vdest = *(__m64 *)dst; - vdest = *(__m64 *)dst; - - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3); + __m64 v0 = over (vsrc, vsrca, expand565 (vdest, 0)); + __m64 v1 = over (vsrc, vsrca, expand565 (vdest, 1)); + __m64 v2 = over (vsrc, vsrca, expand565 (vdest, 2)); + __m64 v3 = over (vsrc, vsrca, expand565 (vdest, 3)); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); dst += 4; w -= 4; @@ -1818,22 +1822,19 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, while (w >= 4) { - __m64 vsrc0, vsrc1, vsrc2, vsrc3; - __m64 vdest; + __m64 vdest = *(__m64 *)dst; - vsrc0 = load8888 ((src + 0)); - vsrc1 = load8888 ((src + 1)); - vsrc2 = load8888 ((src + 2)); - vsrc3 = load8888 ((src + 3)); + __m64 vsrc0 = load8888 ((src + 0)); + __m64 vsrc1 = load8888 ((src + 1)); + __m64 vsrc2 = load8888 ((src + 2)); + __m64 vsrc3 = load8888 ((src + 3)); - vdest = *(__m64 *)dst; - - vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3); + __m64 v0 = over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)); + __m64 v1 = over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)); + __m64 v2 = over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)); + __m64 v3 = over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); w -= 4; dst += 4; @@ -2368,25 +2369,22 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, } else if (m0 | m1 | m2 | m3) { - __m64 vdest; - __m64 vm0, vm1, vm2, vm3; - - vdest = *(__m64 *)dst; + __m64 vdest = *(__m64 *)dst; - vm0 = to_m64 (m0); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0), - expand565 (vdest, 0)), vdest, 0); - vm1 = to_m64 (m1); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1), - expand565 (vdest, 1)), vdest, 1); - vm2 = to_m64 (m2); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2), - expand565 (vdest, 2)), vdest, 2); - vm3 = to_m64 (m3); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3), - expand565 (vdest, 3)), vdest, 3); - - *(__m64 *)dst = vdest; + __m64 vm0 = to_m64 (m0); + __m64 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), + expand565 (vdest, 0)); + __m64 vm1 = to_m64 (m1); + __m64 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), + expand565 (vdest, 1)); + __m64 vm2 = to_m64 (m2); + __m64 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), + expand565 (vdest, 2)); + __m64 vm3 = to_m64 (m3); + __m64 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), + expand565 (vdest, 3)); + + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);; } w -= 4; @@ -2483,24 +2481,23 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, if ((a0 & a1 & a2 & a3) == 0xFF) { - __m64 vdest; - vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0); - vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1); - vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2); - vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3); + __m64 v0 = invert_colors (load8888 (&s0)); + __m64 v1 = invert_colors (load8888 (&s1)); + __m64 v2 = invert_colors (load8888 (&s2)); + __m64 v3 = invert_colors (load8888 (&s3)); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); } else if (s0 | s1 | s2 | s3) { __m64 vdest = *(__m64 *)dst; - vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3); + __m64 v0 = over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)); + __m64 v1 = over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)); + __m64 v2 = over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)); + __m64 v3 = over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); } w -= 4; @@ -2675,12 +2672,12 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)q; - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3); + __m64 v0 = in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)); + __m64 v1 = in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)); + __m64 v2 = in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)); + __m64 v3 = in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)); - *(__m64 *)q = vdest; + *(__m64 *)q = pack_4x565 (v0, v1, v2, v3); } twidth -= 4; p += 4; commit 2beabd9fed76de0023eb36b0c938b8803aa8d129 Author: Matt Turner <matts...@gmail.com> Date: Thu May 10 16:15:34 2012 -0400 configure.ac: make -march=loongson2f come before CFLAGS Otherwise we'd have -march=loongson2f being overridden by automake's CFLAGS ordering which causes build failures when -march=<not loongson2f> is specified by the user. diff --git a/configure.ac b/configure.ac index 5478734..345bc33 100644 --- a/configure.ac +++ b/configure.ac @@ -281,7 +281,7 @@ have_loongson_mmi=no AC_MSG_CHECKING(whether to use Loongson MMI) xserver_save_CFLAGS=$CFLAGS -CFLAGS=" $CFLAGS $LS_CFLAGS" +CFLAGS=" $LS_CFLAGS $CFLAGS" AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ #ifndef __mips_loongson_vector_rev #error "Loongson Multimedia Instructions are only available on Loongson" commit dadb9a318b8ca10c65e31e7278f4335a6968d246 Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Tue May 8 10:05:18 2012 -0400 Add Makefile.win32 and Makefile.win32.common to EXTRA_DIST https://bugs.freedesktop.org/show_bug.cgi?id=46905 diff --git a/Makefile.am b/Makefile.am index df8677a..88ff897 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,6 +21,10 @@ RELEASE_XORG_HOST = $(USERNAME)@xorg.freedesktop.org RELEASE_XORG_DIR = /srv/xorg.freedesktop.org/archive/individual/lib RELEASE_ANNOUNCE_LIST = cairo-annou...@cairographics.org, xorg-annou...@lists.freedesktop.org, pix...@lists.freedesktop.org +EXTRA_DIST = \ + Makefile.win32 \ + Makefile.win32.common + tar_gz = $(PACKAGE)-$(VERSION).tar.gz tar_bz2 = $(PACKAGE)-$(VERSION).tar.bz2 commit 3c57ec471e1aacc863747b82bbe0a84c6d776ab7 Author: Matt Turner <matts...@gmail.com> Date: Wed May 9 22:50:50 2012 -0400 .gitignore: add demos/checkerboard and demos/quad2quad diff --git a/.gitignore b/.gitignore index 60b5bb4..98612c9 100644 --- a/.gitignore +++ b/.gitignore @@ -27,11 +27,13 @@ config.h config.h.in .*.swp demos/alpha-test +demos/checkerboard demos/clip-in demos/clip-test demos/composite-test demos/convolution-test demos/gradient-test +demos/quad2quad demos/radial-test demos/screen-test demos/trap-test commit 2d431b53d3cdbf1997e2d3b8e17408c12220c3a1 Author: Matt Turner <matts...@gmail.com> Date: Fri Apr 27 14:12:56 2012 -0400 mmx: Use wpackhus in src_x888_0565 on iwMMXt iwMMXt which has an unsigned saturation pack instruction, while MMX/EXT and Loongson don't. ARM/iwMMXt: src_8888_0565 = L1: 110.38 L2: 82.33 M: 40.92 ( 73.22%) HT: 35.63 VT: 32.22 R: 30.07 RT: 18.40 ( 132Kops/s) src_8888_0565 = L1: 117.91 L2: 83.05 M: 41.52 ( 75.58%) HT: 37.63 VT: 35.40 R: 29.37 RT: 19.39 ( 134Kops/s) diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index 7fe19d5..b14201a 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -589,9 +589,13 @@ pack_4xpacked565 (__m64 a, __m64 b) t1 = _mm_or_si64 (t1, g1); t0 = shift(t0, -5); +#ifdef USE_ARM_IWMMXT + t1 = shift(t1, -5); + return _mm_packs_pu32 (t0, t1); +#else t1 = shift(t1, -5 + 16); - return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0)); +#endif } #ifndef _MSC_VER commit 2ddd1c498b723e8e48a38eef01d5befba30b5259 Author: Matt Turner <matts...@gmail.com> Date: Thu Apr 19 17:33:27 2012 -0400 mmx: add src_8888_0565 Uses the pmadd technique described in http://software.intel.com/sites/landingpage/legacy/mmx/MMX_App_24-16_Bit_Conversion.pdf The technique uses the packssdw instruction which uses signed saturatation. This works in their example because they pack 888 to 555 leaving the high bit as zero. For packing to 565, it is unsuitable, so we replace it with an or+shuffle. Loongson: src_8888_0565 = L1: 106.13 L2: 83.57 M: 33.46 ( 68.90%) HT: 30.29 VT: 27.67 R: 26.11 RT: 15.06 ( 135Kops/s) src_8888_0565 = L1: 122.10 L2: 117.53 M: 37.97 ( 78.58%) HT: 33.14 VT: 30.09 R: 29.01 RT: 15.76 ( 139Kops/s) ARM/iwMMXt: src_8888_0565 = L1: 67.88 L2: 56.61 M: 31.20 ( 56.74%) HT: 29.22 VT: 27.01 R: 25.39 RT: 19.29 ( 130Kops/s) src_8888_0565 = L1: 110.38 L2: 82.33 M: 40.92 ( 73.22%) HT: 35.63 VT: 32.22 R: 30.07 RT: 18.40 ( 132Kops/s) diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h index 76ae892..8295ba0 100644 --- a/pixman/loongson-mmintrin.h +++ b/pixman/loongson-mmintrin.h @@ -84,6 +84,17 @@ _mm_empty (void) } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_pi16 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("pmaddhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_pu16 (__m64 __m1, __m64 __m2) { __m64 ret; diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index 320e20a..7fe19d5 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -179,9 +179,12 @@ typedef struct mmxdatafield mmx_4x0080; mmxdatafield mmx_565_rgb; mmxdatafield mmx_565_unpack_multiplier; + mmxdatafield mmx_565_pack_multiplier; mmxdatafield mmx_565_r; mmxdatafield mmx_565_g; mmxdatafield mmx_565_b; + mmxdatafield mmx_packed_565_rb; + mmxdatafield mmx_packed_565_g; #ifndef USE_LOONGSON_MMI mmxdatafield mmx_mask_0; mmxdatafield mmx_mask_1; @@ -207,9 +210,12 @@ static const mmx_data_t c = MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080), MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f), MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840), + MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004), MMXDATA_INIT (.mmx_565_r, 0x000000f800000000), MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000), MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), + MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8), + MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00), #ifndef USE_LOONGSON_MMI MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), @@ -567,6 +573,27 @@ pack_565 (__m64 pixel, __m64 target, int pos) #endif } +static force_inline __m64 +pack_4xpacked565 (__m64 a, __m64 b) +{ + __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb)); + __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb)); + + __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier)); + __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier)); + + __m64 g0 = _mm_and_si64 (a, MC (packed_565_g)); + __m64 g1 = _mm_and_si64 (b, MC (packed_565_g)); + + t0 = _mm_or_si64 (t0, g0); + t1 = _mm_or_si64 (t1, g1); + + t0 = shift(t0, -5); + t1 = shift(t1, -5 + 16); + + return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0)); +} + #ifndef _MSC_VER static force_inline __m64 @@ -2091,6 +2118,60 @@ pixman_fill_mmx (uint32_t *bits, } static void +mmx_composite_src_x888_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 7) + { + s = *src++; + *dst = CONVERT_8888_TO_0565 (s); + dst++; + w--; + } + + while (w >= 4) + { + __m64 vdest; + __m64 vsrc0 = ldq_u ((__m64 *)(src + 0)); + __m64 vsrc1 = ldq_u ((__m64 *)(src + 2)); + + vdest = pack_4xpacked565 (vsrc0, vsrc1); + + *(__m64 *)dst = vdest; + + w -= 4; + src += 4; + dst += 4; + } + + while (w) + { + s = *src++; + *dst = CONVERT_8888_TO_0565 (s); + dst++; + w--; + } + } +} + +static void mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, pixman_composite_info_t *info) { @@ -3433,6 +3514,10 @@ static const pixman_fast_path_t mmx_fast_paths[] = PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ), PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ), commit 3e8fe65a0893fcd82bdea205de49f53be32bb074 Author: Matt Turner <matts...@gmail.com> Date: Wed Apr 18 16:24:28 2012 -0400 mmx: add x8f8g8b8 fetcher Loongson: -- To UNSUBSCRIBE, email to debian-x-requ...@lists.debian.org with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org Archive: http://lists.debian.org/e1sw8db-0005hk...@vasks.debian.org