ChangeLog | 579 ++++++++++++++++++++ Makefile.am | 7 configure.ac | 51 + debian/changelog | 11 demos/tri-test.c | 2 pixman/Makefile.am | 1 pixman/Makefile.win32 | 6 pixman/pixman-access.c | 97 +++ pixman/pixman-arm-common.h | 90 +++ pixman/pixman-arm-neon-asm-bilinear.S | 768 ++++++++++++++++++++++++++ pixman/pixman-arm-neon-asm.S | 982 +++++++++++++++++++++++++++++++++- pixman/pixman-arm-neon-asm.h | 17 pixman/pixman-arm-neon.c | 62 ++ pixman/pixman-arm-simd-asm.S | 66 +- pixman/pixman-arm-simd.c | 9 pixman/pixman-bits-image.c | 20 pixman/pixman-conical-gradient.c | 7 pixman/pixman-fast-path.h | 432 ++++++++++++++ pixman/pixman-general.c | 58 -- pixman/pixman-image.c | 1 pixman/pixman-implementation.c | 46 - pixman/pixman-linear-gradient.c | 16 pixman/pixman-private.h | 51 - pixman/pixman-radial-gradient.c | 7 pixman/pixman-solid-fill.c | 17 pixman/pixman-sse2.c | 139 ++++ pixman/pixman-trap.c | 23 pixman/pixman.c | 6 pixman/pixman.h | 6 test/Makefile.am | 2 test/Makefile.win32 | 73 ++ test/affine-test.c | 6 test/blitters-test.c | 13 test/composite-traps-test.c | 8 test/composite.c | 60 +- test/fetch-test.c | 63 +- test/scaling-helpers-test.c | 93 +++ test/scaling-test.c | 6 test/stress-test.c | 41 + test/trap-crasher.c | 20 test/utils.c | 19 test/utils.h | 5 42 files changed, 3679 insertions(+), 307 deletions(-)
New commits: commit 2296b15c9d4d5002f354695992e12ac5d912677d Author: Cyril Brulebois <k...@debian.org> Date: Fri Apr 29 17:53:20 2011 +0200 Upload to unstable. diff --git a/debian/changelog b/debian/changelog index a2680f6..b14d5e2 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -pixman (0.21.8-1) UNRELEASED; urgency=low +pixman (0.21.8-1) unstable; urgency=low * New upstream release. * As seen in the upstream announcement: “When this version of pixman is @@ -7,7 +7,7 @@ pixman (0.21.8-1) UNRELEASED; urgency=low * This new release should fix the FTBFS on big endian machines, tests were failing due to missing swapping (Closes: #622211). - -- Cyril Brulebois <k...@debian.org> Fri, 29 Apr 2011 17:52:08 +0200 + -- Cyril Brulebois <k...@debian.org> Fri, 29 Apr 2011 17:53:12 +0200 pixman (0.21.6-2) unstable; urgency=low commit c48a9b803597eebd63b3a77f5cc65c7eb2f98fdf Author: Cyril Brulebois <k...@debian.org> Date: Fri Apr 29 17:53:09 2011 +0200 Mention endianness-related FTBFS fix (Closes: #622211). diff --git a/debian/changelog b/debian/changelog index a5fdd88..a2680f6 100644 --- a/debian/changelog +++ b/debian/changelog @@ -4,6 +4,8 @@ pixman (0.21.8-1) UNRELEASED; urgency=low * As seen in the upstream announcement: “When this version of pixman is used with the git version of the X server, trapezoid rendering will be corrupted. This is a known bug in the X server.” + * This new release should fix the FTBFS on big endian machines, tests + were failing due to missing swapping (Closes: #622211). -- Cyril Brulebois <k...@debian.org> Fri, 29 Apr 2011 17:52:08 +0200 commit fa956ebd6b28216e5144cfdc87f44660256e1b1a Author: Cyril Brulebois <k...@debian.org> Date: Fri Apr 29 17:52:36 2011 +0200 Bump changelogs. 
diff --git a/ChangeLog b/ChangeLog index 17896a2..69d93cb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,582 @@ +commit 89868e93bd8d66f0fac0f0b42cf7718756992e4e +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Tue Apr 19 00:00:37 2011 -0400 + + Pre-release version bump to 0.21.8 + +commit 33f1652b953467f3910605b3be723e21b3ebe078 +Author: Taekyun Kim <tkq....@samsung.com> +Date: Wed Apr 13 11:57:35 2011 +0900 + + ARM: Enable bilinear fast paths using scanline functions in pixman-arm-neon-asm-bilinear.S + + Enable fast paths which is supported by scanline functions in + pixman-arm-neon-asm-bilinear.S + +commit e8185f1cb43417d9f7b1d2856bb899f1b84fde81 +Author: Taekyun Kim <tkq....@samsung.com> +Date: Wed Apr 13 11:48:40 2011 +0900 + + ARM: NEON scanline functions for bilinear scaling + + General fetch->combine->store based bilinear scanline functions. + Need further optimizations and eventually will be replaced with optimal + functions one by one. + General functions should be located in pixman-arm-neon-asm-bilinear.S and + optimal functions in pixman-arm-neon-asm.S + + Following general bilinear scanline functions are implemented + over_8888_8888 + add_8888_8888 + src_8888_8_8888 + src_8888_8_0565 + src_0565_8_x888 + src_0565_8_0565 + over_8888_8_8888 + add_8888_8_8888 + +commit 00939d35628e733fab63606cfb1d7fcb667860d3 +Author: Taekyun Kim <tkq....@samsung.com> +Date: Wed Apr 13 11:43:44 2011 +0900 + + ARM: Common macro for scaled bilinear scanline function with A8 mask + + Defining PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST macro for declaration of + scaled bilinear scanline functions in common header. 
+ +commit b455496890f7f941d561c284aca14783300bedd6 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Fri Mar 11 07:52:57 2011 -0500 + + Offset rendering in pixman_composite_trapezoids() by (x_dst, y_dst) + + Previously, this function would do coordinate calculations in such a + way that (x_dst, y_dst) would only affect the alignment of the source + image, but not of the traps, which would always be considered to be in + absolute destination coordinates. This is unlike the + pixman_image_composite() function which also registers the mask to the + destination. + + This patch makes it so that traps are also offset by (x_dst, y_dst). + + Also add a comment explaining how this function is supposed to + operate, and update tri-test.c and composite-trap-test.c to deal with + the new semantics. + +commit e75e6a4ef5c5a8ac8b0e8464f08f83fd2b6e86ed +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Sat Apr 2 23:24:48 2011 -0400 + + ARM: Add 'neon_composite_over_n_8888_0565_ca' fast path + + This improves the performance of the firefox-talos-gfx benchmark with + the image16 backend. Benchmark on an 800 MHz ARM Cortex A8: + + Before: + + [ # ] backend test min(s) median(s) stddev. count + [ 0] image16 firefox-talos-gfx 121.773 122.218 0.15% 6/6 + + After: + + [ # ] backend test min(s) median(s) stddev. count + [ 0] image16 firefox-talos-gfx 85.247 85.563 0.22% 6/6 + + V2: Slightly better instruction scheduling based on comments from Taekyun Kim. + V3: Eliminate all stalls from the inner loop. Also based on comments from Taekyun Kim. + +commit 1670b952143284f480c39ff087b5694a64eb7db3 +Author: Gilles Espinasse <g....@free.fr> +Date: Tue Apr 12 22:44:56 2011 +0200 + + Fix OpenMP not supported case + + PIXMAN_LINK_WITH_ENV did not fail unless -Wall -Werror is used. + So even when the compiler did not support OpenMP, USE_OPENMP was defined. 
+ Fix that by running the second OpenMP test only when the first AC_OPENMP check finds OpenMP support + + configure tested in the cases : + gcc without libgomp support, no openmp option, --enable-openmp and --disable-openmp + gcc with libgomp support, no openmp option, --enable-openmp and --disable-openmp + + Not tested with autoconf version not knowing openmp (<2.62) + + Warn when --enable-openmp is requested but no support is found + + Signed-off-by: Gilles Espinasse <g....@free.fr> + +commit b9e8f7fb7494e4ee4be56d1555632233a494b28e +Author: Gilles Espinasse <g....@free.fr> +Date: Tue Apr 12 22:44:25 2011 +0200 + + Fix missing AC_MSG_RESULT value from Werror test + + Use the correct variable name + + Signed-off-by: Gilles Espinasse <g....@free.fr> + +commit caae4e82ffdeebfb9aa98a6c49dd563e065c0959 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Mon Mar 21 20:25:27 2011 +0200 + + ARM: pipelined NEON implementation of bilinear scaled 'src_8888_0565' + + Benchmark on ARM Cortex-A8 r1p3 @600MHz, 32-bit LPDDR @166MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=10020565, speed=33.59 MPix/s + after: op=1, src=20028888, dst=10020565, speed=46.25 MPix/s + + Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=10020565, speed=63.86 MPix/s + after: op=1, src=20028888, dst=10020565, speed=84.22 MPix/s + +commit d080d59b802c351daed84b92bd4eb20c775b81c7 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Wed Mar 16 17:24:49 2011 +0200 + + ARM: pipelined NEON implementation of bilinear scaled 'src_8888_8888' + + Performance of the inner loop when working with the data in L1 cache: + ARM Cortex-A8: 41 cycles per 4 pixels (no stalls and partial dual issue) + ARM Cortex-A9: 48 cycles per 4 pixels (no stalls) + + It might still be possible to improve performance even more on ARM Cortex-A8 + 
with a better use of dual issue. + + Benchmark on ARM Cortex-A8 r1p3 @600MHz, 32-bit LPDDR @166MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=40.38 MPix/s + after: op=1, src=20028888, dst=20028888, speed=48.47 MPix/s + + Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=79.68 MPix/s + after: op=1, src=20028888, dst=20028888, speed=93.11 MPix/s + +commit b496a8b279baebb8b9ab4fbcb2101583be08fe3b +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Thu Mar 17 19:42:01 2011 +0200 + + ARM: support different levels of loop unrolling in bilinear scaler + + Now an extra 'flag' parameter is supported in the bilinear scanline scaling + function generation macro. It can be used to enable 4 or 8 pixels per + loop iteration unrolling and provide save/restore code for d8-d15 + registers. + +commit 34ca9cf03fa897cd377cdb19acc22e876b2f4b0e +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Mon Mar 21 18:41:53 2011 +0200 + + ARM: use less ARM instructions in NEON bilinear scaling code + + This reduces code size and also puts less pressure on the + instruction decoder. + +commit 0f7be9f72ef6bfe2555b7f2cc29297c4f4762740 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Wed Mar 16 16:33:41 2011 +0200 + + ARM: support for software pipelining in bilinear macros + + Now it's possible to override the main loop of bilinear scaling code + with an optimized pipelined implementation. 
+ +commit 9638af95832563040d6bd861cf4c20ab632058df +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Thu Mar 10 16:12:23 2011 +0200 + + ARM: use aligned memory writes in NEON bilinear scaling code + +commit 8bba3a0e1e54f03ea78fb44314f3bfa57ec8da31 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Thu Mar 10 15:34:10 2011 +0200 + + ARM: tweaked horizontal weights update in NEON bilinear scaling code + + Moving horizontal interpolation weights update instructions from the + beginning of loop to its end makes it possible to hide some pipeline stalls and + improve performance. + +commit a2153222677327be43251012f462d19a7e98ce14 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Sun Apr 3 20:32:30 2011 -0400 + + ARM: Tiny improvement in over_n_8888_8888_ca_process_pixblock_head + + Instead of two + + mvn d24, d24 + mvn d25, d25 + + use just one + + mvn q12, q12 + + Also move another vmvn instruction into the created pipeline bubble, + as pointed out by Siarhei. + +commit 44f99735d9c6a897078db12172d9d2d07b204f37 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Sat Apr 2 14:12:12 2011 -0400 + + Makefile.am: Put development releases in "snapshots" directory + + Up until now, all pixman releases, both snapshots and releases, were + uploaded to the "releases" directory on www.cairographics.org, but + it's better to put development snapshots in the "snapshots" directory. + + This patch changes Makefile.am to do that. + +commit ad3cbfb073fc325e1b3152898ca71b8255675957 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Tue Mar 22 13:42:05 2011 -0400 + + test: Fix infinite loop in composite + + When run in PIXMAN_RANDOMIZE_TESTS mode, this test would go into an + infinite loop because the loop started at 'seed' but the stop + condition was still N_TESTS. 
+ +commit b514e63cfc58af21f7097db5a1b04292a758782a +Author: Alexandros Frantzis <alexandros.frant...@linaro.org> +Date: Fri Mar 18 14:37:27 2011 +0200 + + Add support for the r8g8b8a8 and r8g8b8x8 formats to the tests. + +commit f05a90e5f8d1d0af60e2c684cbe9f1327c33135a +Author: Alexandros Frantzis <alexandros.frant...@linaro.org> +Date: Fri Mar 18 14:36:15 2011 +0200 + + Add simple support for the r8g8b8a8 and r8g8b8x8 formats. + + This format is particularly useful on big-endian architectures, where RGBA in + memory/file order corresponds to r8g8b8a8 as an uint32_t. This is important + because RGBA is in some cases the only available choice (for example as a pixel + format in OpenGL ES 2.0). + +commit 7eb0abb5e819046537b9f809c7ec332c6679c557 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Mon Mar 14 14:56:22 2011 -0400 + + test: Randomize some tests if PIXMAN_RANDOMIZE_TESTS is set + + This patch makes so that composite and stress-test will start from a + random seed if the PIXMAN_RANDOMIZE_TESTS environment variable is + set. Running the test suite in this mode is useful to get more test + coverage. + + Also, in stress-test.c make it so that setting the initial seed causes + threads to be turned off. This makes it much easier to see when + something fails. + +commit 6b27768d81c254a4f1d05473157328d5a5d99b9c +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Sat Mar 12 19:42:58 2011 -0500 + + Simplify the prototype for iterator initializers. + + All of the information previously passed to the iterator initializers + is now available in the iterator itself, so there is no need to pass + it as arguments anymore. 
+ +commit 74d0f44b6d6d613d24541b849835da0464cc6fd0 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Sat Mar 12 19:12:35 2011 -0500 + + Fill out parts of iters in _pixman_implementation_{src,dest}_iter_init() + + This makes _pixman_implementation_{src,dest}_iter_init() responsible + for filling parts of the information in the iterators. Specifically, + the information passed as arguments is stored in the iterator. + + Also add a height field to pixman_iter_t(). + +commit be4eaa0e4f79af38b7b89c5b09ca88d3a88d9396 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Sat Mar 12 19:06:02 2011 -0500 + + In delegate_{src,dest}_iter_init() call delegate directly. + + There is no reason to go through + _pixman_implementation_{src,dest}_iter_init(), especially since + _pixman_implementation_src_iter_init() is doing various other checks + that only need to be done once. + + Also call delegate->src_iter_init() directly in pixman-sse2.c + +commit 70a923882ca24664344ba91a649e7aa12c3063f7 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Wed Mar 9 13:55:48 2011 +0200 + + ARM: a bit faster NEON bilinear scaling for r5g6b5 source images + + Instructions scheduling improved in the code responsible for fetching r5g6b5 + pixels and converting them to the intermediate x8r8g8b8 color format used in + the interpolation part of code. Still a lot of NEON stalls are remaining, + which can be resolved later by the use of pipelining. 
+ + Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s + op=1, src=10020565, dst=20020888, speed=36.82 MPix/s + after: op=1, src=10020565, dst=10020565, speed=41.35 MPix/s + op=1, src=10020565, dst=20020888, speed=49.16 MPix/s + +commit fe99673719091d4a880d031add1369332a75731b +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Wed Mar 9 13:27:41 2011 +0200 + + ARM: NEON optimization for bilinear scaled 'src_0565_0565' + + Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=10020565, speed=3.30 MPix/s + after: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s + +commit 29003c3befe2159396d181ef9ac1caaadcabf382 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Wed Mar 9 13:21:53 2011 +0200 + + ARM: NEON optimization for bilinear scaled 'src_0565_x888' + + Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=20020888, speed=3.39 MPix/s + after: op=1, src=10020565, dst=20020888, speed=36.82 MPix/s + +commit 2ee27e7d79637da9173ee1bf3423e5a81534ccb4 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Wed Mar 9 11:53:04 2011 +0200 + + ARM: NEON optimization for bilinear scaled 'src_8888_0565' + + Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=10020565, speed=6.56 MPix/s + after: op=1, src=20028888, dst=10020565, speed=61.65 MPix/s + +commit 11a0c5badbc59ce967707ef836313cc98f8aec4e +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Wed Mar 9 11:46:48 2011 +0200 + + ARM: use common macro template for bilinear scaled 
'src_8888_8888' + + This is a cleanup for old and now duplicated code. The performance improvement + is mostly coming from the enabled use of software prefetch, but instructions + scheduling is also slightly better. + + Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=53.24 MPix/s + after: op=1, src=20028888, dst=20028888, speed=74.36 MPix/s + +commit 34098dba6763afd3636a14f9c2a079ab08f23b2d +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Wed Mar 9 11:34:15 2011 +0200 + + ARM: NEON: common macro template for bilinear scanline scalers + + This allows to generate bilinear scanline scaling functions targeting + various source and destination color formats. Right now a8r8g8b8/x8r8g8b8 + and r5g6b5 color formats are supported. More formats can be added if needed. + +commit 66f4ee1b3bccf4516433d61dbf2035551a712fa2 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Wed Mar 9 10:59:46 2011 +0200 + + ARM: new bilinear fast path template macro in 'pixman-arm-common.h' + + It can be reused in different ARM NEON bilinear scaling fast path functions. 
+ +commit 5921c17639fe5fdc595c850e3347281c1c8746ba +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Sun Mar 6 22:16:32 2011 +0200 + + ARM: assembly optimized nearest scaled 'src_8888_8888' + + Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=44.36 MPix/s + after: op=1, src=20028888, dst=20028888, speed=39.79 MPix/s + + Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=102.36 MPix/s + after: op=1, src=20028888, dst=20028888, speed=163.12 MPix/s + +commit f3e17872f5522e25da8e32de83e62bee8cc198d7 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Mon Mar 7 03:10:43 2011 +0200 + + ARM: common macro for nearest scaling fast paths + + The code of nearest scaled 'src_0565_0565' function was generalized + and moved to a common macro, so that it can be reused for other + fast paths. 
+ +commit bb3d1b67fd0f42ae00af811c624ea1c44541034d +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Sun Mar 6 16:17:12 2011 +0200 + + ARM: use prefetch in nearest scaled 'src_0565_0565' + + Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=10020565, speed=75.02 MPix/s + after: op=1, src=10020565, dst=10020565, speed=73.63 MPix/s + + Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=10020565, speed=176.12 MPix/s + after: op=1, src=10020565, dst=10020565, speed=267.50 MPix/s + +commit 84e361c8e357e26f299213fbeefe64c73447b116 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Fri Mar 4 15:51:18 2011 -0500 + + test: Do endian swapping of the source and destination images. + + Otherwise the test fails on big endian. Fix for bug 34767, reported by + Siarhei Siamashka. + +commit 84f3c5a71a2de1a96dcf0c7f9ab0a8ee1b1b158f +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Mon Mar 7 13:45:54 2011 -0500 + + test: In image_endian_swap() use pixman_image_get_format() to get the bpp. + + There is no reason to pass in the bpp as an argument; it can be gotten + directly from the image. + +commit 17feaa9c50bb8521b0366345efe181bd99754957 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Tue Feb 22 18:45:03 2011 +0200 + + ARM: NEON optimization for bilinear scaled 'src_8888_8888' + + Initial NEON optimization for bilinear scaling. Can be probably + improved more. 
+ + Benchmark on ARM Cortex-A8: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=6.70 MPix/s + after: op=1, src=20028888, dst=20028888, speed=44.27 MPix/s + +commit 350029396d911941591149cc82b5e68a78ad6747 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Mon Feb 21 20:18:02 2011 +0200 + + SSE2 optimization for bilinear scaled 'src_8888_8888' + + A primitive naive implementation of bilinear scaling using SSE2 intrinsics, + which only handles one pixel at a time. It is approximately 2x faster than + pixman general compositing path. Single pass processing without intermediate + temporary buffer contributes to ~15% and loop unrolling contributes to ~20% + of this speedup. + + Benchmark on Intel Core i7 (x86-64): + Using cairo-perf-trace: + before: image firefox-planet-gnome 12.566 12.610 0.23% 6/6 + after: image firefox-planet-gnome 10.961 11.013 0.19% 5/6 + + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s + after: op=1, src=20028888, dst=20028888, speed=165.38 MPix/s + +commit 0df43b8ae5031dd83775d00b57b6bed809db0e89 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Mon Feb 21 02:07:09 2011 +0200 + + test: check correctness of 'bilinear_pad_repeat_get_scanline_bounds' + + Individual correctness check for the new bilinear scaling related + supplementary function. This test program uses a bit wider range + of input arguments, not covered by other tests. + +commit d506bf68fd0e9a1c5dd484daee70631699918387 +Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> +Date: Mon Feb 21 01:29:02 2011 +0200 + + Main loop template for fast single pass bilinear scaling + + Can be used for implementing SIMD optimized fast path + functions which work with bilinear scaled source images. + + Similar to the template for nearest scaling main loop, the + following types of mask are supported: + 1. 
no mask + 2. non-scaled a8 mask with SAMPLES_COVER_CLIP flag + 3. solid mask + + PAD repeat is fully supported. NONE repeat is partially + supported (right now only works if source image has alpha + channel or when alpha channel of the source image does not + have any effect on the compositing operation). + +commit 9ebde285fa990bfa1524f166fbfb1368c346b14a +Author: Andrea Canciani <ranm...@gmail.com> +Date: Thu Feb 24 12:53:39 2011 +0100 + + test: Silence MSVC warnings + + MSVC does not notice non-returning functions (abort() / assert(0)) + and warns about paths which end with them in non-void functions: + + c:\cygwin\home\ranma42\code\fdo\pixman\test\fetch-test.c(114) : + warning C4715: 'reader' : not all control paths return a value + c:\cygwin\home\ranma42\code\fdo\pixman\test\stress-test.c(133) : + warning C4715: 'real_reader' : not all control paths return a value + c:\cygwin\home\ranma42\code\fdo\pixman\test\composite.c(431) : + warning C4715: 'calc_op' : not all control paths return a value + + These warnings can be silenced by adding a return after the + termination call. + +commit 8868778ea1fdc8e70da76b3b00ea78106c5840d8 +Author: Andrea Canciani <ranm...@gmail.com> +Date: Tue Feb 22 22:43:48 2011 +0100 + + Do not include unused headers + + pixman-combine32.h is included without being used both in + pixman-image.c and in pixman-general.c. + +commit 72f5e5f608506c18c484bc5bc3e58bd83aeb7691 +Author: Andrea Canciani <ranm...@gmail.com> +Date: Tue Feb 22 22:04:49 2011 +0100 + + test: Add Makefile for Win32 + +commit 11305b4ecdd36a17592c5c75de9157874853ab20 +Author: Andrea Canciani <ranm...@gmail.com> +Date: Tue Feb 22 21:46:37 2011 +0100 + + test: Fix tests for compilation on Windows + + The Microsoft C compiler cannot handle subobject initialization and + Win32 does not provide snprintf. + + Work around these limitations by using normal struct initialization + and using sprintf (a manual check shows that the buffer size is + sufficient). 
+ +commit 20ed723a5a42fb8636bc9a5f32974dec1b66a785 +Author: Andrea Canciani <ranm...@gmail.com> +Date: Thu Feb 24 10:44:04 2011 +0100 + + Fix compilation on Win32 + + Makefile.win32 contained a typo and was missing the dependency from + the built sources. + +commit 48e951000c7ff14f40c671f3efb6abb18162c840 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Tue Feb 22 16:13:32 2011 -0500 + + Post-release version bump to 0.21.7 + commit 8b3332166094db657e96c365a524b2cd7513359b Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Tue Feb 22 15:43:41 2011 -0500 diff --git a/debian/changelog b/debian/changelog index e26a43b..a5fdd88 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,12 @@ +pixman (0.21.8-1) UNRELEASED; urgency=low + + * New upstream release. + * As seen in the upstream announcement: “When this version of pixman is + used with the git version of the X server, trapezoid rendering will be + corrupted. This is a known bug in the X server.” + + -- Cyril Brulebois <k...@debian.org> Fri, 29 Apr 2011 17:52:08 +0200 + pixman (0.21.6-2) unstable; urgency=low * Upload to unstable. 
commit 89868e93bd8d66f0fac0f0b42cf7718756992e4e Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Tue Apr 19 00:00:37 2011 -0400 Pre-release version bump to 0.21.8 diff --git a/configure.ac b/configure.ac index 09a4948..0d51bd0 100644 --- a/configure.ac +++ b/configure.ac @@ -54,7 +54,7 @@ AC_PREREQ([2.57]) m4_define([pixman_major], 0) m4_define([pixman_minor], 21) -m4_define([pixman_micro], 7) +m4_define([pixman_micro], 8) m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro]) commit 33f1652b953467f3910605b3be723e21b3ebe078 Author: Taekyun Kim <tkq....@samsung.com> Date: Wed Apr 13 11:57:35 2011 +0900 ARM: Enable bilinear fast paths using scanline functions in pixman-arm-neon-asm-bilinear.S Enable fast paths which is supported by scanline functions in pixman-arm-neon-asm-bilinear.S diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 77875ad..e5127a6 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -137,6 +137,23 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC, uint16_t, uint32_t) PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC, uint16_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER, + uint32_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD, + uint32_t, uint32_t) + +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC, + uint32_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC, + uint32_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC, + uint16_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_0565, SRC, + uint16_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OVER, + uint32_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD, + uint32_t, uint32_t) void pixman_composite_src_n_8_asm_neon 
(int32_t w, @@ -366,6 +383,28 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888), SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888), + + SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8_8888), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_8_0565), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_8_0565), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8_x888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), + { PIXMAN_OP_NONE }, }; commit e8185f1cb43417d9f7b1d2856bb899f1b84fde81 Author: Taekyun Kim <tkq....@samsung.com> Date: Wed Apr 13 11:48:40 2011 +0900 ARM: NEON scanline functions for bilinear scaling General fetch->combine->store based bilinear scanline functions. Need further optimizations and eventually will be replaced with optimal functions one by one. 
General functions should be located in pixman-arm-neon-asm-bilinear.S and optimal functions in pixman-arm-neon-asm.S Following general bilinear scanline functions are implemented over_8888_8888 add_8888_8888 src_8888_8_8888 src_8888_8_0565 src_0565_8_x888 src_0565_8_0565 over_8888_8_8888 add_8888_8_8888 diff --git a/pixman/Makefile.am b/pixman/Makefile.am index d016e9f..be08266 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -115,6 +115,7 @@ libpixman_arm_neon_la_SOURCES = \ pixman-arm-neon.c \ pixman-arm-common.h \ pixman-arm-neon-asm.S \ + pixman-arm-neon-asm-bilinear.S \ pixman-arm-neon-asm.h libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS) libpixman_arm_neon_la_LIBADD = $(DEP_LIBS) diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S new file mode 100644 index 0000000..9a4a1ff --- /dev/null +++ b/pixman/pixman-arm-neon-asm-bilinear.S @@ -0,0 +1,768 @@ +/* + * Copyright © 2011 SCore Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamas...@nokia.com) + * Author: Taekyun Kim (tkq....@samsung.com) + */ + +/* + * This file contains scaled bilinear scanline functions implemented + * using Siarhei's older bilinear macro template. + * + * << General scanline function procedures >> + * 1. bilinear interpolate source pixels + * 2. load mask pixels + * 3. load destination pixels + * 4. duplicate mask to fill whole register + * 5. interleave source & destination pixels + * 6. apply mask to source pixels + * 7. combine source & destination pixels + * 8. deinterleave final result + * 9. store destination pixels + * + * All registers with single number (e.g. src0, tmp0) are 64-bit registers. + * Registers with double numbers (e.g. src01, dst01) are 128-bit registers. + * All temp registers can be used freely outside the code block. + * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks. + * + * TODOs + * Support 0565 pixel format + * Optimization for two and last pixel cases + * + * Remarks + * There can be lots of pipeline stalls inside code block and between code blocks. + * Further optimizations will be done by new macro templates using head/tail_head/tail scheme. + */ + +/* Prevent the stack from becoming executable for no reason... 
*/ +#if defined(__linux__) && defined (__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +.text +.fpu neon +.arch armv7a +.object_arch armv4 /* architecture recorded in the object file's attributes */ +.eabi_attribute 10, 0 /* suppress Tag_FP_arch */ +.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ +.arm +.altmacro + +#include "pixman-arm-neon-asm.h" + +/* + * Bilinear macros from pixman-arm-neon-asm.S + */ + +/* + * Supplementary macro for setting function attributes: emits .func and + * .global for fname (plus .hidden and .type %function when __ELF__ is + * defined), followed by the fname: label. + */ +.macro pixman_asm_function fname + .func fname + .global fname +#ifdef __ELF__ + .hidden fname + .type fname, %function +#endif +fname: +.endm + +/* + * Bilinear scaling support code which tries to provide pixel fetching, color + * format conversion, and interpolation as separate macros which can be used + * as the basic building blocks for constructing bilinear scanline functions. + * + * The macros below use X, UX, TOP, BOTTOM and the TMPn scratch names without + * defining them; presumably they are register aliases set up by the code + * that expands these macros (not visible here - confirm). + * d28 and d29 are the fixed multipliers applied to the first and second + * register of each loaded pair (presumably the vertical weights - confirm). + */ + +.macro bilinear_load_8888 reg1, reg2, tmp /* tmp is not referenced in this macro */ + mov TMP2, X, asr #16 /* TMP2 = X >> 16, used as the pixel index */ + add X, X, UX /* step X by UX for the next fetch */ + add TMP1, TOP, TMP2, asl #2 /* TMP1 = TOP + index * 4 */ + add TMP2, BOTTOM, TMP2, asl #2 /* TMP2 = BOTTOM + index * 4 */ + vld1.32 {reg1}, [TMP1] /* reg1 <- data at TMP1 */ + vld1.32 {reg2}, [TMP2] /* reg2 <- data at TMP2 */ +.endm + +.macro bilinear_load_0565 reg1, reg2, tmp + mov TMP2, X, asr #16 /* TMP2 = X >> 16, used as the pixel index */ + add X, X, UX /* step X by UX for the next fetch */ + add TMP1, TOP, TMP2, asl #1 /* TMP1 = TOP + index * 2 */ + add TMP2, BOTTOM, TMP2, asl #1 /* TMP2 = BOTTOM + index * 2 */ + vld1.32 {reg2[0]}, [TMP1] /* lane 0 <- 32 bits at TMP1 */ + vld1.32 {reg2[1]}, [TMP2] /* lane 1 <- 32 bits at TMP2 */ + convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp /* helper defined outside this excerpt */ +.endm + +.macro bilinear_load_and_vertical_interpolate_two_8888 \ + acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 + + bilinear_load_8888 reg1, reg2, tmp1 + vmull.u8 acc1, reg1, d28 /* acc1 = reg1 * d28 (widening u8 multiply) */ + vmlal.u8 acc1, reg2, d29 /* acc1 += reg2 * d29 */ + bilinear_load_8888 reg3, reg4, tmp2 + vmull.u8 acc2, reg3, d28 /* acc2 = reg3 * d28 */ + vmlal.u8 acc2, reg4, d29 /* acc2 += reg4 * d29 */ +.endm + +.macro bilinear_load_and_vertical_interpolate_four_8888 \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + bilinear_load_and_vertical_interpolate_two_8888 \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi /* first expansion: the x* argument set */ + bilinear_load_and_vertical_interpolate_two_8888 \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi /* second expansion: the y* argument set */ +.endm + +.macro
bilinear_load_and_vertical_interpolate_two_0565 \ + acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + + mov TMP2, X, asr #16 /* TMP2 = X >> 16: index for the first fetch */ + add X, X, UX + mov TMP4, X, asr #16 /* TMP4 = X >> 16 after the step: index for the second fetch */ + add X, X, UX + add TMP1, TOP, TMP2, asl #1 /* TMP1 = TOP + first index * 2 */ + add TMP2, BOTTOM, TMP2, asl #1 /* TMP2 = BOTTOM + first index * 2 */ + add TMP3, TOP, TMP4, asl #1 /* TMP3 = TOP + second index * 2 */ + add TMP4, BOTTOM, TMP4, asl #1 /* TMP4 = BOTTOM + second index * 2 */ + vld1.32 {acc2lo[0]}, [TMP1] /* first fetch, TOP address */ + vld1.32 {acc2hi[0]}, [TMP3] /* second fetch, TOP address */ + vld1.32 {acc2lo[1]}, [TMP2] /* first fetch, BOTTOM address */ + vld1.32 {acc2hi[1]}, [TMP4] /* second fetch, BOTTOM address */ + convert_0565_to_x888 acc2, reg3, reg2, reg1 /* helper defined outside this excerpt; presumably acc2 is the register formed by acc2lo/acc2hi - confirm at call sites */ + vzip.u8 reg1, reg3 /* NOTE(review): reg4 is not an operand of the conversion above, yet it takes part in the zips below */ + vzip.u8 reg2, reg4 + vzip.u8 reg3, reg4 + vzip.u8 reg1, reg2 + vmull.u8 acc1, reg1, d28 /* acc1 = reg1 * d28 (widening u8 multiply) */ + vmlal.u8 acc1, reg2, d29 /* acc1 += reg2 * d29 */ + vmull.u8 acc2, reg3, d28 /* acc2 = reg3 * d28 */ + vmlal.u8 acc2, reg4, d29 /* acc2 += reg4 * d29 */ +.endm + +.macro bilinear_load_and_vertical_interpolate_four_0565 \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + mov TMP2, X, asr #16 + add X, X, UX + mov TMP4, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP2, asl #1 + add TMP2, BOTTOM, TMP2, asl #1 + add TMP3, TOP, TMP4, asl #1 + add TMP4, BOTTOM, TMP4, asl #1 + vld1.32 {xacc2lo[0]}, [TMP1] -- To UNSUBSCRIBE, email to debian-x-requ...@lists.debian.org with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org Archive: http://lists.debian.org/e1qfqdz-0006ws...@alioth.debian.org