ChangeLog | 198 +++++++ Makefile.am | 4 configure.ac | 2 debian/changelog | 6 pixman/pixman-arm-neon-asm-bilinear.S | 922 ++++++++++++++++++++++++++++------ pixman/pixman-arm-neon-asm.S | 139 +++++ pixman/pixman-arm-neon.c | 10 pixman/pixman-gradient-walker.c | 175 +----- pixman/pixman-image.c | 73 ++ pixman/pixman-noop.c | 6 pixman/pixman-private.h | 16 pixman/pixman-sse2.c | 2 12 files changed, 1242 insertions(+), 311 deletions(-)
New commits: commit 39102f8b3e7f36ad912fc95596dcd0a61ae2bab0 Author: Cyril Brulebois <k...@debian.org> Date: Tue Nov 1 12:29:25 2011 +0100 Upload to experimental. diff --git a/debian/changelog b/debian/changelog index e2f7f36..7bebde7 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,8 +1,8 @@ -pixman (0.23.8-1) UNRELEASED; urgency=low +pixman (0.23.8-1) experimental; urgency=low * New upstream release. - -- Cyril Brulebois <k...@debian.org> Tue, 01 Nov 2011 12:28:45 +0100 + -- Cyril Brulebois <k...@debian.org> Tue, 01 Nov 2011 12:29:16 +0100 pixman (0.23.6-1) experimental; urgency=low commit bfad5455b6885b09fb8a63a7384f077fc0a45741 Author: Cyril Brulebois <k...@debian.org> Date: Tue Nov 1 12:28:58 2011 +0100 Bump changelogs. diff --git a/ChangeLog b/ChangeLog index 6a10342..fa61d98 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,201 @@ +commit a0f1b565811388b0567c845b9b7063d5b93d325e +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Sat Oct 29 05:33:44 2011 -0400 + + Pre-release version bump to 0.23.8 + +commit 498138c293a2abce44ce122114852f4e6c5b87fe +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Tue Oct 25 08:45:34 2011 -0400 + + Fix use of uninitialized fields reported by valgrind + + In pixman-noop.c and pixman-sse2.c, we are accessing + image->bits.width/height without first making sure the image is a bits + image. The warning is harmless because we never act on this + information without checking that the image is a8r8g8b8, but valgrind + does warn about it. + + In pixman-noop.c, just reorder the clauses in the if statement; in + pixman-sse2.c require images to have the FAST_PATH_BITS_IMAGE flag + set. 
+ +commit 6131707e8fc39187d1d358481f7c57c57cfab206 +Merge: 3d4d705 ec7c9c2 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Thu Oct 20 09:13:12 2011 -0400 + + Merge branch 'gradients' + +commit 3d4d705d2ffa4aeab3dc02a23c2aadbea1374a3f +Author: Taekyun Kim <tkq....@samsung.com> +Date: Tue Oct 18 21:50:18 2011 +0900 + + ARM: NEON: Fix assembly typo error in src_n_8_8888 + + Binutils 2.21 does not complain about a missing comma between the ARM + register and the alignment specifier in vld/vst instructions, but the + missing comma causes a build error on binutils 2.20. + +commit 19f118f41f8725f22395d31eac5670cb350b55ec +Author: Taekyun Kim <tkq....@samsung.com> +Date: Mon Sep 26 18:33:27 2011 +0900 + + ARM: NEON: Standard fast path src_n_8_8 + + Performance numbers of before/after on cortex-a8 @ 1GHz + + - before + L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s) + + - after + L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s) + +commit 4db9e2bc13d3ed26416f249e57acec4b41f58b7f +Author: Taekyun Kim <tkq....@samsung.com> +Date: Mon Sep 26 17:03:54 2011 +0900 + + ARM: NEON: Standard fast path src_n_8_8888 + + Performance numbers of before/after on cortex-a8 @ 1GHz + + - before + L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s) + + - after + L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s) + +commit 26659de6cd2775c83a9a6e6660324d5baacf61f9 +Author: Taekyun Kim <tkq....@samsung.com> +Date: Mon Sep 26 19:04:53 2011 +0900 + + ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888 + + Instructions are reordered to eliminate pipeline stalls and get + better memory access. 
+ + Performance of before/after on cortex-a8 @ 1GHz + + << 2000 x 2000 with scale factor close to 1.x >> + before : 40.53 Mpix/s + after : 50.76 Mpix/s + +commit 4481920f405e47b3a92811a8cb06afbd37dee01b +Author: Taekyun Kim <tkq....@samsung.com> +Date: Wed Sep 21 15:52:13 2011 +0900 + + ARM: NEON: Instruction scheduling of bilinear over_8888_8888 + + Instructions are reordered to eliminate pipeline stalls and get + better memory access. + + Performance of before/after on cortex-a8 @ 1GHz + + << 2000 x 2000 with scale factor close to 1.x >> + before : 50.43 Mpix/s + after : 61.09 Mpix/s + +commit 1cd916f3a5ebeb943f66eecf0b8ce99af0b95d11 +Author: Taekyun Kim <tkq....@samsung.com> +Date: Fri Sep 23 00:03:22 2011 +0900 + + ARM: NEON: Replace old bilinear scanline generator with new template + + Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can + be replaced with new template just by wrapping existing macros. + +commit 6682b2b3597c9f431900bfe7b1b42dfbe006bae5 +Author: Taekyun Kim <tkq....@samsung.com> +Date: Tue Sep 20 21:32:35 2011 +0900 + + ARM: NEON: Bilinear macro template for instruction scheduling + + This macro template takes 6 code blocks. + + 1. process_last_pixel + 2. process_two_pixels + 3. process_four_pixels + 4. process_pixblock_head + 5. process_pixblock_tail + 6. process_pixblock_tail_head + + process_last_pixel does not need to update the horizontal weight; this + is done by the template. The two- and four-pixel code blocks must update + the horizontal weight themselves. The head/tail/tail_head blocks + make up the unrolled core loop. You can apply instruction scheduling + to the tail_head blocks. + + You can also specify the size of the pixel block. Supported sizes are 4 + and 8. If you want to use a mask, pass the BILINEAR_FLAG_USE_MASK flag + to the template; you can then use register MASK. 
When using the d8~d15 + registers, pass BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure the + registers are properly saved on the stack and later restored. 
+ +commit b5e4355fa4973e3edd4abeb11bdc47c42371cc76 +Author: Taekyun Kim <tkq....@samsung.com> +Date: Tue Sep 20 19:46:25 2011 +0900 + + ARM: NEON: Some cleanup of bilinear scanline functions + + Use STRIDE and initial horizontal weight update is done before + entering interpolation loop. Cache preload for mask and dst. + +commit ec7c9c2b6865b48b8bd14e4509538f8fcbe93463 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Fri Oct 14 09:04:48 2011 -0400 + + Simplify gradient_walker_reset() + + The code that searches for the closest color stop to the given + position is duplicated across the various repeat modes. Replace the + switch with two if/else constructions, and put the search code between + them. + +commit 2d0da8ab8d8fef60ed1bbb9d6b75f66577c3f85d +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Fri Oct 14 09:02:14 2011 -0400 + + Use sentinels instead of special casing first and last stops + + When storing the gradient stops internally, allocate two more stops, + one before the beginning of the stop list and one after the + end. Initialize those stops based on the repeat property of the + gradient. + + This allows gradient_walker_reset() to be simplified because it can + now simply pick the two closest stops to the position without special + casing the first and last stops. + +commit 84d6ca7c891601b019d4862a556ed98b7e6fe525 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Fri Oct 14 07:42:00 2011 -0400 + + gradient walker: Correct types and fix formatting + + The type of pos in gradient_walker_reset() and gradient_walker_pixel() + is pixman_fixed_48_16_t and not pixman_fixed_32_32. The types of the + positions in the walker struct are pixman_fixed_t and not int32_t, and + need_reset is a boolean, not an integer. The spread field should be + called repeat and have the type pixman_repeat_t. + + Also fix some formatting issues, make gradient_walker_reset() static, + and delete the pointless PIXMAN_GRADIENT_WALKER_NEED_RESET() macro. 
+ +commit ace225b53dee88d134753ac901f26ba3db6781da +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Tue Oct 11 16:12:24 2011 -0400 + + Add stable release / development snapshot to draft release notes + + This will hopefully serve as a reminder to me that I should put this + information in the release notes. + +commit bb7142d361d56d66ac40debb60a7c4d099764ba8 +Author: Søren Sandmann Pedersen <s...@redhat.com> +Date: Tue Oct 11 06:10:39 2011 -0400 + + Post-release version bump to 0.23.7 + commit e20ac40bd30484f0f711b52d0c1993ef08760284 Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Tue Oct 11 06:00:51 2011 -0400 diff --git a/debian/changelog b/debian/changelog index af38044..e2f7f36 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +pixman (0.23.8-1) UNRELEASED; urgency=low + + * New upstream release. + + -- Cyril Brulebois <k...@debian.org> Tue, 01 Nov 2011 12:28:45 +0100 + pixman (0.23.6-1) experimental; urgency=low [ Rico Tzschichholz ] commit a0f1b565811388b0567c845b9b7063d5b93d325e Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Sat Oct 29 05:33:44 2011 -0400 Pre-release version bump to 0.23.8 diff --git a/configure.ac b/configure.ac index 6c88c84..0552563 100644 --- a/configure.ac +++ b/configure.ac @@ -54,7 +54,7 @@ AC_PREREQ([2.57]) m4_define([pixman_major], 0) m4_define([pixman_minor], 23) -m4_define([pixman_micro], 7) +m4_define([pixman_micro], 8) m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro]) commit 498138c293a2abce44ce122114852f4e6c5b87fe Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Tue Oct 25 08:45:34 2011 -0400 Fix use of uninitialized fields reported by valgrind In pixman-noop.c and pixman-sse2.c, we are accessing image->bits.width/height without first making sure the image is a bits image. The warning is harmless because we never act on this information without checking that the image is a8r8g8b8, but valgrind does warn about it. 
In pixman-noop.c, just reorder the clauses in the if statement; in pixman-sse2.c require images to have the FAST_PATH_BITS_IMAGE flag set. diff --git a/pixman/pixman-noop.c b/pixman/pixman-noop.c index 906a491..f4012d8 100644 --- a/pixman/pixman-noop.c +++ b/pixman/pixman-noop.c @@ -76,12 +76,12 @@ noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) { iter->get_scanline = _pixman_iter_get_scanline_noop; } - else if ((iter->flags & ITER_NARROW) && + else if (image->common.extended_format_code == PIXMAN_a8r8g8b8 && + (iter->flags & ITER_NARROW) && (image->common.flags & FLAGS) == FLAGS && iter->x >= 0 && iter->y >= 0 && iter->x + iter->width <= image->bits.width && - iter->y + iter->height <= image->bits.height && - image->common.extended_format_code == PIXMAN_a8r8g8b8) + iter->y + iter->height <= image->bits.height) { iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x; diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index c419511..8adf541 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -5982,7 +5982,7 @@ sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) int height = iter->height; #define FLAGS \ - (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM) + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE) if ((iter->flags & ITER_NARROW) && (image->common.flags & FLAGS) == FLAGS && commit 3d4d705d2ffa4aeab3dc02a23c2aadbea1374a3f Author: Taekyun Kim <tkq....@samsung.com> Date: Tue Oct 18 21:50:18 2011 +0900 ARM: NEON: Fix assembly typo error in src_n_8_8888 Binutils 2.21 does not complain about a missing comma between the ARM register and the alignment specifier in vld/vst instructions, but the missing comma causes a build error on binutils 2.20. 
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index da8f054..87aae1d 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -1260,7 +1260,7 @@ generate_composite_function \ PF subges PF_CTL, PF_CTL, #0x10 vmull.u8 q11, d24, d3 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - vst4.8 {d28, d29, d30, d31}, [DST_W :128]! + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! vrsra.u16 q8, q8, #8 vrsra.u16 q9, q9, #8 vrsra.u16 q10, q10, #8 commit 19f118f41f8725f22395d31eac5670cb350b55ec Author: Taekyun Kim <tkq....@samsung.com> Date: Mon Sep 26 18:33:27 2011 +0900 ARM: NEON: Standard fast path src_n_8_8 Performance numbers of before/after on cortex-a8 @ 1GHz - before L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s) - after L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s) diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index 1db02db..da8f054 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -1292,6 +1292,72 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_src_n_8_8_process_pixblock_head + vmull.u8 q0, d24, d16 + vmull.u8 q1, d25, d16 + vmull.u8 q2, d26, d16 + vmull.u8 q3, d27, d16 + vrsra.u16 q0, q0, #8 + vrsra.u16 q1, q1, #8 + vrsra.u16 q2, q2, #8 + vrsra.u16 q3, q3, #8 +.endm + +.macro pixman_composite_src_n_8_8_process_pixblock_tail + vrshrn.u16 d28, q0, #8 + vrshrn.u16 d29, q1, #8 + vrshrn.u16 d30, q2, #8 + vrshrn.u16 d31, q3, #8 +.endm + +.macro pixman_composite_src_n_8_8_process_pixblock_tail_head + fetch_mask_pixblock + PF add PF_X, PF_X, #8 + vrshrn.u16 d28, q0, #8 + PF tst PF_CTL, #0x0F + vrshrn.u16 d29, q1, #8 + PF addne PF_X, PF_X, #8 + vrshrn.u16 d30, q2, #8 + PF subne PF_CTL, PF_CTL, #1 + vrshrn.u16 d31, q3, #8 + PF cmp PF_X, ORIG_W + vmull.u8 q0, d24, d16 + PF pld, [PF_MASK, 
PF_X, lsl #mask_bpp_shift] + vmull.u8 q1, d25, d16 + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q2, d26, d16 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q3, d27, d16 + PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vrsra.u16 q0, q0, #8 + vrsra.u16 q1, q1, #8 + vrsra.u16 q2, q2, #8 + vrsra.u16 q3, q3, #8 +.endm + +.macro pixman_composite_src_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d16[0]}, [DUMMY] + vdup.8 d16, d16[3] +.endm + +.macro pixman_composite_src_n_8_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_WRITEONLY, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_src_n_8_8_init, \ + pixman_composite_src_n_8_8_cleanup, \ + pixman_composite_src_n_8_8_process_pixblock_head, \ + pixman_composite_src_n_8_8_process_pixblock_tail, \ + pixman_composite_src_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + .macro pixman_composite_over_n_8_8888_process_pixblock_head /* expecting deinterleaved source data in {d8, d9, d10, d11} */ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 3db9adf..ca139de 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888, uint8_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888, uint8_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8, + uint8_t, 1, uint8_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888, uint32_t, 1, uint32_t, 1) @@ -295,6 +297,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888), PIXMAN_STD_FAST_PATH (SRC, solid, a8, 
a8b8g8r8, neon_composite_src_n_8_8888), PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8, neon_composite_src_n_8_8), PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8), PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565), commit 4db9e2bc13d3ed26416f249e57acec4b41f58b7f Author: Taekyun Kim <tkq....@samsung.com> Date: Mon Sep 26 17:03:54 2011 +0900 ARM: NEON: Standard fast path src_n_8_8888 Performance numbers of before/after on cortex-a8 @ 1GHz - before L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s) - after L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s) diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index 3fcd07d..1db02db 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -1219,6 +1219,79 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_src_n_8_8888_process_pixblock_head + /* expecting solid source in {d0, d1, d2, d3} */ + /* mask is in d24 (d25, d26, d27 are unused) */ + + /* in */ + vmull.u8 q8, d24, d0 + vmull.u8 q9, d24, d1 + vmull.u8 q10, d24, d2 + vmull.u8 q11, d24, d3 + vrsra.u16 q8, q8, #8 + vrsra.u16 q9, q9, #8 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 +.endm + +.macro pixman_composite_src_n_8_8888_process_pixblock_tail + vrshrn.u16 d28, q8, #8 + vrshrn.u16 d29, q9, #8 + vrshrn.u16 d30, q10, #8 + vrshrn.u16 d31, q11, #8 +.endm + +.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head + fetch_mask_pixblock + PF add PF_X, PF_X, #8 + vrshrn.u16 d28, q8, #8 + PF tst PF_CTL, #0x0F + vrshrn.u16 d29, q9, #8 + PF addne PF_X, PF_X, #8 + vrshrn.u16 d30, q10, #8 + PF subne PF_CTL, PF_CTL, #1 + vrshrn.u16 d31, q11, #8 + PF cmp PF_X, ORIG_W + vmull.u8 q8, d24, d0 + PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 
+ vmull.u8 q9, d24, d1 + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q10, d24, d2 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q11, d24, d3 + PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + vst4.8 {d28, d29, d30, d31}, [DST_W :128]! + vrsra.u16 q8, q8, #8 + vrsra.u16 q9, q9, #8 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 +.endm + +.macro pixman_composite_src_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_src_n_8_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_src_n_8_8888_init, \ + pixman_composite_src_n_8_8888_cleanup, \ + pixman_composite_src_n_8_8888_process_pixblock_head, \ + pixman_composite_src_n_8_8888_process_pixblock_tail, \ + pixman_composite_src_n_8_8888_process_pixblock_tail_head, \ + +/******************************************************************************/ + .macro pixman_composite_over_n_8_8888_process_pixblock_head /* expecting deinterleaved source data in {d8, d9, d10, d11} */ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index effb50b..3db9adf 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -90,6 +90,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8, uint8_t, 1, uint8_t, 1) PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888, uint8_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888, + uint8_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888, uint32_t, 1, uint32_t, 1) @@ -289,6 +291,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = PIXMAN_STD_FAST_PATH 
(SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888), PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888), PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8), PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565), PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565), commit 26659de6cd2775c83a9a6e6660324d5baacf61f9 Author: Taekyun Kim <tkq....@samsung.com> Date: Mon Sep 26 19:04:53 2011 +0900 ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888 Instructions are reordered to eliminate pipeline stalls and get better memory access. 
Performance of before/after on cortex-a8 @ 1GHz << 2000 x 2000 with scale factor close to 1.x >> before : 40.53 Mpix/s after : 50.76 Mpix/s diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S index 82d248e..f7913ad 100644 --- a/pixman/pixman-arm-neon-asm-bilinear.S +++ b/pixman/pixman-arm-neon-asm-bilinear.S @@ -949,7 +949,7 @@ pixman_asm_function fname vshrn.u32 d0, q0, #16 vshrn.u32 d1, q1, #16 vld1.32 {d2, d3}, [OUT, :128] - pld [OUT, PF_OFFS] + pld [OUT, #(prefetch_offset * 4)] vshrn.u32 d4, q2, #16 vshr.u16 q15, q12, #8 vshrn.u32 d5, q3, #16 @@ -1061,15 +1061,169 @@ pixman_asm_function fname .endm .macro bilinear_over_8888_8_8888_process_pixblock_head - bilinear_over_8888_8_8888_process_four_pixels + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 + vld1.32 {d0}, [TMP1], STRIDE + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #2 + vld1.32 {d1}, [TMP1] + mov TMP3, X, asr #16 + add X, X, UX + add TMP3, TOP, TMP3, asl #2 + vld1.32 {d2}, [TMP2], STRIDE + mov TMP4, X, asr #16 + add X, X, UX + add TMP4, TOP, TMP4, asl #2 + vld1.32 {d3}, [TMP2] + vmull.u8 q2, d0, d28 + vmull.u8 q3, d2, d28 + vmlal.u8 q2, d1, d29 + vmlal.u8 q3, d3, d29 + vshll.u16 q0, d4, #8 + vshll.u16 q1, d6, #8 + vmlsl.u16 q0, d4, d30 + vmlsl.u16 q1, d6, d31 + vmlal.u16 q0, d5, d30 + vmlal.u16 q1, d7, d31 + vshrn.u32 d0, q0, #16 + vshrn.u32 d1, q1, #16 + vld1.32 {d2}, [TMP3], STRIDE + vld1.32 {d3}, [TMP3] + pld [TMP4, PF_OFFS] + vld1.32 {d4}, [TMP4], STRIDE + vld1.32 {d5}, [TMP4] + pld [TMP4, PF_OFFS] + vmull.u8 q3, d2, d28 + vmlal.u8 q3, d3, d29 + vmull.u8 q1, d4, d28 + vmlal.u8 q1, d5, d29 + vshr.u16 q15, q12, #8 + vld1.32 {d22[0]}, [MASK]! 
+ pld [MASK, #prefetch_offset] + vadd.u16 q12, q12, q13 + vmovn.u16 d16, q0 .endm .macro bilinear_over_8888_8_8888_process_pixblock_tail + vshll.u16 q9, d6, #8 + vshll.u16 q10, d2, #8 + vmlsl.u16 q9, d6, d30 + vmlsl.u16 q10, d2, d31 + vmlal.u16 q9, d7, d30 + vmlal.u16 q10, d3, d31 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 + vdup.32 d22, d22[0] + vshrn.u32 d18, q9, #16 + vshrn.u32 d19, q10, #16 + vmovn.u16 d17, q9 + vld1.32 {d18, d19}, [OUT, :128] + pld [OUT, PF_OFFS] + vuzp.8 d16, d17 + vuzp.8 d18, d19 + vuzp.8 d16, d17 + vuzp.8 d18, d19 + vmull.u8 q10, d16, d22 + vmull.u8 q11, d17, d22 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 + vrshrn.u16 d16, q10, #8 + vrshrn.u16 d17, q11, #8 + vdup.32 d22, d17[1] + vmvn.8 d22, d22 + vmull.u8 q10, d18, d22 + vmull.u8 q11, d19, d22 + vrshr.u16 q9, q10, #8 + vrshr.u16 q0, q11, #8 + vraddhn.u16 d18, q9, q10 + vraddhn.u16 d19, q0, q11 + vqadd.u8 q9, q8, q9 + vuzp.8 d18, d19 + vuzp.8 d18, d19 + vst1.32 {d18, d19}, [OUT, :128]! .endm .macro bilinear_over_8888_8_8888_process_pixblock_tail_head - bilinear_over_8888_8_8888_process_pixblock_tail - bilinear_over_8888_8_8888_process_pixblock_head + vshll.u16 q9, d6, #8 + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 + vshll.u16 q10, d2, #8 + vld1.32 {d0}, [TMP1], STRIDE + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #2 + vmlsl.u16 q9, d6, d30 + vmlsl.u16 q10, d2, d31 + vld1.32 {d1}, [TMP1] + mov TMP3, X, asr #16 + add X, X, UX + add TMP3, TOP, TMP3, asl #2 + vmlal.u16 q9, d7, d30 + vmlal.u16 q10, d3, d31 + vld1.32 {d2}, [TMP2], STRIDE + mov TMP4, X, asr #16 + add X, X, UX + add TMP4, TOP, TMP4, asl #2 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 + vld1.32 {d3}, [TMP2] + vdup.32 d22, d22[0] + vshrn.u32 d18, q9, #16 + vshrn.u32 d19, q10, #16 + vmull.u8 q2, d0, d28 + vmull.u8 q3, d2, d28 + vmovn.u16 d17, q9 + vld1.32 {d18, d19}, [OUT, :128] + pld [OUT, #(prefetch_offset * 4)] + vmlal.u8 q2, d1, d29 + vmlal.u8 q3, d3, d29 + vuzp.8 d16, 
d17 + vuzp.8 d18, d19 + vshll.u16 q0, d4, #8 + vshll.u16 q1, d6, #8 + vuzp.8 d16, d17 + vuzp.8 d18, d19 + vmlsl.u16 q0, d4, d30 + vmlsl.u16 q1, d6, d31 + vmull.u8 q10, d16, d22 + vmull.u8 q11, d17, d22 + vmlal.u16 q0, d5, d30 + vmlal.u16 q1, d7, d31 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 + vshrn.u32 d0, q0, #16 + vshrn.u32 d1, q1, #16 + vrshrn.u16 d16, q10, #8 + vrshrn.u16 d17, q11, #8 + vld1.32 {d2}, [TMP3], STRIDE + vdup.32 d22, d17[1] + vld1.32 {d3}, [TMP3] + vmvn.8 d22, d22 + pld [TMP4, PF_OFFS] + vld1.32 {d4}, [TMP4], STRIDE + vmull.u8 q10, d18, d22 + vmull.u8 q11, d19, d22 + vld1.32 {d5}, [TMP4] + pld [TMP4, PF_OFFS] + vmull.u8 q3, d2, d28 + vrshr.u16 q9, q10, #8 + vrshr.u16 q15, q11, #8 + vmlal.u8 q3, d3, d29 + vmull.u8 q1, d4, d28 + vraddhn.u16 d18, q9, q10 + vraddhn.u16 d19, q15, q11 + vmlal.u8 q1, d5, d29 + vshr.u16 q15, q12, #8 + vqadd.u8 q9, q8, q9 + vld1.32 {d22[0]}, [MASK]! + vuzp.8 d18, d19 + vadd.u16 q12, q12, q13 + vuzp.8 d18, d19 + vmovn.u16 d16, q0 + vst1.32 {d18, d19}, [OUT, :128]! .endm /* add_8888_8888 */ commit 4481920f405e47b3a92811a8cb06afbd37dee01b Author: Taekyun Kim <tkq....@samsung.com> Date: Wed Sep 21 15:52:13 2011 +0900 ARM: NEON: Instruction scheduling of bilinear over_8888_8888 Instructions are reordered to eliminate pipeline stalls and get better memory access. 
Performance of before/after on cortex-a8 @ 1GHz << 2000 x 2000 with scale factor close to 1.x >> before : 50.43 Mpix/s after : 61.09 Mpix/s diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S index 25bcb24..82d248e 100644 --- a/pixman/pixman-arm-neon-asm-bilinear.S +++ b/pixman/pixman-arm-neon-asm-bilinear.S @@ -893,15 +893,158 @@ pixman_asm_function fname .endm .macro bilinear_over_8888_8888_process_pixblock_head - bilinear_over_8888_8888_process_four_pixels + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #2 + + vld1.32 {d22}, [TMP1], STRIDE + vld1.32 {d23}, [TMP1] + mov TMP3, X, asr #16 + add X, X, UX + add TMP3, TOP, TMP3, asl #2 + vmull.u8 q8, d22, d28 + vmlal.u8 q8, d23, d29 + + vld1.32 {d22}, [TMP2], STRIDE + vld1.32 {d23}, [TMP2] + mov TMP4, X, asr #16 + add X, X, UX + add TMP4, TOP, TMP4, asl #2 + vmull.u8 q9, d22, d28 + vmlal.u8 q9, d23, d29 + + vld1.32 {d22}, [TMP3], STRIDE + vld1.32 {d23}, [TMP3] + vmull.u8 q10, d22, d28 + vmlal.u8 q10, d23, d29 + + vshll.u16 q0, d16, #8 + vmlsl.u16 q0, d16, d30 + vmlal.u16 q0, d17, d30 + + pld [TMP4, PF_OFFS] + vld1.32 {d16}, [TMP4], STRIDE + vld1.32 {d17}, [TMP4] + pld [TMP4, PF_OFFS] + vmull.u8 q11, d16, d28 + vmlal.u8 q11, d17, d29 + + vshll.u16 q1, d18, #8 + vmlsl.u16 q1, d18, d31 + vmlal.u16 q1, d19, d31 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 .endm .macro bilinear_over_8888_8888_process_pixblock_tail + vshll.u16 q2, d20, #8 + vmlsl.u16 q2, d20, d30 + vmlal.u16 q2, d21, d30 + vshll.u16 q3, d22, #8 + vmlsl.u16 q3, d22, d31 + vmlal.u16 q3, d23, d31 + vshrn.u32 d0, q0, #16 + vshrn.u32 d1, q1, #16 + vld1.32 {d2, d3}, [OUT, :128] + pld [OUT, PF_OFFS] + vshrn.u32 d4, q2, #16 + vshr.u16 q15, q12, #8 + vshrn.u32 d5, q3, #16 + vmovn.u16 d6, q0 + vmovn.u16 d7, q2 + vuzp.8 d6, d7 + vuzp.8 d2, d3 + vuzp.8 d6, d7 + vuzp.8 d2, d3 + vdup.32 d4, d7[1] + vmvn.8 d4, d4 + vmull.u8 q11, d2, d4 + vmull.u8 
q2, d3, d4 + vrshr.u16 q1, q11, #8 + vrshr.u16 q10, q2, #8 + vraddhn.u16 d2, q1, q11 + vraddhn.u16 d3, q10, q2 + vqadd.u8 q3, q1, q3 + vuzp.8 d6, d7 + vuzp.8 d6, d7 + vadd.u16 q12, q12, q13 + vst1.32 {d6, d7}, [OUT, :128]! .endm .macro bilinear_over_8888_8888_process_pixblock_tail_head - bilinear_over_8888_8888_process_pixblock_tail - bilinear_over_8888_8888_process_pixblock_head + vshll.u16 q2, d20, #8 + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 + vmlsl.u16 q2, d20, d30 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #2 + vmlal.u16 q2, d21, d30 + vshll.u16 q3, d22, #8 + vld1.32 {d20}, [TMP1], STRIDE + vmlsl.u16 q3, d22, d31 + vmlal.u16 q3, d23, d31 + vld1.32 {d21}, [TMP1] + vmull.u8 q8, d20, d28 + vmlal.u8 q8, d21, d29 + vshrn.u32 d0, q0, #16 + vshrn.u32 d1, q1, #16 + vld1.32 {d2, d3}, [OUT, :128] + pld [OUT, PF_OFFS] + vshrn.u32 d4, q2, #16 + vshr.u16 q15, q12, #8 + vld1.32 {d22}, [TMP2], STRIDE + vshrn.u32 d5, q3, #16 + vmovn.u16 d6, q0 + vld1.32 {d23}, [TMP2] + vmull.u8 q9, d22, d28 + mov TMP3, X, asr #16 + add X, X, UX + add TMP3, TOP, TMP3, asl #2 + mov TMP4, X, asr #16 + add X, X, UX + add TMP4, TOP, TMP4, asl #2 + vmlal.u8 q9, d23, d29 + vmovn.u16 d7, q2 + vld1.32 {d22}, [TMP3], STRIDE + vuzp.8 d6, d7 + vuzp.8 d2, d3 + vuzp.8 d6, d7 + vuzp.8 d2, d3 + vdup.32 d4, d7[1] + vld1.32 {d23}, [TMP3] + vmvn.8 d4, d4 + vmull.u8 q10, d22, d28 + vmlal.u8 q10, d23, d29 + vmull.u8 q11, d2, d4 + vmull.u8 q2, d3, d4 + vshll.u16 q0, d16, #8 + vmlsl.u16 q0, d16, d30 + vrshr.u16 q1, q11, #8 + vmlal.u16 q0, d17, d30 + vrshr.u16 q8, q2, #8 + vraddhn.u16 d2, q1, q11 + vraddhn.u16 d3, q8, q2 + pld [TMP4, PF_OFFS] + vld1.32 {d16}, [TMP4], STRIDE + vqadd.u8 q3, q1, q3 + vld1.32 {d17}, [TMP4] + pld [TMP4, PF_OFFS] + vmull.u8 q11, d16, d28 + vmlal.u8 q11, d17, d29 + vuzp.8 d6, d7 + vshll.u16 q1, d18, #8 + vuzp.8 d6, d7 + vmlsl.u16 q1, d18, d31 + vadd.u16 q12, q12, q13 + vmlal.u16 q1, d19, d31 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 + 
vst1.32 {d6, d7}, [OUT, :128]! .endm /* over_8888_8_8888 */ commit 1cd916f3a5ebeb943f66eecf0b8ce99af0b95d11 Author: Taekyun Kim <tkq....@samsung.com> Date: Fri Sep 23 00:03:22 2011 +0900 ARM: NEON: Replace old bilinear scanline generator with new template Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can be replaced with new template just by wrapping existing macros. diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S index 784e5df..25bcb24 100644 --- a/pixman/pixman-arm-neon-asm-bilinear.S +++ b/pixman/pixman-arm-neon-asm-bilinear.S @@ -582,198 +582,6 @@ fname: bilinear_store_&dst_fmt 4, q2, q3 .endm -.macro generate_bilinear_scanline_func_src_dst \ - fname, src_fmt, dst_fmt, op, \ - bpp_shift, prefetch_distance - -- To UNSUBSCRIBE, email to debian-x-requ...@lists.debian.org with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org Archive: http://lists.debian.org/e1rldck-0002zi...@vasks.debian.org