Makefile.am | 4 configure.ac | 2 pixman/pixman-arm-neon-asm-bilinear.S | 922 ++++++++++++++++++++++++++++------ pixman/pixman-arm-neon-asm.S | 139 +++++ pixman/pixman-arm-neon.c | 10 pixman/pixman-gradient-walker.c | 175 +----- pixman/pixman-image.c | 73 ++ pixman/pixman-noop.c | 6 pixman/pixman-private.h | 16 pixman/pixman-sse2.c | 2 10 files changed, 1038 insertions(+), 311 deletions(-)
New commits: commit a0f1b565811388b0567c845b9b7063d5b93d325e Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Sat Oct 29 05:33:44 2011 -0400 Pre-release version bump to 0.23.8 diff --git a/configure.ac b/configure.ac index 6c88c84..0552563 100644 --- a/configure.ac +++ b/configure.ac @@ -54,7 +54,7 @@ AC_PREREQ([2.57]) m4_define([pixman_major], 0) m4_define([pixman_minor], 23) -m4_define([pixman_micro], 7) +m4_define([pixman_micro], 8) m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro]) commit 498138c293a2abce44ce122114852f4e6c5b87fe Author: Søren Sandmann Pedersen <s...@redhat.com> Date: Tue Oct 25 08:45:34 2011 -0400 Fix use of uninitialized fields reported by valgrind In pixman-noop.c and pixman-sse2.c, we are accessing image->bits.width/height without first making sure the image is a bits image. The warning is harmless because we never act on this information without checking that the image is a8r8g8b8, but valgrind does warn about it. In pixman-noop.c, just reorder the clauses in the if statement; in pixman-sse2.c require images to have the FAST_PATH_BITS_IMAGE flag set. diff --git a/pixman/pixman-noop.c b/pixman/pixman-noop.c index 906a491..f4012d8 100644 --- a/pixman/pixman-noop.c +++ b/pixman/pixman-noop.c @@ -76,12 +76,12 @@ noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) { iter->get_scanline = _pixman_iter_get_scanline_noop; } - else if ((iter->flags & ITER_NARROW) && + else if (image->common.extended_format_code == PIXMAN_a8r8g8b8 && + (iter->flags & ITER_NARROW) && (image->common.flags & FLAGS) == FLAGS && iter->x >= 0 && iter->y >= 0 && iter->x + iter->width <= image->bits.width && - iter->y + iter->height <= image->bits.height && - image->common.extended_format_code == PIXMAN_a8r8g8b8) + iter->y + iter->height <= image->bits.height) { iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x; diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index c419511..8adf541 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -5982,7 +5982,7 @@ sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) int height = iter->height; #define FLAGS \ - (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM) + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE) if ((iter->flags & ITER_NARROW) && (image->common.flags & FLAGS) == FLAGS && commit 3d4d705d2ffa4aeab3dc02a23c2aadbea1374a3f Author: Taekyun Kim <tkq....@samsung.com> Date: Tue Oct 18 21:50:18 2011 +0900 ARM: NEON: Fix assembly typo error in src_n_8_8888 Binutils 2.21 does not complain about missing comma between ARM register and alignement specifier in vld/vst instructions which causes build error on binutils 2.20. diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index da8f054..87aae1d 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -1260,7 +1260,7 @@ generate_composite_function \ PF subges PF_CTL, PF_CTL, #0x10 vmull.u8 q11, d24, d3 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - vst4.8 {d28, d29, d30, d31}, [DST_W :128]! + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! vrsra.u16 q8, q8, #8 vrsra.u16 q9, q9, #8 vrsra.u16 q10, q10, #8 commit 19f118f41f8725f22395d31eac5670cb350b55ec Author: Taekyun Kim <tkq....@samsung.com> Date: Mon Sep 26 18:33:27 2011 +0900 ARM: NEON: Standard fast path src_n_8_8 Performance numbers of before/after on cortex-a8 @ 1GHz - before L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s) - after L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s) diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index 1db02db..da8f054 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -1292,6 +1292,72 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_src_n_8_8_process_pixblock_head + vmull.u8 q0, d24, d16 + vmull.u8 q1, d25, d16 + vmull.u8 q2, d26, d16 + vmull.u8 q3, d27, d16 + vrsra.u16 q0, q0, #8 + vrsra.u16 q1, q1, #8 + vrsra.u16 q2, q2, #8 + vrsra.u16 q3, q3, #8 +.endm + +.macro pixman_composite_src_n_8_8_process_pixblock_tail + vrshrn.u16 d28, q0, #8 + vrshrn.u16 d29, q1, #8 + vrshrn.u16 d30, q2, #8 + vrshrn.u16 d31, q3, #8 +.endm + +.macro pixman_composite_src_n_8_8_process_pixblock_tail_head + fetch_mask_pixblock + PF add PF_X, PF_X, #8 + vrshrn.u16 d28, q0, #8 + PF tst PF_CTL, #0x0F + vrshrn.u16 d29, q1, #8 + PF addne PF_X, PF_X, #8 + vrshrn.u16 d30, q2, #8 + PF subne PF_CTL, PF_CTL, #1 + vrshrn.u16 d31, q3, #8 + PF cmp PF_X, ORIG_W + vmull.u8 q0, d24, d16 + PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] + vmull.u8 q1, d25, d16 + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q2, d26, d16 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q3, d27, d16 + PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vrsra.u16 q0, q0, #8 + vrsra.u16 q1, q1, #8 + vrsra.u16 q2, q2, #8 + vrsra.u16 q3, q3, #8 +.endm + +.macro pixman_composite_src_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d16[0]}, [DUMMY] + vdup.8 d16, d16[3] +.endm + +.macro pixman_composite_src_n_8_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_WRITEONLY, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_src_n_8_8_init, \ + pixman_composite_src_n_8_8_cleanup, \ + pixman_composite_src_n_8_8_process_pixblock_head, \ + pixman_composite_src_n_8_8_process_pixblock_tail, \ + pixman_composite_src_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + .macro pixman_composite_over_n_8_8888_process_pixblock_head /* expecting deinterleaved source data in {d8, d9, d10, d11} */ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 3db9adf..ca139de 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888, uint8_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888, uint8_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8, + uint8_t, 1, uint8_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888, uint32_t, 1, uint32_t, 1) @@ -295,6 +297,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888), PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888), PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8, neon_composite_src_n_8_8), PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8), PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565), commit 4db9e2bc13d3ed26416f249e57acec4b41f58b7f Author: Taekyun Kim <tkq....@samsung.com> Date: Mon Sep 26 17:03:54 2011 +0900 ARM: NEON: Standard fast path src_n_8_8888 Performance numbers of before/after on cortex-a8 @ 1GHz - before L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s) - after L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s) diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index 3fcd07d..1db02db 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -1219,6 +1219,79 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_src_n_8_8888_process_pixblock_head + /* expecting solid source in {d0, d1, d2, d3} */ + /* mask is in d24 (d25, d26, d27 are unused) */ + + /* in */ + vmull.u8 q8, d24, d0 + vmull.u8 q9, d24, d1 + vmull.u8 q10, d24, d2 + vmull.u8 q11, d24, d3 + vrsra.u16 q8, q8, #8 + vrsra.u16 q9, q9, #8 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 +.endm + +.macro pixman_composite_src_n_8_8888_process_pixblock_tail + vrshrn.u16 d28, q8, #8 + vrshrn.u16 d29, q9, #8 + vrshrn.u16 d30, q10, #8 + vrshrn.u16 d31, q11, #8 +.endm + +.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head + fetch_mask_pixblock + PF add PF_X, PF_X, #8 + vrshrn.u16 d28, q8, #8 + PF tst PF_CTL, #0x0F + vrshrn.u16 d29, q9, #8 + PF addne PF_X, PF_X, #8 + vrshrn.u16 d30, q10, #8 + PF subne PF_CTL, PF_CTL, #1 + vrshrn.u16 d31, q11, #8 + PF cmp PF_X, ORIG_W + vmull.u8 q8, d24, d0 + PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] + vmull.u8 q9, d24, d1 + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q10, d24, d2 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q11, d24, d3 + PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + vst4.8 {d28, d29, d30, d31}, [DST_W :128]! + vrsra.u16 q8, q8, #8 + vrsra.u16 q9, q9, #8 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 +.endm + +.macro pixman_composite_src_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_src_n_8_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_src_n_8_8888_init, \ + pixman_composite_src_n_8_8888_cleanup, \ + pixman_composite_src_n_8_8888_process_pixblock_head, \ + pixman_composite_src_n_8_8888_process_pixblock_tail, \ + pixman_composite_src_n_8_8888_process_pixblock_tail_head, \ + +/******************************************************************************/ + .macro pixman_composite_over_n_8_8888_process_pixblock_head /* expecting deinterleaved source data in {d8, d9, d10, d11} */ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index effb50b..3db9adf 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -90,6 +90,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8, uint8_t, 1, uint8_t, 1) PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888, uint8_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888, + uint8_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888, uint32_t, 1, uint32_t, 1) @@ -289,6 +291,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888), PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888), PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8), PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565), PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565), commit 26659de6cd2775c83a9a6e6660324d5baacf61f9 Author: Taekyun Kim <tkq....@samsung.com> Date: Mon Sep 26 19:04:53 2011 +0900 ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888 Instructions are reordered to eliminate pipeline stalls and get better memory access. Performance of before/after on cortex-a8 @ 1GHz << 2000 x 2000 with scale factor close to 1.x >> before : 40.53 Mpix/s after : 50.76 Mpix/s diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S index 82d248e..f7913ad 100644 --- a/pixman/pixman-arm-neon-asm-bilinear.S +++ b/pixman/pixman-arm-neon-asm-bilinear.S @@ -949,7 +949,7 @@ pixman_asm_function fname vshrn.u32 d0, q0, #16 vshrn.u32 d1, q1, #16 vld1.32 {d2, d3}, [OUT, :128] - pld [OUT, PF_OFFS] + pld [OUT, #(prefetch_offset * 4)] vshrn.u32 d4, q2, #16 vshr.u16 q15, q12, #8 vshrn.u32 d5, q3, #16 @@ -1061,15 +1061,169 @@ pixman_asm_function fname .endm .macro bilinear_over_8888_8_8888_process_pixblock_head - bilinear_over_8888_8_8888_process_four_pixels + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 + vld1.32 {d0}, [TMP1], STRIDE + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #2 + vld1.32 {d1}, [TMP1] + mov TMP3, X, asr #16 + add X, X, UX + add TMP3, TOP, TMP3, asl #2 + vld1.32 {d2}, [TMP2], STRIDE + mov TMP4, X, asr #16 + add X, X, UX + add TMP4, TOP, TMP4, asl #2 + vld1.32 {d3}, [TMP2] + vmull.u8 q2, d0, d28 + vmull.u8 q3, d2, d28 + vmlal.u8 q2, d1, d29 + vmlal.u8 q3, d3, d29 + vshll.u16 q0, d4, #8 + vshll.u16 q1, d6, #8 + vmlsl.u16 q0, d4, d30 + vmlsl.u16 q1, d6, d31 + vmlal.u16 q0, d5, d30 + vmlal.u16 q1, d7, d31 + vshrn.u32 d0, q0, #16 + vshrn.u32 d1, q1, #16 + vld1.32 {d2}, [TMP3], STRIDE + vld1.32 {d3}, [TMP3] + pld [TMP4, PF_OFFS] + vld1.32 {d4}, [TMP4], STRIDE + vld1.32 {d5}, [TMP4] + pld [TMP4, PF_OFFS] + vmull.u8 q3, d2, d28 + vmlal.u8 q3, d3, d29 + vmull.u8 q1, d4, d28 + vmlal.u8 q1, d5, d29 + vshr.u16 q15, q12, #8 + vld1.32 {d22[0]}, [MASK]! + pld [MASK, #prefetch_offset] + vadd.u16 q12, q12, q13 + vmovn.u16 d16, q0 .endm .macro bilinear_over_8888_8_8888_process_pixblock_tail + vshll.u16 q9, d6, #8 + vshll.u16 q10, d2, #8 + vmlsl.u16 q9, d6, d30 + vmlsl.u16 q10, d2, d31 + vmlal.u16 q9, d7, d30 + vmlal.u16 q10, d3, d31 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 + vdup.32 d22, d22[0] + vshrn.u32 d18, q9, #16 + vshrn.u32 d19, q10, #16 + vmovn.u16 d17, q9 + vld1.32 {d18, d19}, [OUT, :128] + pld [OUT, PF_OFFS] + vuzp.8 d16, d17 + vuzp.8 d18, d19 + vuzp.8 d16, d17 + vuzp.8 d18, d19 + vmull.u8 q10, d16, d22 + vmull.u8 q11, d17, d22 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 + vrshrn.u16 d16, q10, #8 + vrshrn.u16 d17, q11, #8 + vdup.32 d22, d17[1] + vmvn.8 d22, d22 + vmull.u8 q10, d18, d22 + vmull.u8 q11, d19, d22 + vrshr.u16 q9, q10, #8 + vrshr.u16 q0, q11, #8 + vraddhn.u16 d18, q9, q10 + vraddhn.u16 d19, q0, q11 + vqadd.u8 q9, q8, q9 + vuzp.8 d18, d19 + vuzp.8 d18, d19 + vst1.32 {d18, d19}, [OUT, :128]! .endm .macro bilinear_over_8888_8_8888_process_pixblock_tail_head - bilinear_over_8888_8_8888_process_pixblock_tail - bilinear_over_8888_8_8888_process_pixblock_head + vshll.u16 q9, d6, #8 + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 + vshll.u16 q10, d2, #8 + vld1.32 {d0}, [TMP1], STRIDE + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #2 + vmlsl.u16 q9, d6, d30 + vmlsl.u16 q10, d2, d31 + vld1.32 {d1}, [TMP1] + mov TMP3, X, asr #16 + add X, X, UX + add TMP3, TOP, TMP3, asl #2 + vmlal.u16 q9, d7, d30 + vmlal.u16 q10, d3, d31 + vld1.32 {d2}, [TMP2], STRIDE + mov TMP4, X, asr #16 + add X, X, UX + add TMP4, TOP, TMP4, asl #2 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 + vld1.32 {d3}, [TMP2] + vdup.32 d22, d22[0] + vshrn.u32 d18, q9, #16 + vshrn.u32 d19, q10, #16 + vmull.u8 q2, d0, d28 + vmull.u8 q3, d2, d28 + vmovn.u16 d17, q9 + vld1.32 {d18, d19}, [OUT, :128] + pld [OUT, #(prefetch_offset * 4)] + vmlal.u8 q2, d1, d29 + vmlal.u8 q3, d3, d29 + vuzp.8 d16, d17 + vuzp.8 d18, d19 + vshll.u16 q0, d4, #8 + vshll.u16 q1, d6, #8 + vuzp.8 d16, d17 + vuzp.8 d18, d19 + vmlsl.u16 q0, d4, d30 + vmlsl.u16 q1, d6, d31 + vmull.u8 q10, d16, d22 + vmull.u8 q11, d17, d22 + vmlal.u16 q0, d5, d30 + vmlal.u16 q1, d7, d31 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 + vshrn.u32 d0, q0, #16 + vshrn.u32 d1, q1, #16 + vrshrn.u16 d16, q10, #8 + vrshrn.u16 d17, q11, #8 + vld1.32 {d2}, [TMP3], STRIDE + vdup.32 d22, d17[1] + vld1.32 {d3}, [TMP3] + vmvn.8 d22, d22 + pld [TMP4, PF_OFFS] + vld1.32 {d4}, [TMP4], STRIDE + vmull.u8 q10, d18, d22 + vmull.u8 q11, d19, d22 + vld1.32 {d5}, [TMP4] + pld [TMP4, PF_OFFS] + vmull.u8 q3, d2, d28 + vrshr.u16 q9, q10, #8 + vrshr.u16 q15, q11, #8 + vmlal.u8 q3, d3, d29 + vmull.u8 q1, d4, d28 + vraddhn.u16 d18, q9, q10 + vraddhn.u16 d19, q15, q11 + vmlal.u8 q1, d5, d29 + vshr.u16 q15, q12, #8 + vqadd.u8 q9, q8, q9 + vld1.32 {d22[0]}, [MASK]! + vuzp.8 d18, d19 + vadd.u16 q12, q12, q13 + vuzp.8 d18, d19 + vmovn.u16 d16, q0 + vst1.32 {d18, d19}, [OUT, :128]! .endm /* add_8888_8888 */ commit 4481920f405e47b3a92811a8cb06afbd37dee01b Author: Taekyun Kim <tkq....@samsung.com> Date: Wed Sep 21 15:52:13 2011 +0900 ARM: NEON: Instruction scheduling of bilinear over_8888_8888 Instructions are reordered to eliminate pipeline stalls and get better memory access. Performance of before/after on cortex-a8 @ 1GHz << 2000 x 2000 with scale factor close to 1.x >> before : 50.43 Mpix/s after : 61.09 Mpix/s diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S index 25bcb24..82d248e 100644 --- a/pixman/pixman-arm-neon-asm-bilinear.S +++ b/pixman/pixman-arm-neon-asm-bilinear.S @@ -893,15 +893,158 @@ pixman_asm_function fname .endm .macro bilinear_over_8888_8888_process_pixblock_head - bilinear_over_8888_8888_process_four_pixels + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #2 + + vld1.32 {d22}, [TMP1], STRIDE + vld1.32 {d23}, [TMP1] + mov TMP3, X, asr #16 + add X, X, UX + add TMP3, TOP, TMP3, asl #2 + vmull.u8 q8, d22, d28 + vmlal.u8 q8, d23, d29 + + vld1.32 {d22}, [TMP2], STRIDE + vld1.32 {d23}, [TMP2] + mov TMP4, X, asr #16 + add X, X, UX + add TMP4, TOP, TMP4, asl #2 + vmull.u8 q9, d22, d28 + vmlal.u8 q9, d23, d29 + + vld1.32 {d22}, [TMP3], STRIDE + vld1.32 {d23}, [TMP3] + vmull.u8 q10, d22, d28 + vmlal.u8 q10, d23, d29 + + vshll.u16 q0, d16, #8 + vmlsl.u16 q0, d16, d30 + vmlal.u16 q0, d17, d30 + + pld [TMP4, PF_OFFS] + vld1.32 {d16}, [TMP4], STRIDE + vld1.32 {d17}, [TMP4] + pld [TMP4, PF_OFFS] + vmull.u8 q11, d16, d28 + vmlal.u8 q11, d17, d29 + + vshll.u16 q1, d18, #8 + vmlsl.u16 q1, d18, d31 + vmlal.u16 q1, d19, d31 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 .endm .macro bilinear_over_8888_8888_process_pixblock_tail + vshll.u16 q2, d20, #8 + vmlsl.u16 q2, d20, d30 + vmlal.u16 q2, d21, d30 + vshll.u16 q3, d22, #8 + vmlsl.u16 q3, d22, d31 + vmlal.u16 q3, d23, d31 + vshrn.u32 d0, q0, #16 + vshrn.u32 d1, q1, #16 + vld1.32 {d2, d3}, [OUT, :128] + pld [OUT, PF_OFFS] + vshrn.u32 d4, q2, #16 + vshr.u16 q15, q12, #8 + vshrn.u32 d5, q3, #16 + vmovn.u16 d6, q0 + vmovn.u16 d7, q2 + vuzp.8 d6, d7 + vuzp.8 d2, d3 + vuzp.8 d6, d7 + vuzp.8 d2, d3 + vdup.32 d4, d7[1] + vmvn.8 d4, d4 + vmull.u8 q11, d2, d4 + vmull.u8 q2, d3, d4 + vrshr.u16 q1, q11, #8 + vrshr.u16 q10, q2, #8 + vraddhn.u16 d2, q1, q11 + vraddhn.u16 d3, q10, q2 + vqadd.u8 q3, q1, q3 + vuzp.8 d6, d7 + vuzp.8 d6, d7 + vadd.u16 q12, q12, q13 + vst1.32 {d6, d7}, [OUT, :128]! .endm .macro bilinear_over_8888_8888_process_pixblock_tail_head - bilinear_over_8888_8888_process_pixblock_tail - bilinear_over_8888_8888_process_pixblock_head + vshll.u16 q2, d20, #8 + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 + vmlsl.u16 q2, d20, d30 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #2 + vmlal.u16 q2, d21, d30 + vshll.u16 q3, d22, #8 + vld1.32 {d20}, [TMP1], STRIDE + vmlsl.u16 q3, d22, d31 + vmlal.u16 q3, d23, d31 + vld1.32 {d21}, [TMP1] + vmull.u8 q8, d20, d28 + vmlal.u8 q8, d21, d29 + vshrn.u32 d0, q0, #16 + vshrn.u32 d1, q1, #16 + vld1.32 {d2, d3}, [OUT, :128] + pld [OUT, PF_OFFS] + vshrn.u32 d4, q2, #16 + vshr.u16 q15, q12, #8 + vld1.32 {d22}, [TMP2], STRIDE + vshrn.u32 d5, q3, #16 + vmovn.u16 d6, q0 + vld1.32 {d23}, [TMP2] + vmull.u8 q9, d22, d28 + mov TMP3, X, asr #16 + add X, X, UX + add TMP3, TOP, TMP3, asl #2 + mov TMP4, X, asr #16 + add X, X, UX + add TMP4, TOP, TMP4, asl #2 + vmlal.u8 q9, d23, d29 + vmovn.u16 d7, q2 + vld1.32 {d22}, [TMP3], STRIDE + vuzp.8 d6, d7 + vuzp.8 d2, d3 + vuzp.8 d6, d7 + vuzp.8 d2, d3 + vdup.32 d4, d7[1] + vld1.32 {d23}, [TMP3] + vmvn.8 d4, d4 + vmull.u8 q10, d22, d28 + vmlal.u8 q10, d23, d29 + vmull.u8 q11, d2, d4 + vmull.u8 q2, d3, d4 + vshll.u16 q0, d16, #8 + vmlsl.u16 q0, d16, d30 + vrshr.u16 q1, q11, #8 + vmlal.u16 q0, d17, d30 + vrshr.u16 q8, q2, #8 + vraddhn.u16 d2, q1, q11 + vraddhn.u16 d3, q8, q2 + pld [TMP4, PF_OFFS] + vld1.32 {d16}, [TMP4], STRIDE + vqadd.u8 q3, q1, q3 + vld1.32 {d17}, [TMP4] + pld [TMP4, PF_OFFS] + vmull.u8 q11, d16, d28 + vmlal.u8 q11, d17, d29 + vuzp.8 d6, d7 + vshll.u16 q1, d18, #8 + vuzp.8 d6, d7 + vmlsl.u16 q1, d18, d31 + vadd.u16 q12, q12, q13 + vmlal.u16 q1, d19, d31 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 + vst1.32 {d6, d7}, [OUT, :128]! .endm /* over_8888_8_8888 */ commit 1cd916f3a5ebeb943f66eecf0b8ce99af0b95d11 Author: Taekyun Kim <tkq....@samsung.com> Date: Fri Sep 23 00:03:22 2011 +0900 ARM: NEON: Replace old bilinear scanline generator with new template Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can be replaced with new template just by wrapping existing macros. diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S index 784e5df..25bcb24 100644 --- a/pixman/pixman-arm-neon-asm-bilinear.S +++ b/pixman/pixman-arm-neon-asm-bilinear.S @@ -582,198 +582,6 @@ fname: bilinear_store_&dst_fmt 4, q2, q3 .endm -.macro generate_bilinear_scanline_func_src_dst \ - fname, src_fmt, dst_fmt, op, \ - bpp_shift, prefetch_distance - -pixman_asm_function fname - OUT .req r0 - TOP .req r1 - BOTTOM .req r2 - WT .req r3 - WB .req r4 - X .req r5 - UX .req r6 - WIDTH .req ip - TMP1 .req r3 - TMP2 .req r4 - PF_OFFS .req r7 - TMP3 .req r8 - TMP4 .req r9 - STRIDE .req r2 - - mov ip, sp - push {r4, r5, r6, r7, r8, r9} - mov PF_OFFS, #prefetch_distance - ldmia ip, {WB, X, UX, WIDTH} - mul PF_OFFS, PF_OFFS, UX - - .set prefetch_offset, prefetch_distance - - sub STRIDE, BOTTOM, TOP - .unreq BOTTOM - - cmp WIDTH, #0 - ble 3f - - vdup.u16 q12, X - vdup.u16 q13, UX - vdup.u8 d28, WT - vdup.u8 d29, WB - vadd.u16 d25, d25, d26 - vadd.u16 q13, q13, q13 - vshr.u16 q15, q12, #8 - vadd.u16 q12, q12, q13 - - subs WIDTH, WIDTH, #4 - blt 1f - mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift) -0: - bilinear_interpolate_four_pixels src_fmt, x, dst_fmt, op - subs WIDTH, WIDTH, #4 - bge 0b -1: - tst WIDTH, #2 - beq 2f - bilinear_interpolate_two_pixels src_fmt, x, dst_fmt, op -2: - tst WIDTH, #1 - beq 3f - bilinear_interpolate_last_pixel src_fmt, x, dst_fmt, op -3: - pop {r4, r5, r6, r7, r8, r9} - bx lr - - .unreq OUT - .unreq TOP - .unreq WT - .unreq WB - .unreq X - .unreq UX - .unreq WIDTH - .unreq TMP1 - .unreq TMP2 - .unreq PF_OFFS - .unreq TMP3 - .unreq TMP4 - .unreq STRIDE -.endfunc - -.endm - -.macro generate_bilinear_scanline_func_src_a8_dst \ - fname, src_fmt, dst_fmt, op, \ - bpp_shift, prefetch_distance - -pixman_asm_function fname - OUT .req r0 - MASK .req r1 - TOP .req r2 - BOTTOM .req r3 - WT .req r4 - WB .req r5 - X .req r6 - UX .req r7 - WIDTH .req ip - TMP1 .req r4 - TMP2 .req r5 - PF_OFFS .req r8 - TMP3 .req r9 - TMP4 .req r10 - STRIDE .req r3 - - mov ip, sp - push {r4, r5, r6, r7, r8, r9, r10, ip} - mov PF_OFFS, #prefetch_distance - ldmia ip, {WT, WB, X, UX, WIDTH} - mul PF_OFFS, PF_OFFS, UX - - .set prefetch_offset, prefetch_distance - - sub STRIDE, BOTTOM, TOP - .unreq BOTTOM - - cmp WIDTH, #0 - ble 3f - - vdup.u16 q12, X - vdup.u16 q13, UX - vdup.u8 d28, WT - vdup.u8 d29, WB - vadd.u16 d25, d25, d26 - vadd.u16 q13, q13, q13 - vshr.u16 q15, q12, #8 - vadd.u16 q12, q12, q13 - - subs WIDTH, WIDTH, #4 - blt 1f - mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift) -0: - bilinear_interpolate_four_pixels src_fmt, 8, dst_fmt, op - subs WIDTH, WIDTH, #4 - bge 0b -1: - tst WIDTH, #2 - beq 2f - bilinear_interpolate_two_pixels src_fmt, 8, dst_fmt, op -2: - tst WIDTH, #1 - beq 3f - bilinear_interpolate_last_pixel src_fmt, 8, dst_fmt, op -3: - pop {r4, r5, r6, r7, r8, r9, r10, ip} - bx lr - - .unreq OUT - .unreq TOP - .unreq WT - .unreq WB - .unreq X - .unreq UX - .unreq WIDTH - .unreq MASK - .unreq TMP1 - .unreq TMP2 - .unreq PF_OFFS - .unreq TMP3 - .unreq TMP4 - .unreq STRIDE -.endfunc - -.endm - -generate_bilinear_scanline_func_src_dst \ - pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \ - 8888, 8888, over, 2, 28 - -generate_bilinear_scanline_func_src_dst \ - pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \ - 8888, 8888, add, 2, 28 - -generate_bilinear_scanline_func_src_a8_dst \ - pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \ - 8888, 8888, src, 2, 28 - -generate_bilinear_scanline_func_src_a8_dst \ - pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \ - 8888, 0565, src, 2, 28 - -generate_bilinear_scanline_func_src_a8_dst \ - pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \ - 0565, 8888, src, 1, 28 - -generate_bilinear_scanline_func_src_a8_dst \ - pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \ - 0565, 0565, src, 1, 28 - -generate_bilinear_scanline_func_src_a8_dst \ - pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \ - 8888, 8888, over, 2, 28 - -generate_bilinear_scanline_func_src_a8_dst \ - pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \ - 8888, 8888, add, 2, 28 - .set BILINEAR_FLAG_USE_MASK, 1 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 @@ -855,6 +663,8 @@ pixman_asm_function fname TMP4 .req r10 STRIDE .req r3 + .set prefetch_offset, prefetch_distance + mov ip, sp push {r4, r5, r6, r7, r8, r9, r10, ip} mov PF_OFFS, #prefetch_distance @@ -968,3 +778,293 @@ pixman_asm_function fname .endfunc .endm + +/* src_8888_8_8888 */ +.macro bilinear_src_8888_8_8888_process_last_pixel + bilinear_interpolate_last_pixel 8888, 8, 8888, src +.endm + +.macro bilinear_src_8888_8_8888_process_two_pixels + bilinear_interpolate_two_pixels 8888, 8, 8888, src +.endm + +.macro bilinear_src_8888_8_8888_process_four_pixels + bilinear_interpolate_four_pixels 8888, 8, 8888, src +.endm + +.macro bilinear_src_8888_8_8888_process_pixblock_head + bilinear_src_8888_8_8888_process_four_pixels +.endm + +.macro bilinear_src_8888_8_8888_process_pixblock_tail +.endm + +.macro bilinear_src_8888_8_8888_process_pixblock_tail_head + bilinear_src_8888_8_8888_process_pixblock_tail + bilinear_src_8888_8_8888_process_pixblock_head +.endm + +/* src_8888_8_0565 */ +.macro bilinear_src_8888_8_0565_process_last_pixel + bilinear_interpolate_last_pixel 8888, 8, 0565, src +.endm + +.macro bilinear_src_8888_8_0565_process_two_pixels + bilinear_interpolate_two_pixels 8888, 8, 0565, src +.endm + +.macro bilinear_src_8888_8_0565_process_four_pixels + bilinear_interpolate_four_pixels 8888, 8, 0565, src +.endm + +.macro bilinear_src_8888_8_0565_process_pixblock_head + bilinear_src_8888_8_0565_process_four_pixels +.endm + +.macro bilinear_src_8888_8_0565_process_pixblock_tail +.endm + +.macro bilinear_src_8888_8_0565_process_pixblock_tail_head -- To UNSUBSCRIBE, email to debian-x-requ...@lists.debian.org with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org Archive: http://lists.debian.org/e1rldck-0002zx...@vasks.debian.org