Skip to site navigation (Press enter)

[webkit-changes] [134403] trunk/Source/WebCore

rgabor Tue, 13 Nov 2012 05:43:00 -0800

Title: [134403] trunk/Source/WebCore

Revision: 134403
Author: [email protected]
Date: 2012-11-13 05:44:31 -0800 (Tue, 13 Nov 2012)

Log Message

Optimize RGB565 and RGBA5551 packing/unpacking functions with NEON intrinsics
https://bugs.webkit.org/show_bug.cgi?id=102060


Reviewed by Zoltan Herczeg.

Adding more NEONized packing functions to GraphicsContext3DNEON.h.
These functions are three times faster with these intrinsics optimizations.

* platform/graphics/GraphicsContext3D.cpp:
(WebCore):
* platform/graphics/cpu/arm/GraphicsContext3DNEON.h:
(WebCore::ARM::unpackOneRowOfRGBA5551ToRGBA8NEON):
(ARM):
(WebCore::ARM::packOneRowOfRGBA8ToUnsignedShort5551NEON):
(WebCore::ARM::unpackOneRowOfRGB565ToRGBA8NEON):
(WebCore::ARM::packOneRowOfRGBA8ToUnsignedShort565NEON):

Modified Paths

trunk/Source/WebCore/ChangeLog
trunk/Source/WebCore/platform/graphics/GraphicsContext3D.cpp
trunk/Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h

Diff

Modified: trunk/Source/WebCore/ChangeLog (134402 => 134403)


--- trunk/Source/WebCore/ChangeLog	2012-11-13 13:26:44 UTC (rev 134402)
+++ trunk/Source/WebCore/ChangeLog	2012-11-13 13:44:31 UTC (rev 134403)
@@ -1,3 +1,22 @@
+2012-11-13  Gabor Rapcsanyi  <[email protected]>
+
+        Optimize RGB565 and RGBA5551 packing/unpacking functions with NEON intrinsics
+        https://bugs.webkit.org/show_bug.cgi?id=102060
+
+        Reviewed by Zoltan Herczeg.
+
+        Adding more NEONized packing functions to GraphicsContext3DNEON.h.
+        These functions are three times faster with these intrinsics optimizations.
+
+        * platform/graphics/GraphicsContext3D.cpp:
+        (WebCore):
+        * platform/graphics/cpu/arm/GraphicsContext3DNEON.h:
+        (WebCore::ARM::unpackOneRowOfRGBA5551ToRGBA8NEON):
+        (ARM):
+        (WebCore::ARM::packOneRowOfRGBA8ToUnsignedShort5551NEON):
+        (WebCore::ARM::unpackOneRowOfRGB565ToRGBA8NEON):
+        (WebCore::ARM::packOneRowOfRGBA8ToUnsignedShort565NEON):
+
 2012-11-13  Csaba Osztrogonác  <[email protected]>
 
         [Qt] Enable Mutation observer

Modified: trunk/Source/WebCore/platform/graphics/GraphicsContext3D.cpp (134402 => 134403)


--- trunk/Source/WebCore/platform/graphics/GraphicsContext3D.cpp	2012-11-13 13:26:44 UTC (rev 134402)
+++ trunk/Source/WebCore/platform/graphics/GraphicsContext3D.cpp	2012-11-13 13:44:31 UTC (rev 134403)
@@ -526,6 +526,16 @@
 
 void unpackOneRowOfRGBA5551ToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    unsigned tailPixels = pixelsPerRow % 8;
+    unsigned pixelSize = pixelsPerRow - tailPixels;
+
+    ARM::unpackOneRowOfRGBA5551ToRGBA8NEON(source, destination, pixelSize);
+
+    source += pixelSize;
+    destination += pixelSize * 4;
+    pixelsPerRow = tailPixels;
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         uint16_t packedValue = source[0];
         uint8_t r = packedValue >> 11;
@@ -569,6 +579,16 @@
 
 void unpackOneRowOfRGB565ToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    unsigned tailPixels = pixelsPerRow % 8;
+    unsigned pixelSize = pixelsPerRow - tailPixels;
+
+    ARM::unpackOneRowOfRGB565ToRGBA8NEON(source, destination, pixelSize);
+
+    source += pixelSize;
+    destination += pixelSize * 4;
+    pixelsPerRow = tailPixels;
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         uint16_t packedValue = source[0];
         uint8_t r = packedValue >> 11;
@@ -1014,6 +1034,17 @@
 
 void packOneRowOfRGBA8ToUnsignedShort5551(const uint8_t* source, uint16_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    unsigned componentsPerRow = pixelsPerRow * 4;
+    unsigned tailComponents = componentsPerRow % 32;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+
+    ARM::packOneRowOfRGBA8ToUnsignedShort5551NEON(source, destination, componentsSize);
+
+    source += componentsSize;
+    destination += componentsSize / 4;
+    pixelsPerRow = tailComponents / 4;
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         *destination = (((source[0] & 0xF8) << 8)
                         | ((source[1] & 0xF8) << 3)
@@ -1059,6 +1090,17 @@
 
 void packOneRowOfRGBA8ToUnsignedShort565(const uint8_t* source, uint16_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    unsigned componentsPerRow = pixelsPerRow * 4;
+    unsigned tailComponents = componentsPerRow % 32;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+
+    ARM::packOneRowOfRGBA8ToUnsignedShort565NEON(source, destination, componentsSize);
+
+    source += componentsSize;
+    destination += componentsSize / 4;
+    pixelsPerRow = tailComponents / 4;
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         *destination = (((source[0] & 0xF8) << 8)
                         | ((source[1] & 0xFC) << 3)

Modified: trunk/Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h (134402 => 134403)


--- trunk/Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h	2012-11-13 13:26:44 UTC (rev 134402)
+++ trunk/Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h	2012-11-13 13:44:31 UTC (rev 134403)
@@ -36,14 +36,14 @@
 
 ALWAYS_INLINE void unpackOneRowOfRGBA4444ToRGBA8NEON(const uint16_t* source, uint8_t* destination, unsigned pixelSize)
 {
-    uint16x8_t constant = vdupq_n_u16(0x0F);
+    uint16x8_t immediate0x0f = vdupq_n_u16(0x0F);
     for (unsigned i = 0; i < pixelSize; i += 8) {
         uint16x8_t eightPixels = vld1q_u16(source + i);
 
         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 12));
-        uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 8), constant));
-        uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 4), constant));
-        uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, constant));
+        uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 8), immediate0x0f));
+        uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 4), immediate0x0f));
+        uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x0f));
 
         componentR = vorr_u8(vshl_n_u8(componentR, 4), componentR);
         componentG = vorr_u8(vshl_n_u8(componentG, 4), componentG);
@@ -59,14 +59,14 @@
 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort4444NEON(const uint8_t* source, uint16_t* destination, unsigned componentsSize)
 {
     uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
-    uint8x8_t constant = vdup_n_u8(0xF0);
+    uint8x8_t immediate0xf0 = vdup_n_u8(0xF0);
     for (unsigned i = 0; i < componentsSize; i += 32) {
         uint8x8x4_t RGBA8 = vld4_u8(source + i);
 
-        uint8x8_t componentR = vand_u8(RGBA8.val[0], constant);
-        uint8x8_t componentG = vshr_n_u8(vand_u8(RGBA8.val[1], constant), 4);
-        uint8x8_t componentB = vand_u8(RGBA8.val[2], constant);
-        uint8x8_t componentA = vshr_n_u8(vand_u8(RGBA8.val[3], constant), 4);
+        uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf0);
+        uint8x8_t componentG = vshr_n_u8(vand_u8(RGBA8.val[1], immediate0xf0), 4);
+        uint8x8_t componentB = vand_u8(RGBA8.val[2], immediate0xf0);
+        uint8x8_t componentA = vshr_n_u8(vand_u8(RGBA8.val[3], immediate0xf0), 4);
 
         uint8x8x2_t RGBA4;
         RGBA4.val[0] = vorr_u8(componentB, componentA);
@@ -76,6 +76,104 @@
     }
 }
 
+ALWAYS_INLINE void unpackOneRowOfRGBA5551ToRGBA8NEON(const uint16_t* source, uint8_t* destination, unsigned pixelSize)
+{
+    uint8x8_t immediate0x7 = vdup_n_u8(0x7);
+    uint8x8_t immediate0xff = vdup_n_u8(0xFF);
+    uint16x8_t immediate0x1f = vdupq_n_u16(0x1F);
+    uint16x8_t immediate0x1 = vdupq_n_u16(0x1);
+
+    for (unsigned i = 0; i < pixelSize; i += 8) {
+        uint16x8_t eightPixels = vld1q_u16(source + i);
+
+        uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11));
+        uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 6), immediate0x1f));
+        uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 1), immediate0x1f));
+        uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x1));
+
+        componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7));
+        componentG = vorr_u8(vshl_n_u8(componentG, 3), vand_u8(componentG, immediate0x7));
+        componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7));
+        componentA = vmul_u8(componentA, immediate0xff);
+
+        uint8x8x4_t destComponents = {componentR, componentG, componentB, componentA};
+        vst4_u8(destination, destComponents);
+        destination += 32;
+    }
+}
+
+ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort5551NEON(const uint8_t* source, uint16_t* destination, unsigned componentsSize)
+{
+    uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
+
+    uint8x8_t immediate0xf8 = vdup_n_u8(0xF8);
+    uint8x8_t immediate0x18 = vdup_n_u8(0x18);
+    for (unsigned i = 0; i < componentsSize; i += 32) {
+        uint8x8x4_t RGBA8 = vld4_u8(source + i);
+
+        uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8);
+        uint8x8_t componentG3bit = vshr_n_u8(RGBA8.val[1], 5);
+
+        uint8x8_t componentG2bit = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x18), 3);
+        uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 2);
+        uint8x8_t componentA = vshr_n_u8(RGBA8.val[3], 7);
+
+        uint8x8x2_t RGBA5551;
+        RGBA5551.val[0] = vorr_u8(vorr_u8(componentG2bit, componentB), componentA);
+        RGBA5551.val[1] = vorr_u8(componentR, componentG3bit);
+        vst2_u8(dst, RGBA5551);
+        dst += 16;
+    }
+}
+
+ALWAYS_INLINE void unpackOneRowOfRGB565ToRGBA8NEON(const uint16_t* source, uint8_t* destination, unsigned pixelSize)
+{
+    uint16x8_t immediate0x3f = vdupq_n_u16(0x3F);
+    uint16x8_t immediate0x1f = vdupq_n_u16(0x1F);
+    uint8x8_t immediate0x3 = vdup_n_u8(0x3);
+    uint8x8_t immediate0x7 = vdup_n_u8(0x7);
+
+    uint8x8_t componentA = vdup_n_u8(0xFF);
+
+    for (unsigned i = 0; i < pixelSize; i += 8) {
+        uint16x8_t eightPixels = vld1q_u16(source + i);
+
+        uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11));
+        uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 5), immediate0x3f));
+        uint8x8_t componentB = vqmovn_u16(vandq_u16(eightPixels, immediate0x1f));
+
+        componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7));
+        componentG = vorr_u8(vshl_n_u8(componentG, 2), vand_u8(componentG, immediate0x3));
+        componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7));
+
+        uint8x8x4_t destComponents = {componentR, componentG, componentB, componentA};
+        vst4_u8(destination, destComponents);
+        destination += 32;
+    }
+}
+
+ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort565NEON(const uint8_t* source, uint16_t* destination, unsigned componentsSize)
+{
+    uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
+
+    uint8x8_t immediate0xf8 = vdup_n_u8(0xF8);
+    uint8x8_t immediate0x1c = vdup_n_u8(0x1C);
+    for (unsigned i = 0; i < componentsSize; i += 32) {
+        uint8x8x4_t RGBA8 = vld4_u8(source + i);
+
+        uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8);
+        uint8x8_t componentGLeft = vshr_n_u8(RGBA8.val[1], 5);
+        uint8x8_t componentGRight = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x1c), 3);
+        uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 3);
+
+        uint8x8x2_t RGB565;
+        RGB565.val[0] = vorr_u8(componentGRight, componentB);
+        RGB565.val[1] = vorr_u8(componentR, componentGLeft);
+        vst2_u8(dst, RGB565);
+        dst += 16;
+    }
+}
+
 } // namespace ARM
 
 } // namespace WebCore

_______________________________________________
webkit-changes mailing list
[email protected]
http://lists.webkit.org/mailman/listinfo/webkit-changes