Skip to site navigation (Press enter)

[webkit-changes] [102702] trunk/Source/WebCore

commit-queue Tue, 13 Dec 2011 13:42:47 -0800

Title: [102702] trunk/Source/WebCore

Revision: 102702
Author: [email protected]
Date: 2011-12-13 13:42:37 -0800 (Tue, 13 Dec 2011)

Log Message

Implement a function of vector multiply with SSE2 optimization in VectorMath.cpp.
https://bugs.webkit.org/show_bug.cgi?id=74048


Patch by Xingnan Wang <[email protected]> on 2011-12-13
Reviewed by Benjamin Poulain.

vmul performs an element-by-element multiply of two float vectors. With SSE2
optimization we get about a 3.4x performance improvement compared with the plain
(non-SIMD) multiply loop.

Use vmul in AudioBus::copyWithSampleAccurateGainValuesFrom().

* platform/audio/AudioBus.cpp:
(WebCore::AudioBus::copyWithSampleAccurateGainValuesFrom):
* platform/audio/VectorMath.cpp:
(WebCore::VectorMath::vmul):
* platform/audio/VectorMath.h:

Modified Paths

trunk/Source/WebCore/ChangeLog
trunk/Source/WebCore/platform/audio/AudioBus.cpp
trunk/Source/WebCore/platform/audio/VectorMath.cpp
trunk/Source/WebCore/platform/audio/VectorMath.h

Diff

Modified: trunk/Source/WebCore/ChangeLog (102701 => 102702)


--- trunk/Source/WebCore/ChangeLog	2011-12-13 21:27:15 UTC (rev 102701)
+++ trunk/Source/WebCore/ChangeLog	2011-12-13 21:42:37 UTC (rev 102702)
@@ -1,3 +1,22 @@
+2011-12-13  Xingnan Wang  <[email protected]>
+
+        Implement a function of vector multiply with SSE2 optimization in VectorMath.cpp.
+        https://bugs.webkit.org/show_bug.cgi?id=74048
+
+        Reviewed by Benjamin Poulain.
+
+        The vmul is a function for an element-by-element multiply of two float vectors and we 
+        get about 3.4x performance improvement with SSE2 optimization compared with the common 
+        multiply.
+
+        Use vmul in AudioBus::copyWithSampleAccurateGainValuesFrom().
+
+        * platform/audio/AudioBus.cpp:
+        (WebCore::AudioBus::copyWithSampleAccurateGainValuesFrom):
+        * platform/audio/VectorMath.cpp:
+        (WebCore::VectorMath::vmul):
+        * platform/audio/VectorMath.h:
+
 2011-12-13  Vsevolod Vlasov  <[email protected]>
 
         Web Inspector: [Regression] ResourceHeadersView sections should be expanded by default.

Modified: trunk/Source/WebCore/platform/audio/AudioBus.cpp (102701 => 102702)


--- trunk/Source/WebCore/platform/audio/AudioBus.cpp	2011-12-13 21:27:15 UTC (rev 102701)
+++ trunk/Source/WebCore/platform/audio/AudioBus.cpp	2011-12-13 21:42:37 UTC (rev 102702)
@@ -382,15 +382,13 @@
         return;
     }
 
-    // FIXME: this can potentially use SIMD optimizations with vector libraries.
     // We handle both the 1 -> N and N -> N case here.
     const float* source = sourceBus.channel(0)->data();
     for (unsigned channelIndex = 0; channelIndex < numberOfChannels(); ++channelIndex) {
         if (sourceBus.numberOfChannels() == numberOfChannels())
             source = sourceBus.channel(channelIndex)->data();
         float* destination = channel(channelIndex)->data();
-        for (unsigned i = 0; i < numberOfGainValues; ++i)
-            destination[i] = source[i] * gainValues[i];
+        vmul(source, 1, gainValues, 1, destination, 1, numberOfGainValues);
     }
 }

Modified: trunk/Source/WebCore/platform/audio/VectorMath.cpp (102701 => 102702)


--- trunk/Source/WebCore/platform/audio/VectorMath.cpp	2011-12-13 21:27:15 UTC (rev 102701)
+++ trunk/Source/WebCore/platform/audio/VectorMath.cpp	2011-12-13 21:42:37 UTC (rev 102702)
@@ -63,8 +63,17 @@
 #endif
 }
 
+void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
+{ // Element-by-element multiply of two float vectors; this variant only forwards its arguments unchanged to a platform routine.
+#if defined(__ppc__) || defined(__i386__) // On ppc / i386 builds, call the global-namespace vmul (presumably the platform vector library's entry point - confirm).
+    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
 #else
+    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); // All other architectures use vDSP_vmul.
+#endif
+}
 
+#else
+
 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
 {
 #ifdef __SSE2__
@@ -229,6 +238,66 @@
 #endif
 }
 
+void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
+{ // Element-by-element multiply: each destP element receives the product of the corresponding source1P and source2P elements.
+
+    int n = framesToProcess; // Frames still to process. NOTE(review): size_t is narrowed to int here - confirm callers never pass more than INT_MAX frames.
+
+#ifdef __SSE2__
+    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { // The SSE path is taken only when all three vectors have unit stride.
+
+        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
+        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
+            *destP = *source1P * *source2P;
+            source1P++;
+            source2P++;
+            destP++;
+            n--;
+        }
+
+        // Now the source1P address is aligned, so start applying SSE.
+        int tailFrames = n % 4; // Leftover frames that do not fill a whole group of four floats.
+        float* endP = destP + n - tailFrames; // The SSE loop stops here; the frames from endP on are left to the scalar loop below.
+        __m128 pSource1;
+        __m128 pSource2;
+        __m128 dest;
+
+        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); // Selects the aligned or unaligned load for source2P.
+        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); // Selects the aligned or unaligned store for destP.
+
+#define SSE2_MULT(loadInstr, storeInstr)                   \
+            while (destP < endP)                           \
+            {                                              \
+                pSource1 = _mm_load_ps(source1P);          \
+                pSource2 = _mm_##loadInstr##_ps(source2P); \
+                dest = _mm_mul_ps(pSource1, pSource2);     \
+                _mm_##storeInstr##_ps(destP, dest);        \
+                source1P += 4;                             \
+                source2P += 4;                             \
+                destP += 4;                                \
+            } // Each iteration multiplies four floats; source1P always uses the aligned load.
+
+        if (source2Aligned && destAligned) // Both aligned.
+            SSE2_MULT(load, store)
+        else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
+            SSE2_MULT(load, storeu)
+        else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
+            SSE2_MULT(loadu, store)
+        else // Neither aligned.
+            SSE2_MULT(loadu, storeu)
+
+        n = tailFrames; // Only the tail remains for the scalar loop.
+    }
+#endif
+    while (n) { // Scalar loop: handles every frame when the SSE path was not taken, otherwise only the tail frames.
+        *destP = *source1P * *source2P;
+        source1P += sourceStride1;
+        source2P += sourceStride2;
+        destP += destStride;
+        n--;
+    }
+}
+
 #endif // OS(DARWIN)
 
 } // namespace VectorMath

Modified: trunk/Source/WebCore/platform/audio/VectorMath.h (102701 => 102702)


--- trunk/Source/WebCore/platform/audio/VectorMath.h	2011-12-13 21:27:15 UTC (rev 102701)
+++ trunk/Source/WebCore/platform/audio/VectorMath.h	2011-12-13 21:42:37 UTC (rev 102702)
@@ -34,6 +34,9 @@
 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess);
 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess);
 
+// For an element-by-element multiply of two float vectors.
+void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess);
+
 } // namespace VectorMath
 
 } // namespace WebCore

_______________________________________________
webkit-changes mailing list
[email protected]
http://lists.webkit.org/mailman/listinfo.cgi/webkit-changes