[REVIEW 3-5] fdo#49208 ridiculous performance on certain .doc

Caolán McNamara Tue, 01 May 2012 08:47:15 -0700

The .doc has some paragraphs in it that are around the 64k length mark.
It doesn't *look* like that's the case because they end in a vast quantity
of whitespace, and Writer implements the Word-compatibility behaviour of
letting empty spaces disappear off the right side of the page rather than wrap.


This turns out to be a good test case for our break iterators. It exposed a
couple of problems, whose fixes are squashed together in the attached patch.

We create an ICU string twice: once to compare it to the cached one, and
a second time to replace the cache if they differ
http://cgit.freedesktop.org/libreoffice/core/commit/?id=268ec2e64f89eb39fd5f02688787cd6f53e948b5

It is better to index into the string instead of copying it when counting
"characters"
http://cgit.freedesktop.org/libreoffice/core/commit/?id=fd4fe85329654883a0bf3304ad0aa8ef0bfde844

ICU's string compare is very slow when you just want to check for
equality
http://cgit.freedesktop.org/libreoffice/core/commit/?id=ccc47b3db3eae25cc11bb709416c0b61747ca89e

Finally, assume that a space doesn't combine with anything in any exotic way
to form a single grapheme
http://cgit.freedesktop.org/libreoffice/core/commit/?id=ae716b07f7218fadf0143de1946cc9e0e2c08744

C.

From 2e9ee2040c52a4425f91be9b0083b78b25d662b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caol...@redhat.com>
Date: Tue, 1 May 2012 16:30:52 +0100
Subject: [PATCH] Resolves: fdo#49208 performance problems with very long
 paragraphs

---
 .../source/breakiterator/breakiterator_unicode.cxx |   27 +++++++++++++++++--
 sw/inc/breakit.hxx                                 |    7 ++++-
 sw/source/core/bastyp/breakit.cxx                  |   19 +++++++++----
 sw/source/core/txtnode/txtedt.cxx                  |    2 +-
 4 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx
index aa5e1d8..5e5f6d5 100644
--- a/i18npool/source/breakiterator/breakiterator_unicode.cxx
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -34,6 +34,7 @@
 #include <unicode/udata.h>
 #include <rtl/strbuf.hxx>
 #include <rtl/ustring.hxx>
+#include <string.h>
 
 U_CDECL_BEGIN
 extern const char OpenOffice_dat[];
@@ -94,6 +95,24 @@ class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
 
 };
 
+namespace
+{
+    bool isEqual(const UnicodeString &rOne, const rtl::OUString &rOther)
+    {
+        sal_Int32 nLength = rOne.length();
+        if (nLength != rOther.getLength())
+            return false;
+
+        //fdo#49208 operator== is implemented by compareTo etc in icu which is
+        //horrifically slow when all you want to know is that they're the same
+        //or not
+        const UChar *pOne = rOne.getBuffer();
+        // UChar != sal_Unicode in MinGW
+        const UChar *pOther = reinterpret_cast<const UChar *>(rOther.getStr());
+        return memcmp(pOne, pOther, nLength * sizeof(UChar)) == 0;
+    }
+}
+
 // loading ICU breakiterator on demand.
 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
         sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
@@ -199,13 +218,15 @@ void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::
         }
     }
 
-    if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) {   // UChar != sal_Unicode in MinGW
-        icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());
+    if (newBreak || !isEqual(icuBI->aICUText, rText))
+    {
+        // UChar != sal_Unicode in MinGW
+        const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
+        icuBI->aICUText=UnicodeString(pText, rText.getLength());
         icuBI->aBreakIterator->setText(icuBI->aICUText);
     }
 }
 
-
 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
         sal_Int32 nStartPos, const lang::Locale &rLocale,
         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
diff --git a/sw/inc/breakit.hxx b/sw/inc/breakit.hxx
index 3075fc9..0bdce8c 100644
--- a/sw/inc/breakit.hxx
+++ b/sw/inc/breakit.hxx
@@ -112,7 +112,12 @@ public:
     sal_uInt16 GetRealScriptOfText( const String& rTxt, xub_StrLen nPos ) const;
     sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const;
 
-    sal_Int32 getGraphemeCount(const rtl::OUString& rStr) const;
+    sal_Int32 getGraphemeCount(const rtl::OUString& rStr,
+        sal_Int32 nStart, sal_Int32 nEnd) const;
+    sal_Int32 getGraphemeCount(const rtl::OUString& rStr) const
+    {
+        return getGraphemeCount(rStr, 0, rStr.getLength());
+    }
 };
 
 #define SW_BREAKITER()  SwBreakIt::Get()
diff --git a/sw/source/core/bastyp/breakit.cxx b/sw/source/core/bastyp/breakit.cxx
index bb93660..c35afa0 100644
--- a/sw/source/core/bastyp/breakit.cxx
+++ b/sw/source/core/bastyp/breakit.cxx
@@ -168,16 +168,23 @@ sal_uInt16 SwBreakIt::GetAllScriptsOfText( const String& rTxt ) const
     return nRet;
 }
 
-sal_Int32 SwBreakIt::getGraphemeCount(const rtl::OUString& rText) const
+sal_Int32 SwBreakIt::getGraphemeCount(const rtl::OUString& rText, sal_Int32 nStart, sal_Int32 nEnd) const
 {
     sal_Int32 nGraphemeCount = 0;
 
-    sal_Int32 nCurPos = 0;
-    while (nCurPos < rText.getLength())
+    sal_Int32 nCurPos = nStart;
+    while (nCurPos < nEnd)
     {
-        sal_Int32 nCount2 = 1;
-        nCurPos = xBreak->nextCharacters(rText, nCurPos, lang::Locale(),
-            i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
+        //fdo#49208 cheat and assume that nothing can combine with a space
+        //to form a single grapheme
+        if (rText[nCurPos] == ' ')
+            ++nCurPos;
+        else
+        {
+            sal_Int32 nCount2 = 1;
+            nCurPos = xBreak->nextCharacters(rText, nCurPos, lang::Locale(),
+                i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
+        }
         ++nGraphemeCount;
     }
 
diff --git a/sw/source/core/txtnode/txtedt.cxx b/sw/source/core/txtnode/txtedt.cxx
index 86e4b69..186d4ea 100644
--- a/sw/source/core/txtnode/txtedt.cxx
+++ b/sw/source/core/txtnode/txtedt.cxx
@@ -1863,7 +1863,7 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
         }
     }
 
-    nTmpChars = pBreakIt->getGraphemeCount(aExpandText.copy(nExpandBegin, nExpandEnd - nExpandBegin));
+    nTmpChars = pBreakIt->getGraphemeCount(aExpandText, nExpandBegin, nExpandEnd);
     nTmpChars -= nNumOfMaskedChars;
 
     // no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars
-- 
1.7.7.6

_______________________________________________
LibreOffice mailing list
LibreOffice@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/libreoffice

[REVIEW 3-5] fdo#49208 ridiculous performance on certain .doc

Reply via email to