include/svtools/svparser.hxx      |    1 
 svtools/Library_svt.mk            |    1 
 svtools/source/svrtf/svparser.cxx |   99 ++++++++++++++------------------------
 3 files changed, 40 insertions(+), 61 deletions(-)

New commits:
commit ecfb83d7463bed7c89baeccc03286c1ac9956d70
Author:     Mike Kaganski <mike.kagan...@collabora.com>
AuthorDate: Fri Dec 10 12:06:11 2021 +0200
Commit:     Xisco Fauli <xiscofa...@libreoffice.org>
CommitDate: Mon Dec 13 11:48:34 2021 +0100

    tdf#146048: detect UTF-16 without BOM
    
    Change-Id: I3c1742cdf88dfa08cf35d9f95875d4d3d6af09db
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/126596
    Tested-by: Jenkins
    Reviewed-by: Mike Kaganski <mike.kagan...@collabora.com>
    (cherry picked from commit 3392f567be8d52804b187b0bced47204ef38fa3c)
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/126600
    Reviewed-by: Xisco Fauli <xiscofa...@libreoffice.org>

diff --git a/include/svtools/svparser.hxx b/include/svtools/svparser.hxx
index b0ba450a7285..d54b4a972f15 100644
--- a/include/svtools/svparser.hxx
+++ b/include/svtools/svparser.hxx
@@ -62,7 +62,6 @@ protected:
     sal_uInt64          nNextChPos;
     sal_uInt32          nNextCh;            // current character codepoint in UTF32 for the "lex"
 
-    bool                bUCS2BSrcEnc : 1;   // or as big-endian UCS2
     bool                bSwitchToUCS2 : 1;  // switching is allowed
     bool                bRTF_InTextRead : 1;  // only for RTF-Parser!!!
 
diff --git a/svtools/Library_svt.mk b/svtools/Library_svt.mk
index bbcce42ac65b..625b8e38f9cb 100644
--- a/svtools/Library_svt.mk
+++ b/svtools/Library_svt.mk
@@ -61,6 +61,7 @@ $(eval $(call gb_Library_use_libraries,svt,\
 
 $(eval $(call gb_Library_use_externals,svt,\
        boost_headers \
+    icui18n \
     icuuc \
     icu_headers \
 ))
diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx
index 3da34404f517..0fec7a97097e 100644
--- a/svtools/source/svrtf/svparser.cxx
+++ b/svtools/source/svrtf/svparser.cxx
@@ -25,6 +25,7 @@
 #include <rtl/tencinfo.h>
 #include <rtl/character.hxx>
 #include <sal/log.hxx>
+#include <unicode/ucsdet.h>
 
 #include <vector>
 #include <climits>
@@ -85,7 +86,6 @@ SvParser<T>::SvParser( SvStream& rIn, sal_uInt8 nStackSize )
     , eSrcEnc( RTL_TEXTENCODING_DONTKNOW )
     , nNextChPos(0)
     , nNextCh(0)
-    , bUCS2BSrcEnc(false)
     , bSwitchToUCS2(false)
     , bRTF_InTextRead(false)
     , nTokenStackSize( nStackSize )
@@ -188,87 +188,66 @@ sal_uInt32 SvParser<T>::GetNextChar()
     // When reading multiple bytes, we don't have to care about the file
     // position when we run into the pending state. The file position is
     // maintained by SaveState/RestoreState.
-    bool bErr;
     if( bSwitchToUCS2 && 0 == rInput.Tell() )
     {
-        unsigned char c1;
-        bool bSeekBack = true;
-
-        rInput.ReadUChar( c1 );
-        bErr = !rInput.good();
-        if( !bErr )
+        rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW);
+        if (rInput.good())
         {
-            if( 0xff == c1 || 0xfe == c1 )
-            {
-                unsigned char c2;
-                rInput.ReadUChar( c2 );
-                bErr = !rInput.good();
-                if( !bErr )
-                {
-                    if( 0xfe == c1 && 0xff == c2 )
-                    {
-                        eSrcEnc = RTL_TEXTENCODING_UCS2;
-                        bUCS2BSrcEnc = true;
-                        bSeekBack = false;
-                    }
-                    else if( 0xff == c1 && 0xfe == c2 )
-                    {
-                        eSrcEnc = RTL_TEXTENCODING_UCS2;
-                        bUCS2BSrcEnc = false;
-                        bSeekBack = false;
-                    }
-                }
-            }
-            else if( 0xef == c1 || 0xbb == c1 ) // check for UTF-8 BOM
+            sal_uInt64 nPos = rInput.Tell();
+            if (nPos == 2)
+                eSrcEnc = RTL_TEXTENCODING_UCS2;
+            else if (nPos == 3)
+                SetSrcEncoding(RTL_TEXTENCODING_UTF8);
+            else // Try to detect encoding without BOM
             {
-                unsigned char c2;
-                rInput.ReadUChar( c2 );
-                bErr = !rInput.good();
-                if( !bErr )
+                std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer
+                const size_t nSize = rInput.ReadBytes(buf.data(), buf.size());
+                rInput.Seek(0);
+                if (nSize > 0)
                 {
-                    if( ( 0xef == c1 && 0xbb == c2 ) || ( 0xbb == c1 && 0xef == c2 ) )
+                    UErrorCode uerr = U_ZERO_ERROR;
+                    UCharsetDetector* ucd = ucsdet_open(&uerr);
+                    ucsdet_setText(ucd, buf.data(), nSize, &uerr);
+                    if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr))
                     {
-                        unsigned char c3(0);
-                        rInput.ReadUChar( c3 );
-                        bErr = !rInput.good();
-                        if( !bErr && ( 0xbf == c3 ) )
+                        const char* pEncodingName = ucsdet_getName(match, &uerr);
+
+                        if (U_SUCCESS(uerr))
                         {
-                            SetSrcEncoding(RTL_TEXTENCODING_UTF8);
-                            bSeekBack = false;
+                            if (strcmp("UTF-8", pEncodingName) == 0)
+                            {
+                                SetSrcEncoding(RTL_TEXTENCODING_UTF8);
+                            }
+                            else if (strcmp("UTF-16LE", pEncodingName) == 0)
+                            {
+                                eSrcEnc = RTL_TEXTENCODING_UCS2;
+                                rInput.SetEndian(SvStreamEndian::LITTLE);
+                            }
+                            else if (strcmp("UTF-16BE", pEncodingName) == 0)
+                            {
+                                eSrcEnc = RTL_TEXTENCODING_UCS2;
+                                rInput.SetEndian(SvStreamEndian::BIG);
+                            }
                         }
                     }
+
+                    ucsdet_close(ucd);
                 }
             }
         }
-        if( bSeekBack )
-            rInput.Seek( 0 );
-
         bSwitchToUCS2 = false;
     }
 
+    bool bErr;
     nNextChPos = rInput.Tell();
 
     if( RTL_TEXTENCODING_UCS2 == eSrcEnc )
     {
-        unsigned char c1, c2;
-
-        rInput.ReadUChar( c1 ).ReadUChar( c2 );
-        if( 2 == rInput.Tell() && rInput.good() &&
-            ( (bUCS2BSrcEnc && 0xfe == c1 && 0xff == c2) ||
-              (!bUCS2BSrcEnc && 0xff == c1 && 0xfe == c2) ) )
-            rInput.ReadUChar( c1 ).ReadUChar( c2 );
-
+        sal_Unicode cUC;
+        rInput.ReadUtf16(cUC);
         bErr = !rInput.good();
         if( !bErr )
-        {
-            sal_Unicode cUC = USHRT_MAX;
-            if( bUCS2BSrcEnc )
-                cUC = (sal_Unicode(c1) << 8) | c2;
-            else
-                cUC = (sal_Unicode(c2) << 8) | c1;
-
             c = cUC;
-        }
     }
     else
     {

Reply via email to