sw/qa/extras/rtfexport/data/tdf95706.rtf       |   36 ++++++++++++++
 sw/qa/extras/rtfexport/rtfexport4.cxx          |   61 +++++++++++++++++++++++++
 writerfilter/source/rtftok/rtfcharsets.cxx     |    9 +++
 writerfilter/source/rtftok/rtfcharsets.hxx     |   13 +++++
 writerfilter/source/rtftok/rtfdocumentimpl.cxx |   39 +++++++++++++++
 5 files changed, 156 insertions(+), 2 deletions(-)

New commits:
commit d72dece2bc61e3bab8db5968d53dc0e98a3bea4d
Author:     Vasily Melenchuk <vasily.melenc...@cib.de>
AuthorDate: Tue Apr 5 19:13:05 2022 +0300
Commit:     Miklos Vajna <vmik...@collabora.com>
CommitDate: Fri Apr 8 11:20:10 2022 +0200

    tdf#95706: RTF import: Use fontname suffixes to detect encoding
    
    Font names like "Arial CE" or "Times New Roman Cyr" are not special
    fonts: they are the classic Arial, Times New Roman, etc., and their
    suffixes can be used to detect the encoding used for the RTF text.
    
    Most interestingly, in MS Word these suffixes take priority:
    {\f34\cpg1253\fcharset161 Arial Baltic;} will use cp1257
    and not cp1253.
    
    This looks like a compatibility quirk that dates back to the dark ages.
    
    Change-Id: Ife8e781d5d04c3f6a8c11fcf604357c74bf33055
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/132584
    Tested-by: Jenkins
    Reviewed-by: Miklos Vajna <vmik...@collabora.com>
    Signed-off-by: Xisco Fauli <xiscofa...@libreoffice.org>
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/132681
    Reviewed-by: Michael Stahl <michael.st...@allotropia.de>

diff --git a/sw/qa/extras/rtfexport/data/tdf95706.rtf 
b/sw/qa/extras/rtfexport/data/tdf95706.rtf
new file mode 100644
index 000000000000..64c97930441d
--- /dev/null
+++ b/sw/qa/extras/rtfexport/data/tdf95706.rtf
@@ -0,0 +1,36 @@
+{\rtf\ansi
+{\fonttbl
+{\f1 Arial Baltic;}
+{\f2 Arial CE;}
+{\f3 Arial Cyr;}
+{\f4 Arial Greek;}
+{\f5 Arial Tur;}
+{\f6 Arial (Hebrew);}
+{\f7 Arial (Arabic);}
+{\f8 Arial (Vietnamese);}
+{\f9 Arial BlaBlaBla;}
+
+{\f10\cpg1253\fcharset161 Arial;}
+{\f11\fcharset161 Arial;}
+{\f12\cpg1253 Arial;}
+{\f13\cpg1253\fcharset161 Arial Baltic;}
+{\f14 Arial Baltic;\cpg1253\fcharset161}
+
+}
+\pard Font name suffixes:\par
+\pard\f1\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f2\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f3\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f4\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f5\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f6\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f7\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f8\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f9\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard Font entry charset values:\par
+\pard\f10\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f11\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f12\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f13\fs26 \'c0\'c1\'c2\'c3\'c4\par
+\pard\f14\fs26 \'c0\'c1\'c2\'c3\'c4\par
+}
diff --git a/sw/qa/extras/rtfexport/rtfexport4.cxx 
b/sw/qa/extras/rtfexport/rtfexport4.cxx
index 32f1a5f88cad..1f1434054085 100644
--- a/sw/qa/extras/rtfexport/rtfexport4.cxx
+++ b/sw/qa/extras/rtfexport/rtfexport4.cxx
@@ -461,6 +461,67 @@ CPPUNIT_TEST_FIXTURE(Test, testGutterTop)
     CPPUNIT_ASSERT(bGutterAtTop);
 }
 
+DECLARE_RTFEXPORT_TEST(testTdf95706, "tdf95706.rtf")
+{
+    uno::Reference<text::XTextRange> xRun2
+        = getRun(getParagraph(2), 1, u"\u0104\u012e\u0100\u0106\u00c4");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun2, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun3
+        = getRun(getParagraph(3), 1, u"\u0154\u00c1\u00c2\u0102\u00c4");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun3, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun4
+        = getRun(getParagraph(4), 1, u"\u0410\u0411\u0412\u0413\u0414");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun4, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun5
+        = getRun(getParagraph(5), 1, u"\u0390\u0391\u0392\u0393\u0394");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun5, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun6
+        = getRun(getParagraph(6), 1, u"\u00c0\u00c1\u00c2\u00c3\u00c4");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun6, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun7
+        = getRun(getParagraph(7), 1, u"\u05b0\u05b1\u05b2\u05b3\u05b4");
+    // Do not check font for Hebrew: it can be substituted by something able to 
handle these chars
+    //CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun7, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun8
+        = getRun(getParagraph(8), 1, u"\u06c1\u0621\u0622\u0623\u0624");
+    // Do not check font for Arabic: it can be substituted by something able to 
handle these chars
+    //CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun8, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun9
+        = getRun(getParagraph(9), 1, u"\u00c0\u00c1\u00c2\u0102\u00c4");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun9, 
"CharFontName"));
+
+    // Ensure a font with an unknown suffix keeps its full name. No reason to check content: in 
this case it can vary on locale
+    uno::Reference<text::XTextRange> xRun10 = getRun(getParagraph(10), 1);
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial BlaBlaBla"),
+                         getProperty<OUString>(xRun10, "CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun12
+        = getRun(getParagraph(12), 1, u"\u0390\u0391\u0392\u0393\u0394");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun12, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun13
+        = getRun(getParagraph(13), 1, u"\u0390\u0391\u0392\u0393\u0394");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun13, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun14 = getRun(getParagraph(14), 1);
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun14, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun15
+        = getRun(getParagraph(15), 1, u"\u0104\u012e\u0100\u0106\u00c4");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun15, 
"CharFontName"));
+
+    uno::Reference<text::XTextRange> xRun16
+        = getRun(getParagraph(16), 1, u"\u0104\u012e\u0100\u0106\u00c4");
+    CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun16, 
"CharFontName"));
+}
+
 DECLARE_RTFEXPORT_TEST(testTdf111851, "tdf111851.rtf")
 {
     uno::Reference<text::XTextTable> xTable(getParagraphOrTable(1), 
uno::UNO_QUERY);
diff --git a/writerfilter/source/rtftok/rtfcharsets.cxx 
b/writerfilter/source/rtftok/rtfcharsets.cxx
index 69e416ce8bc5..886e161e3c89 100644
--- a/writerfilter/source/rtftok/rtfcharsets.cxx
+++ b/writerfilter/source/rtftok/rtfcharsets.cxx
@@ -9,6 +9,7 @@
 
 #include "rtfcharsets.hxx"
 #include <sal/macros.h>
+#include <rtl/textenc.h>
 
 namespace writerfilter::rtftok
 {
@@ -50,6 +51,14 @@ RTFEncoding const aRTFEncodings[] = {
 
 int nRTFEncodings = SAL_N_ELEMENTS(aRTFEncodings);
 
+RTFFontNameSuffix const aRTFFontNameSuffixes[] = {
+    { "Baltic", RTL_TEXTENCODING_MS_1257 },   { "CE", RTL_TEXTENCODING_MS_1250 
},
+    { "Cyr", RTL_TEXTENCODING_MS_1251 },      { "Greek", 
RTL_TEXTENCODING_MS_1253 },
+    { "Tur", RTL_TEXTENCODING_MS_1254 },      { "(Hebrew)", 
RTL_TEXTENCODING_MS_1255 },
+    { "(Arabic)", RTL_TEXTENCODING_MS_1256 }, { "(Vietnamese)", 
RTL_TEXTENCODING_MS_1258 },
+    { "", RTL_TEXTENCODING_DONTKNOW } // Sentinel entry: lookups stop at RTL_TEXTENCODING_DONTKNOW
+};
+
 } // namespace writerfilter::rtftok
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/writerfilter/source/rtftok/rtfcharsets.hxx 
b/writerfilter/source/rtftok/rtfcharsets.hxx
index 865a9310289e..826dea271f6b 100644
--- a/writerfilter/source/rtftok/rtfcharsets.hxx
+++ b/writerfilter/source/rtftok/rtfcharsets.hxx
@@ -19,6 +19,19 @@ struct RTFEncoding
 };
 extern RTFEncoding const aRTFEncodings[];
 extern int nRTFEncodings;
+
+/// A font name can end with a special suffix (its last space-separated
+/// word) that determines the encoding of the given font table entry.
+/// For example, "Arial CE" is "Arial" with the CP1250 encoding; codepage
+/// holds the matching RTL_TEXTENCODING_* value. The list of suffixes is not
+/// official: it was found empirically, so it may be inexact and incomplete.
+struct RTFFontNameSuffix
+{
+    const char* suffix;
+    int codepage;
+};
+extern RTFFontNameSuffix const aRTFFontNameSuffixes[];
+
 } // namespace writerfilter::rtftok
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/writerfilter/source/rtftok/rtfdocumentimpl.cxx 
b/writerfilter/source/rtftok/rtfdocumentimpl.cxx
index af4ba47158c3..8ea858abada5 100644
--- a/writerfilter/source/rtftok/rtfdocumentimpl.cxx
+++ b/writerfilter/source/rtftok/rtfdocumentimpl.cxx
@@ -50,6 +50,7 @@
 #include "rtfskipdestination.hxx"
 #include "rtftokenizer.hxx"
 #include "rtflookahead.hxx"
+#include "rtfcharsets.hxx"
 
 using namespace com::sun::star;
 
@@ -1370,14 +1371,48 @@ void RTFDocumentImpl::text(OUString& rString)
                     case Destination::FONTTABLE:
                     case Destination::FONTENTRY:
                     {
-                        m_aFontNames[m_nCurrentFontIndex] = aName;
+                        // Old documents can contain no encoding information 
in fontinfo,
+                        // but there can be font name suffixes: Arial CE is 
not a special
+                        // font, it is ordinary Arial, but using the cp 1250 
encoding.
+                        // Moreover these suffixes have priority over \cpgN 
and \fcharsetN
+                        // in MS Word.
+                        OUString aFontSuffix;
+                        OUString aNameNoSuffix(aName);
+                        sal_Int32 nLastSpace = aName.lastIndexOf(' ');
+                        if (nLastSpace >= 0)
+                        {
+                            aFontSuffix = aName.copy(nLastSpace + 1);
+                            aNameNoSuffix = aName.copy(0, nLastSpace);
+                            sal_Int32 nEncoding = RTL_TEXTENCODING_DONTKNOW;
+                            for (int i = 0;
+                                 aRTFFontNameSuffixes[i].codepage != 
RTL_TEXTENCODING_DONTKNOW; i++)
+                            {
+                                if 
(aFontSuffix.equalsAscii(aRTFFontNameSuffixes[i].suffix))
+                                {
+                                    nEncoding = 
aRTFFontNameSuffixes[i].codepage;
+                                    break;
+                                }
+                            }
+                            if (nEncoding > RTL_TEXTENCODING_DONTKNOW)
+                            {
+                                m_nCurrentEncoding = nEncoding;
+                                
m_aStates.top().setCurrentEncoding(m_nCurrentEncoding);
+                            }
+                            else
+                            {
+                                // Unknown suffix: looks like it is just a 
part of font name, restore it
+                                aNameNoSuffix = aName;
+                            }
+                        }
+
+                        m_aFontNames[m_nCurrentFontIndex] = aNameNoSuffix;
                         if (m_nCurrentEncoding >= 0)
                         {
                             m_aFontEncodings[m_nCurrentFontIndex] = 
m_nCurrentEncoding;
                             m_nCurrentEncoding = -1;
                         }
                         
m_aStates.top().getTableAttributes().set(NS_ooxml::LN_CT_Font_name,
-                                                                 new 
RTFValue(aName));
+                                                                 new 
RTFValue(aNameNoSuffix));
 
                         writerfilter::Reference<Properties>::Pointer_t const 
pProp(
                             new 
RTFReferenceProperties(m_aStates.top().getTableAttributes(),

Reply via email to