Hi,

I came across gendict.cxx while fixing a possible memory leak. It took me some
time to figure out what the code did.
I notice a lot of very long function bodies in the LO code, and gendict was no
exception. So I refactored the code, did some Google searches on gendict, and
was able to fix the memory leak.
I ended up submitting only the fix, not the refactoring, because I didn't
want to break any de facto coding style guidelines and I am still a fairly
new contributor to LO.

I am submitting the refactoring patches now, so you can decide whether to push
them or not, in case they could save some other new contributor some time in
understanding gendict ;)
BTW, I tested the code on ja.dic and the output is still the same.

-- Kenneth
From 7155a3675cf3410ab74dc7032bceaef719548d3c Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.ven...@gmail.com>
Date: Thu, 27 Jan 2011 22:27:24 +0100
Subject: [PATCH 1/8] added some documentation to gendict

---
 i18npool/source/breakiterator/gendict.cxx |   17 ++++++++++++++++-
 1 files changed, 16 insertions(+), 1 deletions(-)

diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 9f49f67..8a6354b 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -41,7 +41,22 @@ using std::vector;
 
 using namespace ::rtl;
 
-/* Main Procedure */
+/* Utility gendict:
+
+   "BreakIterator_CJK provides input string caching and dictionary searching for
+   longest matching. You can provide a sorted dictionary (the encoding must be
+   UTF-8) by creating the following file:
+            i18npool/source/breakiterator/data/<language>.dict.
+
+   The utility gendict will convert the file to C code, which will be compiled
+   into a shared library for dynamic loading.
+
+   All dictionary searching and loading is performed in the xdictionary class.
+   The only thing you need to do is to derive your class from BreakIterator_CJK
+   and create an instance of the xdictionary with the language name and
+   pass it to the parent class." (from http://wiki.services.openoffice.org/wiki/
+   /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
+*/
 
 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
 {
-- 
1.7.1

From d44b67f81303f56246f93a6eb6419f371d5b215f Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.ven...@gmail.com>
Date: Thu, 27 Jan 2011 22:43:49 +0100
Subject: [PATCH 2/8] refactored out some simple print functions

---
 i18npool/source/breakiterator/gendict.cxx |   35 +++++++++++++++++++----------
 1 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 8a6354b..df7e144 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -58,6 +58,9 @@ using namespace ::rtl;
    /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
 */
 
+void printIncludes(FILE *source_fp);
+void printFunctions(FILE *source_fp);
+
 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
 {
     FILE *sfp, *cfp;
@@ -79,12 +82,7 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
         return -1;
     }
 
-    fprintf(cfp, "/*\n");
-    fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
-    fprintf(cfp, " * All Rights Reserved.\n");
-    fprintf(cfp, " */\n\n");
-    fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!! */\n\n");
-    fprintf(cfp, "#include <sal/types.h>\n\n");
+    printIncludes(cfp);
     fprintf(cfp, "extern \"C\" {\n");
 
     sal_Int32 count, i, j;
@@ -209,12 +207,6 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
     }
     fprintf (cfp, "\n};\n");
 
-    // create function to return arrays
-    fprintf (cfp, "\tconst sal_uInt8* getExistMark() { return existMark; }\n");
-    fprintf (cfp, "\tconst sal_Int16* getIndex1() { return index1; }\n");
-    fprintf (cfp, "\tconst sal_Int32* getIndex2() { return index2; }\n");
-    fprintf (cfp, "\tconst sal_Int32* getLenArray() { return lenArray; }\n");
-    fprintf (cfp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
     fprintf (cfp, "}\n");
 
     fclose(sfp);
@@ -223,4 +215,23 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
     return 0;
 }	// End of main
 
+void printIncludes(FILE* source_fp)
+{
+    fprintf(source_fp, "/*\n");
+    fprintf(source_fp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
+    fprintf(source_fp, " * All Rights Reserved.\n");
+    fprintf(source_fp, " */\n\n");
+    fprintf(source_fp, "/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n");
+    fprintf(source_fp, "#include <sal/types.h>\n\n");
+}
+
+void printFunctions(FILE* source_fp)
+{
+    fprintf (source_fp, "\tconst sal_uInt8* getExistMark() { return existMark; }\n");
+    fprintf (source_fp, "\tconst sal_Int16* getIndex1() { return index1; }\n");
+    fprintf (source_fp, "\tconst sal_Int32* getIndex2() { return index2; }\n");
+    fprintf (source_fp, "\tconst sal_Int32* getLenArray() { return lenArray; }\n");
+    fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
+}
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
-- 
1.7.1

From f7c0500f2f74b785b27d185eb533e49de6753160 Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.ven...@gmail.com>
Date: Thu, 27 Jan 2011 23:02:11 +0100
Subject: [PATCH 3/8] refactored out dataArea

---
 i18npool/source/breakiterator/gendict.cxx |  104 +++++++++++++++++------------
 1 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index df7e144..eb654f5 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -59,6 +59,10 @@ using namespace ::rtl;
 */
 
 void printIncludes(FILE *source_fp);
+void initArrays(sal_Bool *exists, sal_Int32 *charArray);
+void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
+                   sal_Int32 lenArrayCurr, sal_Int32 *charArray,
+                   vector<sal_Int32>& lenArray, sal_Bool *exists);
 void printFunctions(FILE *source_fp);
 
 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
@@ -89,51 +93,9 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
     sal_Int32 lenArrayCurr = 0, charArray[0x10000];
     vector<sal_Int32> lenArray;
     sal_Bool exist[0x10000];
-    for (i = 0; i < 0x10000; i++) {
-        exist[i] = sal_False;
-        charArray[i] = 0;
-    }
-
-    // generate main dict. data array
-    fprintf(cfp, "static const sal_Unicode dataArea[] = {");
-    sal_Char str[1024];
-    sal_Unicode current = 0;
-    count = 0;
-    while (fgets(str, 1024, sfp)) {
-        // input file is in UTF-8 encoding
-        // don't convert last new line character to Ostr.
-        OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
-        const sal_Unicode *u = Ostr.getStr();
-
-        sal_Int32 len = Ostr.getLength();
-
-        i=0;
-        Ostr.iterateCodePoints(&i, 1);
-        if (len == i) continue;	// skip one character word
-
-        if (*u != current) {
-        if (*u < current)
-        printf("u %x, current %x, count %d, lenArray.size() %d\n", *u, current,
-                    sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size()));
-        current = *u;
-        charArray[current] = lenArray.size();
-        }
+    initArrays( exist, charArray );
 
-        lenArray.push_back(lenArrayCurr);
-
-        exist[u[0]] = sal_True;
-        for (i = 1; i < len; i++) {		// start from second character, 
-        exist[u[i]] = sal_True; 	// since the first character is captured in charArray.
-        lenArrayCurr++;
-        if ((count++) % 0x10 == 0)
-            fprintf(cfp, "\n\t");
-        fprintf(cfp, "0x%04x, ", u[i]);
-        }
-    }
-    lenArray.push_back( lenArrayCurr ); // store last ending pointer
-
-    charArray[current+1] = lenArray.size();
-    fprintf(cfp, "\n};\n");
+    printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist);
 
     // generate lenArray 
     fprintf(cfp, "static const sal_Int32 lenArray[] = {\n\t");
@@ -215,6 +177,14 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
     return 0;
 }	// End of main
 
+void initArrays(sal_Bool* exists, sal_Int32* charArray)
+{
+    for (sal_Int32 i = 0; i < 0x10000; i++) {
+        exists[i] = sal_False;
+        charArray[i] = 0;
+    }
+}
+
 void printIncludes(FILE* source_fp)
 {
     fprintf(source_fp, "/*\n");
@@ -234,4 +204,50 @@ void printFunctions(FILE* source_fp)
     fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
 }
 
+void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
+                   sal_Int32 lenArrayCurr, sal_Int32 *charArray,
+                   vector<sal_Int32>& lenArray, sal_Bool *exists)
+{
+    // generate main dict. data array
+    fprintf(cfp, "static const sal_Unicode dataArea[] = {");
+    sal_Char str[1024];
+    sal_Unicode current = 0;
+    count = 0;
+    while (fgets(str, 1024, sfp)) {
+        // input file is in UTF-8 encoding
+        // don't convert last new line character to Ostr.
+        OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
+        const sal_Unicode *u = Ostr.getStr();
+
+        sal_Int32 len = Ostr.getLength();
+
+        i=0;
+        Ostr.iterateCodePoints(&i, 1);
+        if (len == i) continue;	// skip one character word
+
+        if (*u != current) {
+        if (*u < current)
+        printf("u %x, current %x, count %d, lenArray.size() %d\n", *u, current,
+                    sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size()));
+        current = *u;
+        charArray[current] = lenArray.size();
+        }
+
+        lenArray.push_back(lenArrayCurr);
+
+        exists[u[0]] = sal_True;
+        for (i = 1; i < len; i++) {		// start from second character,
+        exists[u[i]] = sal_True; 	// since the first character is captured in charArray.
+        lenArrayCurr++;
+        if ((count++) % 0x10 == 0)
+            fprintf(cfp, "\n\t");
+        fprintf(cfp, "0x%04x, ", u[i]);
+        }
+    }
+    lenArray.push_back( lenArrayCurr ); // store last ending pointer
+
+    charArray[current+1] = lenArray.size();
+    fprintf(cfp, "\n};\n");
+}
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
-- 
1.7.1

From 1e5a7d4cd5e13895ce4db4f0fea85f8e77ffe876 Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.ven...@gmail.com>
Date: Thu, 27 Jan 2011 23:43:38 +0100
Subject: [PATCH 4/8] refactored out all array functions

---
 i18npool/source/breakiterator/gendict.cxx |  188 ++++++++++++++++-------------
 1 files changed, 104 insertions(+), 84 deletions(-)

diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index eb654f5..315acfa 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -60,9 +60,15 @@ using namespace ::rtl;
 
 void printIncludes(FILE *source_fp);
 void initArrays(sal_Bool *exists, sal_Int32 *charArray);
-void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
+void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
                    sal_Int32 lenArrayCurr, sal_Int32 *charArray,
                    vector<sal_Int32>& lenArray, sal_Bool *exists);
+void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray,
+                   sal_Int32 count);
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count,
+                 sal_Int16 *set);
+void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
+void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count);
 void printFunctions(FILE *source_fp);
 
 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
@@ -86,89 +92,21 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
         return -1;
     }
 
-    printIncludes(cfp);
-    fprintf(cfp, "extern \"C\" {\n");
-
-    sal_Int32 count, i, j;
+    sal_Int32 count, i;
     sal_Int32 lenArrayCurr = 0, charArray[0x10000];
     vector<sal_Int32> lenArray;
     sal_Bool exist[0x10000];
-    initArrays( exist, charArray );
-
-    printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist);
-
-    // generate lenArray 
-    fprintf(cfp, "static const sal_Int32 lenArray[] = {\n\t");
-    count = 1;
-    fprintf(cfp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
-    for (size_t k = 0; k < lenArray.size(); k++)
-    {
-        fprintf(cfp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k]));
-        if (count == 0xf)
-        {
-            count = 0;
-            fprintf(cfp, "\n\t");
-        }
-            else count++;
-    }
-    fprintf(cfp, "\n};\n");
-
-    // generate index1 array
-    fprintf (cfp, "static const sal_Int16 index1[] = {\n\t");
     sal_Int16 set[0x100];
-    count = 0;
-    for (i = 0; i < 0x100; i++) {
-        for (j = 0; j < 0x100; j++)
-        if (charArray[(i*0x100) + j] != 0)
-            break;
-
-        fprintf(cfp, "0x%02x, ", set[i] = (j < 0x100 ? sal::static_int_cast<sal_Int16>(count++) : 0xff));
-        if ((i+1) % 0x10 == 0)
-        fprintf (cfp, "\n\t");
-    }
-    fprintf (cfp, "};\n");
-
-    // generate index2 array
-    fprintf (cfp, "static const sal_Int32 index2[] = {\n\t");
-    sal_Int32 prev = 0;
-    for (i = 0; i < 0x100; i++) {
-        if (set[i] != 0xff) {
-        for (j = 0; j < 0x100; j++) {
-            sal_Int32 k = (i*0x100) + j;
-            if (prev != 0 && charArray[k] == 0) {
-            for (k++; k < 0x10000; k++)
-                if (charArray[k] != 0)
-                break;
-            }
-            prev = charArray[(i*0x100) + j];
-            fprintf(
-                cfp, "0x%lx, ",
-                sal::static_int_cast< unsigned long >(
-                    k < 0x10000 ? charArray[k] + 1 : 0));
-            if ((j+1) % 0x10 == 0)
-            fprintf (cfp, "\n\t");
-        }
-        fprintf (cfp, "\n\t");
-        }
-    }
-    fprintf (cfp, "\n};\n");
-
-    // generate existMark array
-    count = 0;
-    fprintf (cfp, "static const sal_uInt8 existMark[] = {\n\t");
-    for (i = 0; i < 0x1FFF; i++) {
-        sal_uInt8 bit = 0;
-        for (j = 0; j < 8; j++)
-        if (exist[i * 8 + j])
-            bit |= 1 << j;
-        fprintf(cfp, "0x%02x, ", bit);
-        if (count == 0xf) {
-        count = 0;
-        fprintf(cfp, "\n\t");
-        } else count++;
-    }
-    fprintf (cfp, "\n};\n");
+    initArrays( exist, charArray );
 
+    printIncludes(cfp);
+    fprintf(cfp, "extern \"C\" {\n");
+        printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist);
+        printLenArray(cfp, lenArray, count);
+        printIndex1(cfp, charArray, count, set);
+        printIndex2(cfp, charArray, set);
+        printExistMark(cfp, exist, count);
+        printFunctions(cfp);
     fprintf (cfp, "}\n");
 
     fclose(sfp);
@@ -204,12 +142,12 @@ void printFunctions(FILE* source_fp)
     fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
 }
 
-void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
+void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
                    sal_Int32 lenArrayCurr, sal_Int32 *charArray,
                    vector<sal_Int32>& lenArray, sal_Bool *exists)
 {
     // generate main dict. data array
-    fprintf(cfp, "static const sal_Unicode dataArea[] = {");
+    fprintf(source_fp, "static const sal_Unicode dataArea[] = {");
     sal_Char str[1024];
     sal_Unicode current = 0;
     count = 0;
@@ -240,14 +178,96 @@ void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
         exists[u[i]] = sal_True; 	// since the first character is captured in charArray.
         lenArrayCurr++;
         if ((count++) % 0x10 == 0)
-            fprintf(cfp, "\n\t");
-        fprintf(cfp, "0x%04x, ", u[i]);
+            fprintf(source_fp, "\n\t");
+        fprintf(source_fp, "0x%04x, ", u[i]);
         }
     }
     lenArray.push_back( lenArrayCurr ); // store last ending pointer
 
     charArray[current+1] = lenArray.size();
-    fprintf(cfp, "\n};\n");
+    fprintf(source_fp, "\n};\n");
+}
+
+void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray,
+                   sal_Int32 count)
+{
+    fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
+    count = 1;
+    fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
+    for (size_t k = 0; k < lenArray.size(); k++)
+    {
+        fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k]));
+        if (count == 0xf)
+        {
+            count = 0;
+            fprintf(source_fp, "\n\t");
+        }
+            else count++;
+    }
+    fprintf(source_fp, "\n};\n");
+}
+
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count,
+                 sal_Int16 *set)
+{
+    fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
+    count = 0;
+    sal_Int32 j;
+    for (sal_Int32 i = 0; i < 0x100; i++) {
+        for (j = 0; j < 0x100; j++)
+        if (charArray[(i*0x100) + j] != 0)
+            break;
+
+        fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? sal::static_int_cast<sal_Int16>(count++) : 0xff));
+        if ((i+1) % 0x10 == 0)
+        fprintf (source_fp, "\n\t");
+    }
+    fprintf (source_fp, "};\n");
+}
+
+void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
+{
+    fprintf (source_fp, "static const sal_Int32 index2[] = {\n\t");
+    sal_Int32 prev = 0;
+    for (sal_Int32 i = 0; i < 0x100; i++) {
+        if (set[i] != 0xff) {
+        for (sal_Int32 j = 0; j < 0x100; j++) {
+            sal_Int32 k = (i*0x100) + j;
+            if (prev != 0 && charArray[k] == 0) {
+            for (k++; k < 0x10000; k++)
+                if (charArray[k] != 0)
+                break;
+            }
+            prev = charArray[(i*0x100) + j];
+            fprintf(
+                source_fp, "0x%lx, ",
+                sal::static_int_cast< unsigned long >(
+                    k < 0x10000 ? charArray[k] + 1 : 0));
+            if ((j+1) % 0x10 == 0)
+            fprintf (source_fp, "\n\t");
+        }
+        fprintf (source_fp, "\n\t");
+        }
+    }
+    fprintf (source_fp, "\n};\n");
+}
+
+void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count)
+{
+    count = 0;
+    fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
+    for (sal_Int32 i = 0; i < 0x1FFF; i++) {
+        sal_uInt8 bit = 0;
+        for (sal_Int32 j = 0; j < 8; j++)
+        if (exists[i * 8 + j])
+            bit |= 1 << j;
+        fprintf(source_fp, "0x%02x, ", bit);
+        if (count == 0xf) {
+        count = 0;
+        fprintf(source_fp, "\n\t");
+        } else count++;
+    }
+    fprintf (source_fp, "\n};\n");
 }
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
-- 
1.7.1

From 8df201b3e266826392e6c59fb885d71cecbeeeb7 Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.ven...@gmail.com>
Date: Thu, 27 Jan 2011 23:52:52 +0100
Subject: [PATCH 5/8] reduced scope of some variables

---
 i18npool/source/breakiterator/gendict.cxx |   43 +++++++++++++----------------
 1 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 315acfa..93a359b 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -60,15 +60,12 @@ using namespace ::rtl;
 
 void printIncludes(FILE *source_fp);
 void initArrays(sal_Bool *exists, sal_Int32 *charArray);
-void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
-                   sal_Int32 lenArrayCurr, sal_Int32 *charArray,
+void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 *charArray,
                    vector<sal_Int32>& lenArray, sal_Bool *exists);
-void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray,
-                   sal_Int32 count);
-void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count,
-                 sal_Int16 *set);
+void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray);
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
 void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
-void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count);
+void printExistMark(FILE *source_fp, sal_Bool *exists);
 void printFunctions(FILE *source_fp);
 
 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
@@ -92,7 +89,6 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
         return -1;
     }
 
-    sal_Int32 count, i;
     sal_Int32 lenArrayCurr = 0, charArray[0x10000];
     vector<sal_Int32> lenArray;
     sal_Bool exist[0x10000];
@@ -101,11 +97,11 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
 
     printIncludes(cfp);
     fprintf(cfp, "extern \"C\" {\n");
-        printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist);
-        printLenArray(cfp, lenArray, count);
-        printIndex1(cfp, charArray, count, set);
+        printDataArea(sfp, cfp, charArray, lenArray, exist);
+        printLenArray(cfp, lenArray);
+        printIndex1(cfp, charArray, set);
         printIndex2(cfp, charArray, set);
-        printExistMark(cfp, exist, count);
+        printExistMark(cfp, exist);
         printFunctions(cfp);
     fprintf (cfp, "}\n");
 
@@ -142,15 +138,16 @@ void printFunctions(FILE* source_fp)
     fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
 }
 
-void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
-                   sal_Int32 lenArrayCurr, sal_Int32 *charArray,
+void printDataArea(FILE *sfp, FILE *source_fp,
+                    sal_Int32 *charArray,
                    vector<sal_Int32>& lenArray, sal_Bool *exists)
 {
     // generate main dict. data array
     fprintf(source_fp, "static const sal_Unicode dataArea[] = {");
     sal_Char str[1024];
+    sal_Int32 lenArrayCurr = 0;
     sal_Unicode current = 0;
-    count = 0;
+    sal_Int32 count = 0;
     while (fgets(str, 1024, sfp)) {
         // input file is in UTF-8 encoding
         // don't convert last new line character to Ostr.
@@ -159,7 +156,7 @@ void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
 
         sal_Int32 len = Ostr.getLength();
 
-        i=0;
+        sal_Int32 i=0;
         Ostr.iterateCodePoints(&i, 1);
         if (len == i) continue;	// skip one character word
 
@@ -188,11 +185,10 @@ void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
     fprintf(source_fp, "\n};\n");
 }
 
-void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray,
-                   sal_Int32 count)
+void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray)
 {
     fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
-    count = 1;
+    sal_Int32 count = 1;
     fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
     for (size_t k = 0; k < lenArray.size(); k++)
     {
@@ -207,11 +203,10 @@ void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray,
     fprintf(source_fp, "\n};\n");
 }
 
-void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count,
-                 sal_Int16 *set)
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
 {
     fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
-    count = 0;
+    sal_Int32 count = 0;
     sal_Int32 j;
     for (sal_Int32 i = 0; i < 0x100; i++) {
         for (j = 0; j < 0x100; j++)
@@ -252,9 +247,9 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
     fprintf (source_fp, "\n};\n");
 }
 
-void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count)
+void printExistMark(FILE *source_fp, sal_Bool *exists)
 {
-    count = 0;
+    sal_Int32 count = 0;
     fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
     for (sal_Int32 i = 0; i < 0x1FFF; i++) {
         sal_uInt8 bit = 0;
-- 
1.7.1

From 905f8fecf95f73d5d20f27ac72ff07bc4c7ebb5c Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.ven...@gmail.com>
Date: Fri, 28 Jan 2011 00:14:53 +0100
Subject: [PATCH 6/8] readability changes

---
 i18npool/source/breakiterator/gendict.cxx |  106 +++++++++++++++--------------
 1 files changed, 55 insertions(+), 51 deletions(-)

diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 93a359b..3d0b627 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -41,6 +41,16 @@ using std::vector;
 
 using namespace ::rtl;
 
+void printIncludes(FILE *source_fp);
+void initArrays(sal_Bool *exists, sal_Int32 *charArray);
+void printDataArea(FILE *dictionary_fp, FILE *source_fp, sal_Int32 *charArray,
+                   vector<sal_Int32>& lenArray, sal_Bool *exists);
+void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray);
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
+void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
+void printExistMark(FILE *source_fp, sal_Bool *exists);
+void printFunctions(FILE *source_fp);
+
 /* Utility gendict:
 
    "BreakIterator_CJK provides input string caching and dictionary searching for
@@ -58,58 +68,52 @@ using namespace ::rtl;
    /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
 */
 
-void printIncludes(FILE *source_fp);
-void initArrays(sal_Bool *exists, sal_Int32 *charArray);
-void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 *charArray,
-                   vector<sal_Int32>& lenArray, sal_Bool *exists);
-void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray);
-void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
-void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
-void printExistMark(FILE *source_fp, sal_Bool *exists);
-void printFunctions(FILE *source_fp);
-
 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
 {
-    FILE *sfp, *cfp;
+    FILE *dictionary_fp, *source_fp;
 
-    if (argc < 3) exit(-1);
+    if (argc < 3)
+    {
+        printf("2 arguments required: dictionary_file_name source_file_name");
+        exit(-1);
+    }
 
-    sfp = fopen(argv[1], "rb");	// open the source file for read;
-    if (sfp == NULL) 
+    dictionary_fp = fopen(argv[1], "rb");	// open the source file for read;
+    if (dictionary_fp == NULL)
     {
         printf("Open the dictionary source file failed.");
         return -1;
     }
 
     // create the C source file to write
-    cfp = fopen(argv[2], "wb");
-    if (cfp == NULL) {
-        fclose(sfp);
+    source_fp = fopen(argv[2], "wb");
+    if (source_fp == NULL) {
+        fclose(dictionary_fp);
         printf("Can't create the C source file.");
         return -1;
     }
 
-    sal_Int32 lenArrayCurr = 0, charArray[0x10000];
+    sal_Int32 charArray[0x10000];
     vector<sal_Int32> lenArray;
     sal_Bool exist[0x10000];
     sal_Int16 set[0x100];
     initArrays( exist, charArray );
 
-    printIncludes(cfp);
-    fprintf(cfp, "extern \"C\" {\n");
-        printDataArea(sfp, cfp, charArray, lenArray, exist);
-        printLenArray(cfp, lenArray);
-        printIndex1(cfp, charArray, set);
-        printIndex2(cfp, charArray, set);
-        printExistMark(cfp, exist);
-        printFunctions(cfp);
-    fprintf (cfp, "}\n");
+    printIncludes(source_fp);
+    fprintf(source_fp, "extern \"C\" {\n");
+        printDataArea(dictionary_fp, source_fp, charArray, lenArray, exist);
+        printLenArray(source_fp, lenArray);
+        printIndex1(source_fp, charArray, set);
+        printIndex2(source_fp, charArray, set);
+        printExistMark(source_fp, exist);
+        printFunctions(source_fp);
+    fprintf (source_fp, "}\n");
 
-    fclose(sfp);
-    fclose(cfp);
+    fclose(dictionary_fp);
+    fclose(source_fp);
 
     return 0;
-}	// End of main
+}
 
 void initArrays(sal_Bool* exists, sal_Int32* charArray)
 {
@@ -138,7 +142,7 @@ void printFunctions(FILE* source_fp)
     fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
 }
 
-void printDataArea(FILE *sfp, FILE *source_fp,
+void printDataArea(FILE *dictionary_fp, FILE *source_fp,
                     sal_Int32 *charArray,
                    vector<sal_Int32>& lenArray, sal_Bool *exists)
 {
@@ -148,7 +152,7 @@ void printDataArea(FILE *sfp, FILE *source_fp,
     sal_Int32 lenArrayCurr = 0;
     sal_Unicode current = 0;
     sal_Int32 count = 0;
-    while (fgets(str, 1024, sfp)) {
+    while (fgets(str, 1024, dictionary_fp)) {
         // input file is in UTF-8 encoding
         // don't convert last new line character to Ostr.
         OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
@@ -160,23 +164,23 @@ void printDataArea(FILE *sfp, FILE *source_fp,
         Ostr.iterateCodePoints(&i, 1);
         if (len == i) continue;	// skip one character word
 
-        if (*u != current) {
-        if (*u < current)
-        printf("u %x, current %x, count %d, lenArray.size() %d\n", *u, current,
-                    sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size()));
-        current = *u;
-        charArray[current] = lenArray.size();
+        if (u[0] != current) {
+            if (u[0] < current)
+            printf("u %x, current %x, count %d, lenArray.size() %d\n", u[0], current,
+                        sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size()));
+            current = u[0];
+            charArray[current] = lenArray.size();
         }
 
         lenArray.push_back(lenArrayCurr);
 
         exists[u[0]] = sal_True;
         for (i = 1; i < len; i++) {		// start from second character,
-        exists[u[i]] = sal_True; 	// since the first character is captured in charArray.
-        lenArrayCurr++;
-        if ((count++) % 0x10 == 0)
-            fprintf(source_fp, "\n\t");
-        fprintf(source_fp, "0x%04x, ", u[i]);
+            exists[u[i]] = sal_True; 	// since the first character is captured in charArray.
+            lenArrayCurr++;
+            if ((count++) % 0x10 == 0)
+                fprintf(source_fp, "\n\t");
+            fprintf(source_fp, "0x%04x, ", u[i]);
         }
     }
     lenArray.push_back( lenArrayCurr ); // store last ending pointer
@@ -210,12 +214,12 @@ void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
     sal_Int32 j;
     for (sal_Int32 i = 0; i < 0x100; i++) {
         for (j = 0; j < 0x100; j++)
-        if (charArray[(i*0x100) + j] != 0)
-            break;
+            if (charArray[(i*0x100) + j] != 0)
+                break;
 
         fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? sal::static_int_cast<sal_Int16>(count++) : 0xff));
         if ((i+1) % 0x10 == 0)
-        fprintf (source_fp, "\n\t");
+            fprintf (source_fp, "\n\t");
     }
     fprintf (source_fp, "};\n");
 }
@@ -231,7 +235,7 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
             if (prev != 0 && charArray[k] == 0) {
             for (k++; k < 0x10000; k++)
                 if (charArray[k] != 0)
-                break;
+                    break;
             }
             prev = charArray[(i*0x100) + j];
             fprintf(
@@ -254,12 +258,12 @@ void printExistMark(FILE *source_fp, sal_Bool *exists)
     for (sal_Int32 i = 0; i < 0x1FFF; i++) {
         sal_uInt8 bit = 0;
         for (sal_Int32 j = 0; j < 8; j++)
-        if (exists[i * 8 + j])
-            bit |= 1 << j;
+            if (exists[i * 8 + j])
+                bit |= 1 << j;
         fprintf(source_fp, "0x%02x, ", bit);
         if (count == 0xf) {
-        count = 0;
-        fprintf(source_fp, "\n\t");
+            count = 0;
+            fprintf(source_fp, "\n\t");
         } else count++;
     }
     fprintf (source_fp, "\n};\n");
-- 
1.7.1

From 4d317d550bde15112ac2e312636146d02b2d3e03 Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.ven...@gmail.com>
Date: Fri, 28 Jan 2011 00:29:14 +0100
Subject: [PATCH 7/8] changed some loop constructs

---
 i18npool/source/breakiterator/gendict.cxx |   16 +++++++---------
 1 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 3d0b627..90e6f75 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -211,11 +211,10 @@ void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
 {
     fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
     sal_Int32 count = 0;
-    sal_Int32 j;
     for (sal_Int32 i = 0; i < 0x100; i++) {
-        for (j = 0; j < 0x100; j++)
-            if (charArray[(i*0x100) + j] != 0)
-                break;
+        sal_Int32 j = 0;
+        while( j < 0x100 && charArray[(i*0x100) + j] == 0)
+            j++;
 
         fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? sal::static_int_cast<sal_Int16>(count++) : 0xff));
         if ((i+1) % 0x10 == 0)
@@ -232,11 +231,10 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
         if (set[i] != 0xff) {
         for (sal_Int32 j = 0; j < 0x100; j++) {
             sal_Int32 k = (i*0x100) + j;
-            if (prev != 0 && charArray[k] == 0) {
-            for (k++; k < 0x10000; k++)
-                if (charArray[k] != 0)
-                    break;
-            }
+            if (prev != 0 )
+                while( charArray[k] == 0 && k < 0x10000 )
+                    k++;
+
             prev = charArray[(i*0x100) + j];
             fprintf(
                 source_fp, "0x%lx, ",
-- 
1.7.1

From 8e9bff87e00f1324588323e3ff0a4e3779f6250f Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.ven...@gmail.com>
Date: Sun, 30 Jan 2011 00:00:38 +0100
Subject: [PATCH 8/8] more comments

---
 i18npool/source/breakiterator/gendict.cxx |   53 ++++++++++++++++-------------
 1 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 90e6f75..1b70f23 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -93,10 +93,10 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
         return -1;
     }
 
-    sal_Int32 charArray[0x10000];
-    vector<sal_Int32> lenArray;
-    sal_Bool exist[0x10000];
+    vector<sal_Int32> lenArray;   // stores the word boundaries in DataArea
     sal_Int16 set[0x100];
+    sal_Bool exist[0x10000];      // true if unicode character exists
+    sal_Int32 charArray[0x10000]; // keeps track where words beginning with a certain char are stored in DataArea
     initArrays( exist, charArray );
 
     printIncludes(source_fp);
@@ -142,8 +142,7 @@ void printFunctions(FILE* source_fp)
     fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
 }
 
-void printDataArea(FILE *dictionary_fp, FILE *source_fp,
-                    sal_Int32 *charArray,
+void printDataArea(FILE *dictionary_fp, FILE *source_fp, sal_Int32 *charArray,
                    vector<sal_Int32>& lenArray, sal_Bool *exists)
 {
     // generate main dict. data array
@@ -162,7 +161,8 @@ void printDataArea(FILE *dictionary_fp, FILE *source_fp,
 
         sal_Int32 i=0;
         Ostr.iterateCodePoints(&i, 1);
-        if (len == i) continue;	// skip one character word
+        if (len == i)
+            continue;	// skip one character word
 
         if (u[0] != current) {
             if (u[0] < current)
@@ -184,7 +184,6 @@ void printDataArea(FILE *dictionary_fp, FILE *source_fp,
         }
     }
     lenArray.push_back( lenArrayCurr ); // store last ending pointer
-
     charArray[current+1] = lenArray.size();
     fprintf(source_fp, "\n};\n");
 }
@@ -207,6 +206,9 @@ void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray)
     fprintf(source_fp, "\n};\n");
 }
 
+/* FIXME?: if every 0x100-wide range i contains at least one charArray entry != 0,
+       index1[] becomes {0x00, 0x01, 0x02, ... 0xfe, 0xff}, so the last range's index (0xff)
+       collides with the "empty range" marker 0xff and printIndex2 wrongly skips that range */
 void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
 {
     fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
@@ -229,26 +231,28 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
     sal_Int32 prev = 0;
     for (sal_Int32 i = 0; i < 0x100; i++) {
         if (set[i] != 0xff) {
-        for (sal_Int32 j = 0; j < 0x100; j++) {
-            sal_Int32 k = (i*0x100) + j;
-            if (prev != 0 )
-                while( charArray[k] == 0 && k < 0x10000 )
-                    k++;
-
-            prev = charArray[(i*0x100) + j];
-            fprintf(
-                source_fp, "0x%lx, ",
-                sal::static_int_cast< unsigned long >(
-                    k < 0x10000 ? charArray[k] + 1 : 0));
-            if ((j+1) % 0x10 == 0)
+            for (sal_Int32 j = 0; j < 0x100; j++) {
+                sal_Int32 k = (i*0x100) + j;
+                if (prev != 0 )
+                    while( charArray[k] == 0 && k < 0x10000 )
+                        k++;
+
+                prev = charArray[(i*0x100) + j];
+                fprintf(
+                    source_fp, "0x%lx, ",
+                    sal::static_int_cast< unsigned long >(
+                        k < 0x10000 ? charArray[k] + 1 : 0));
+                if ((j+1) % 0x10 == 0)
+                    fprintf (source_fp, "\n\t");
+            }
             fprintf (source_fp, "\n\t");
         }
-        fprintf (source_fp, "\n\t");
-        }
     }
     fprintf (source_fp, "\n};\n");
 }
 
+/* Generates a bitmask for the existence of sal_Unicode values in dictionary;
+   it packs 8 sal_Bool values in 1 sal_uInt8 */
 void printExistMark(FILE *source_fp, sal_Bool *exists)
 {
     sal_Int32 count = 0;
@@ -256,13 +260,14 @@ void printExistMark(FILE *source_fp, sal_Bool *exists)
     for (sal_Int32 i = 0; i < 0x1FFF; i++) {
         sal_uInt8 bit = 0;
         for (sal_Int32 j = 0; j < 8; j++)
-            if (exists[i * 8 + j])
-                bit |= 1 << j;
+            bit |= (exists[i * 8 + j]) << j;
+
         fprintf(source_fp, "0x%02x, ", bit);
         if (count == 0xf) {
             count = 0;
             fprintf(source_fp, "\n\t");
-        } else count++;
+        } else
+            count++;
     }
     fprintf (source_fp, "\n};\n");
 }
-- 
1.7.1

_______________________________________________
LibreOffice mailing list
LibreOffice@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/libreoffice

Reply via email to