Hi, I came across gendict.cxx while fixing a possible memleak. It took me some time to figure out what the code did. I noticed a lot of very, very long function bodies in LO code, and gendict was no exception. So I refactored the code, did some Google searches on gendict, and was able to fix the memleak. I ended up submitting only the fix, not the refactoring, because I didn't want to break any de facto coding style guidelines and I am still a fairly new contributor to LO.
I am submitting these patches now so you guys can decide whether to push them or not, in case they could save some other new contributor some time in understanding gendict ;) BTW, I tested the code on ja.dic and the output is still the same. -- Kenneth
From 7155a3675cf3410ab74dc7032bceaef719548d3c Mon Sep 17 00:00:00 2001 From: Kenneth Venken <kenneth.ven...@gmail.com> Date: Thu, 27 Jan 2011 22:27:24 +0100 Subject: [PATCH 1/8] added some documentation to gendict --- i18npool/source/breakiterator/gendict.cxx | 17 ++++++++++++++++- 1 files changed, 16 insertions(+), 1 deletions(-) diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index 9f49f67..8a6354b 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -41,7 +41,22 @@ using std::vector; using namespace ::rtl; -/* Main Procedure */ +/* Utility gendict: + + "BreakIterator_CJK provides input string caching and dictionary searching for + longest matching. You can provide a sorted dictionary (the encoding must be + UTF-8) by creating the following file: + i18npool/source/breakiterator/data/<language>.dict. + + The utility gendict will convert the file to C code, which will be compiled + into a shared library for dynamic loading. + + All dictionary searching and loading is performed in the xdictionary class. + The only thing you need to do is to derive your class from BreakIterator_CJK + and create an instance of the xdictionary with the language name and + pass it to the parent class." (from http://wiki.services.openoffice.org/wiki/ + /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011) +*/ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) { -- 1.7.1
From d44b67f81303f56246f93a6eb6419f371d5b215f Mon Sep 17 00:00:00 2001 From: Kenneth Venken <kenneth.ven...@gmail.com> Date: Thu, 27 Jan 2011 22:43:49 +0100 Subject: [PATCH 2/8] refactored out some simple print functions --- i18npool/source/breakiterator/gendict.cxx | 35 +++++++++++++++++++---------- 1 files changed, 23 insertions(+), 12 deletions(-) diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index 8a6354b..df7e144 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -58,6 +58,9 @@ using namespace ::rtl; /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011) */ +void printIncludes(FILE *source_fp); +void printFunctions(FILE *source_fp); + SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) { FILE *sfp, *cfp; @@ -79,12 +82,7 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) return -1; } - fprintf(cfp, "/*\n"); - fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n"); - fprintf(cfp, " * All Rights Reserved.\n"); - fprintf(cfp, " */\n\n"); - fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!! 
*/\n\n"); - fprintf(cfp, "#include <sal/types.h>\n\n"); + printIncludes(cfp); fprintf(cfp, "extern \"C\" {\n"); sal_Int32 count, i, j; @@ -209,12 +207,6 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) } fprintf (cfp, "\n};\n"); - // create function to return arrays - fprintf (cfp, "\tconst sal_uInt8* getExistMark() { return existMark; }\n"); - fprintf (cfp, "\tconst sal_Int16* getIndex1() { return index1; }\n"); - fprintf (cfp, "\tconst sal_Int32* getIndex2() { return index2; }\n"); - fprintf (cfp, "\tconst sal_Int32* getLenArray() { return lenArray; }\n"); - fprintf (cfp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n"); fprintf (cfp, "}\n"); fclose(sfp); @@ -223,4 +215,23 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) return 0; } // End of main +void printIncludes(FILE* source_fp) +{ + fprintf(source_fp, "/*\n"); + fprintf(source_fp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n"); + fprintf(source_fp, " * All Rights Reserved.\n"); + fprintf(source_fp, " */\n\n"); + fprintf(source_fp, "/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n"); + fprintf(source_fp, "#include <sal/types.h>\n\n"); +} + +void printFunctions(FILE* source_fp) +{ + fprintf (source_fp, "\tconst sal_uInt8* getExistMark() { return existMark; }\n"); + fprintf (source_fp, "\tconst sal_Int16* getIndex1() { return index1; }\n"); + fprintf (source_fp, "\tconst sal_Int32* getIndex2() { return index2; }\n"); + fprintf (source_fp, "\tconst sal_Int32* getLenArray() { return lenArray; }\n"); + fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n"); +} + /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ -- 1.7.1
From f7c0500f2f74b785b27d185eb533e49de6753160 Mon Sep 17 00:00:00 2001 From: Kenneth Venken <kenneth.ven...@gmail.com> Date: Thu, 27 Jan 2011 23:02:11 +0100 Subject: [PATCH 3/8] refactored out dataArea --- i18npool/source/breakiterator/gendict.cxx | 104 +++++++++++++++++------------ 1 files changed, 60 insertions(+), 44 deletions(-) diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index df7e144..eb654f5 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -59,6 +59,10 @@ using namespace ::rtl; */ void printIncludes(FILE *source_fp); +void initArrays(sal_Bool *exists, sal_Int32 *charArray); +void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i, + sal_Int32 lenArrayCurr, sal_Int32 *charArray, + vector<sal_Int32>& lenArray, sal_Bool *exists); void printFunctions(FILE *source_fp); SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) @@ -89,51 +93,9 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) sal_Int32 lenArrayCurr = 0, charArray[0x10000]; vector<sal_Int32> lenArray; sal_Bool exist[0x10000]; - for (i = 0; i < 0x10000; i++) { - exist[i] = sal_False; - charArray[i] = 0; - } - - // generate main dict. data array - fprintf(cfp, "static const sal_Unicode dataArea[] = {"); - sal_Char str[1024]; - sal_Unicode current = 0; - count = 0; - while (fgets(str, 1024, sfp)) { - // input file is in UTF-8 encoding - // don't convert last new line character to Ostr. 
- OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8); - const sal_Unicode *u = Ostr.getStr(); - - sal_Int32 len = Ostr.getLength(); - - i=0; - Ostr.iterateCodePoints(&i, 1); - if (len == i) continue; // skip one character word - - if (*u != current) { - if (*u < current) - printf("u %x, current %x, count %d, lenArray.size() %d\n", *u, current, - sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size())); - current = *u; - charArray[current] = lenArray.size(); - } + initArrays( exist, charArray ); - lenArray.push_back(lenArrayCurr); - - exist[u[0]] = sal_True; - for (i = 1; i < len; i++) { // start from second character, - exist[u[i]] = sal_True; // since the first character is captured in charArray. - lenArrayCurr++; - if ((count++) % 0x10 == 0) - fprintf(cfp, "\n\t"); - fprintf(cfp, "0x%04x, ", u[i]); - } - } - lenArray.push_back( lenArrayCurr ); // store last ending pointer - - charArray[current+1] = lenArray.size(); - fprintf(cfp, "\n};\n"); + printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist); // generate lenArray fprintf(cfp, "static const sal_Int32 lenArray[] = {\n\t"); @@ -215,6 +177,14 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) return 0; } // End of main +void initArrays(sal_Bool* exists, sal_Int32* charArray) +{ + for (sal_Int32 i = 0; i < 0x10000; i++) { + exists[i] = sal_False; + charArray[i] = 0; + } +} + void printIncludes(FILE* source_fp) { fprintf(source_fp, "/*\n"); @@ -234,4 +204,50 @@ void printFunctions(FILE* source_fp) fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n"); } +void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i, + sal_Int32 lenArrayCurr, sal_Int32 *charArray, + vector<sal_Int32>& lenArray, sal_Bool *exists) +{ + // generate main dict. 
data array + fprintf(cfp, "static const sal_Unicode dataArea[] = {"); + sal_Char str[1024]; + sal_Unicode current = 0; + count = 0; + while (fgets(str, 1024, sfp)) { + // input file is in UTF-8 encoding + // don't convert last new line character to Ostr. + OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8); + const sal_Unicode *u = Ostr.getStr(); + + sal_Int32 len = Ostr.getLength(); + + i=0; + Ostr.iterateCodePoints(&i, 1); + if (len == i) continue; // skip one character word + + if (*u != current) { + if (*u < current) + printf("u %x, current %x, count %d, lenArray.size() %d\n", *u, current, + sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size())); + current = *u; + charArray[current] = lenArray.size(); + } + + lenArray.push_back(lenArrayCurr); + + exists[u[0]] = sal_True; + for (i = 1; i < len; i++) { // start from second character, + exists[u[i]] = sal_True; // since the first character is captured in charArray. + lenArrayCurr++; + if ((count++) % 0x10 == 0) + fprintf(cfp, "\n\t"); + fprintf(cfp, "0x%04x, ", u[i]); + } + } + lenArray.push_back( lenArrayCurr ); // store last ending pointer + + charArray[current+1] = lenArray.size(); + fprintf(cfp, "\n};\n"); +} + /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ -- 1.7.1
From 1e5a7d4cd5e13895ce4db4f0fea85f8e77ffe876 Mon Sep 17 00:00:00 2001 From: Kenneth Venken <kenneth.ven...@gmail.com> Date: Thu, 27 Jan 2011 23:43:38 +0100 Subject: [PATCH 4/8] refactored out all array functions --- i18npool/source/breakiterator/gendict.cxx | 188 ++++++++++++++++------------- 1 files changed, 104 insertions(+), 84 deletions(-) diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index eb654f5..315acfa 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -60,9 +60,15 @@ using namespace ::rtl; void printIncludes(FILE *source_fp); void initArrays(sal_Bool *exists, sal_Int32 *charArray); -void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i, +void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i, sal_Int32 lenArrayCurr, sal_Int32 *charArray, vector<sal_Int32>& lenArray, sal_Bool *exists); +void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray, + sal_Int32 count); +void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count, + sal_Int16 *set); +void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set); +void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count); void printFunctions(FILE *source_fp); SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) @@ -86,89 +92,21 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) return -1; } - printIncludes(cfp); - fprintf(cfp, "extern \"C\" {\n"); - - sal_Int32 count, i, j; + sal_Int32 count, i; sal_Int32 lenArrayCurr = 0, charArray[0x10000]; vector<sal_Int32> lenArray; sal_Bool exist[0x10000]; - initArrays( exist, charArray ); - - printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist); - - // generate lenArray - fprintf(cfp, "static const sal_Int32 lenArray[] = {\n\t"); - count = 1; - fprintf(cfp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array. 
- for (size_t k = 0; k < lenArray.size(); k++) - { - fprintf(cfp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k])); - if (count == 0xf) - { - count = 0; - fprintf(cfp, "\n\t"); - } - else count++; - } - fprintf(cfp, "\n};\n"); - - // generate index1 array - fprintf (cfp, "static const sal_Int16 index1[] = {\n\t"); sal_Int16 set[0x100]; - count = 0; - for (i = 0; i < 0x100; i++) { - for (j = 0; j < 0x100; j++) - if (charArray[(i*0x100) + j] != 0) - break; - - fprintf(cfp, "0x%02x, ", set[i] = (j < 0x100 ? sal::static_int_cast<sal_Int16>(count++) : 0xff)); - if ((i+1) % 0x10 == 0) - fprintf (cfp, "\n\t"); - } - fprintf (cfp, "};\n"); - - // generate index2 array - fprintf (cfp, "static const sal_Int32 index2[] = {\n\t"); - sal_Int32 prev = 0; - for (i = 0; i < 0x100; i++) { - if (set[i] != 0xff) { - for (j = 0; j < 0x100; j++) { - sal_Int32 k = (i*0x100) + j; - if (prev != 0 && charArray[k] == 0) { - for (k++; k < 0x10000; k++) - if (charArray[k] != 0) - break; - } - prev = charArray[(i*0x100) + j]; - fprintf( - cfp, "0x%lx, ", - sal::static_int_cast< unsigned long >( - k < 0x10000 ? 
charArray[k] + 1 : 0)); - if ((j+1) % 0x10 == 0) - fprintf (cfp, "\n\t"); - } - fprintf (cfp, "\n\t"); - } - } - fprintf (cfp, "\n};\n"); - - // generate existMark array - count = 0; - fprintf (cfp, "static const sal_uInt8 existMark[] = {\n\t"); - for (i = 0; i < 0x1FFF; i++) { - sal_uInt8 bit = 0; - for (j = 0; j < 8; j++) - if (exist[i * 8 + j]) - bit |= 1 << j; - fprintf(cfp, "0x%02x, ", bit); - if (count == 0xf) { - count = 0; - fprintf(cfp, "\n\t"); - } else count++; - } - fprintf (cfp, "\n};\n"); + initArrays( exist, charArray ); + printIncludes(cfp); + fprintf(cfp, "extern \"C\" {\n"); + printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist); + printLenArray(cfp, lenArray, count); + printIndex1(cfp, charArray, count, set); + printIndex2(cfp, charArray, set); + printExistMark(cfp, exist, count); + printFunctions(cfp); fprintf (cfp, "}\n"); fclose(sfp); @@ -204,12 +142,12 @@ void printFunctions(FILE* source_fp) fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n"); } -void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i, +void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i, sal_Int32 lenArrayCurr, sal_Int32 *charArray, vector<sal_Int32>& lenArray, sal_Bool *exists) { // generate main dict. data array - fprintf(cfp, "static const sal_Unicode dataArea[] = {"); + fprintf(source_fp, "static const sal_Unicode dataArea[] = {"); sal_Char str[1024]; sal_Unicode current = 0; count = 0; @@ -240,14 +178,96 @@ void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i, exists[u[i]] = sal_True; // since the first character is captured in charArray. 
lenArrayCurr++; if ((count++) % 0x10 == 0) - fprintf(cfp, "\n\t"); - fprintf(cfp, "0x%04x, ", u[i]); + fprintf(source_fp, "\n\t"); + fprintf(source_fp, "0x%04x, ", u[i]); } } lenArray.push_back( lenArrayCurr ); // store last ending pointer charArray[current+1] = lenArray.size(); - fprintf(cfp, "\n};\n"); + fprintf(source_fp, "\n};\n"); +} + +void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray, + sal_Int32 count) +{ + fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t"); + count = 1; + fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array. + for (size_t k = 0; k < lenArray.size(); k++) + { + fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k])); + if (count == 0xf) + { + count = 0; + fprintf(source_fp, "\n\t"); + } + else count++; + } + fprintf(source_fp, "\n};\n"); +} + +void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count, + sal_Int16 *set) +{ + fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t"); + count = 0; + sal_Int32 j; + for (sal_Int32 i = 0; i < 0x100; i++) { + for (j = 0; j < 0x100; j++) + if (charArray[(i*0x100) + j] != 0) + break; + + fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? sal::static_int_cast<sal_Int16>(count++) : 0xff)); + if ((i+1) % 0x10 == 0) + fprintf (source_fp, "\n\t"); + } + fprintf (source_fp, "};\n"); +} + +void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set) +{ + fprintf (source_fp, "static const sal_Int32 index2[] = {\n\t"); + sal_Int32 prev = 0; + for (sal_Int32 i = 0; i < 0x100; i++) { + if (set[i] != 0xff) { + for (sal_Int32 j = 0; j < 0x100; j++) { + sal_Int32 k = (i*0x100) + j; + if (prev != 0 && charArray[k] == 0) { + for (k++; k < 0x10000; k++) + if (charArray[k] != 0) + break; + } + prev = charArray[(i*0x100) + j]; + fprintf( + source_fp, "0x%lx, ", + sal::static_int_cast< unsigned long >( + k < 0x10000 ? 
charArray[k] + 1 : 0)); + if ((j+1) % 0x10 == 0) + fprintf (source_fp, "\n\t"); + } + fprintf (source_fp, "\n\t"); + } + } + fprintf (source_fp, "\n};\n"); +} + +void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count) +{ + count = 0; + fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t"); + for (sal_Int32 i = 0; i < 0x1FFF; i++) { + sal_uInt8 bit = 0; + for (sal_Int32 j = 0; j < 8; j++) + if (exists[i * 8 + j]) + bit |= 1 << j; + fprintf(source_fp, "0x%02x, ", bit); + if (count == 0xf) { + count = 0; + fprintf(source_fp, "\n\t"); + } else count++; + } + fprintf (source_fp, "\n};\n"); } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ -- 1.7.1
From 8df201b3e266826392e6c59fb885d71cecbeeeb7 Mon Sep 17 00:00:00 2001 From: Kenneth Venken <kenneth.ven...@gmail.com> Date: Thu, 27 Jan 2011 23:52:52 +0100 Subject: [PATCH 5/8] reduced scope of some variables --- i18npool/source/breakiterator/gendict.cxx | 43 +++++++++++++---------------- 1 files changed, 19 insertions(+), 24 deletions(-) diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index 315acfa..93a359b 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -60,15 +60,12 @@ using namespace ::rtl; void printIncludes(FILE *source_fp); void initArrays(sal_Bool *exists, sal_Int32 *charArray); -void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i, - sal_Int32 lenArrayCurr, sal_Int32 *charArray, +void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 *charArray, vector<sal_Int32>& lenArray, sal_Bool *exists); -void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray, - sal_Int32 count); -void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count, - sal_Int16 *set); +void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray); +void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set); void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set); -void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count); +void printExistMark(FILE *source_fp, sal_Bool *exists); void printFunctions(FILE *source_fp); SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) @@ -92,7 +89,6 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) return -1; } - sal_Int32 count, i; sal_Int32 lenArrayCurr = 0, charArray[0x10000]; vector<sal_Int32> lenArray; sal_Bool exist[0x10000]; @@ -101,11 +97,11 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) printIncludes(cfp); fprintf(cfp, "extern \"C\" {\n"); - printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist); - printLenArray(cfp, lenArray, count); - 
printIndex1(cfp, charArray, count, set); + printDataArea(sfp, cfp, charArray, lenArray, exist); + printLenArray(cfp, lenArray); + printIndex1(cfp, charArray, set); printIndex2(cfp, charArray, set); - printExistMark(cfp, exist, count); + printExistMark(cfp, exist); printFunctions(cfp); fprintf (cfp, "}\n"); @@ -142,15 +138,16 @@ void printFunctions(FILE* source_fp) fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n"); } -void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i, - sal_Int32 lenArrayCurr, sal_Int32 *charArray, +void printDataArea(FILE *sfp, FILE *source_fp, + sal_Int32 *charArray, vector<sal_Int32>& lenArray, sal_Bool *exists) { // generate main dict. data array fprintf(source_fp, "static const sal_Unicode dataArea[] = {"); sal_Char str[1024]; + sal_Int32 lenArrayCurr = 0; sal_Unicode current = 0; - count = 0; + sal_Int32 count = 0; while (fgets(str, 1024, sfp)) { // input file is in UTF-8 encoding // don't convert last new line character to Ostr. @@ -159,7 +156,7 @@ void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i, sal_Int32 len = Ostr.getLength(); - i=0; + sal_Int32 i=0; Ostr.iterateCodePoints(&i, 1); if (len == i) continue; // skip one character word @@ -188,11 +185,10 @@ void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i, fprintf(source_fp, "\n};\n"); } -void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray, - sal_Int32 count) +void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray) { fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t"); - count = 1; + sal_Int32 count = 1; fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array. 
for (size_t k = 0; k < lenArray.size(); k++) { @@ -207,11 +203,10 @@ void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray, fprintf(source_fp, "\n};\n"); } -void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count, - sal_Int16 *set) +void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set) { fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t"); - count = 0; + sal_Int32 count = 0; sal_Int32 j; for (sal_Int32 i = 0; i < 0x100; i++) { for (j = 0; j < 0x100; j++) @@ -252,9 +247,9 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set) fprintf (source_fp, "\n};\n"); } -void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count) +void printExistMark(FILE *source_fp, sal_Bool *exists) { - count = 0; + sal_Int32 count = 0; fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t"); for (sal_Int32 i = 0; i < 0x1FFF; i++) { sal_uInt8 bit = 0; -- 1.7.1
From 905f8fecf95f73d5d20f27ac72ff07bc4c7ebb5c Mon Sep 17 00:00:00 2001 From: Kenneth Venken <kenneth.ven...@gmail.com> Date: Fri, 28 Jan 2011 00:14:53 +0100 Subject: [PATCH 6/8] readability changes --- i18npool/source/breakiterator/gendict.cxx | 106 +++++++++++++++-------------- 1 files changed, 55 insertions(+), 51 deletions(-) diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index 93a359b..3d0b627 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -41,6 +41,16 @@ using std::vector; using namespace ::rtl; +void printIncludes(FILE *source_fp); +void initArrays(sal_Bool *exists, sal_Int32 *charArray); +void printDataArea(FILE *dictionary_fp, FILE *source_fp, sal_Int32 *charArray, + vector<sal_Int32>& lenArray, sal_Bool *exists); +void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray); +void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set); +void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set); +void printExistMark(FILE *source_fp, sal_Bool *exists); +void printFunctions(FILE *source_fp); + /* Utility gendict: "BreakIterator_CJK provides input string caching and dictionary searching for @@ -58,58 +68,52 @@ using namespace ::rtl; /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011) */ -void printIncludes(FILE *source_fp); -void initArrays(sal_Bool *exists, sal_Int32 *charArray); -void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 *charArray, - vector<sal_Int32>& lenArray, sal_Bool *exists); -void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray); -void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set); -void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set); -void printExistMark(FILE *source_fp, sal_Bool *exists); -void printFunctions(FILE *source_fp); - SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) { - FILE *sfp, *cfp; + FILE *dictionary_fp, 
*source_fp; - if (argc < 3) exit(-1); + if (argc < 3) + { + printf("2 arguments required: dictionary_file_name source_file_name"); + exit(-1); + } - sfp = fopen(argv[1], "rb"); // open the source file for read; - if (sfp == NULL) + dictionary_fp = fopen(argv[1], "rb"); // open the source file for read; + if (dictionary_fp == NULL) { printf("Open the dictionary source file failed."); return -1; } // create the C source file to write - cfp = fopen(argv[2], "wb"); - if (cfp == NULL) { - fclose(sfp); + source_fp = fopen(argv[2], "wb"); + if (source_fp == NULL) { + fclose(dictionary_fp); printf("Can't create the C source file."); return -1; } - sal_Int32 lenArrayCurr = 0, charArray[0x10000]; + sal_Int32 charArray[0x10000]; vector<sal_Int32> lenArray; sal_Bool exist[0x10000]; sal_Int16 set[0x100]; initArrays( exist, charArray ); - printIncludes(cfp); - fprintf(cfp, "extern \"C\" {\n"); - printDataArea(sfp, cfp, charArray, lenArray, exist); - printLenArray(cfp, lenArray); - printIndex1(cfp, charArray, set); - printIndex2(cfp, charArray, set); - printExistMark(cfp, exist); - printFunctions(cfp); - fprintf (cfp, "}\n"); + printIncludes(source_fp); + fprintf(source_fp, "extern \"C\" {\n"); + printDataArea(dictionary_fp, source_fp, charArray, lenArray, exist); + printLenArray(source_fp, lenArray); + printIndex1(source_fp, charArray, set); + printIndex2(source_fp, charArray, set); + printExistMark(source_fp, exist); + printFunctions(source_fp); + fprintf (source_fp, "}\n"); - fclose(sfp); - fclose(cfp); + fclose(dictionary_fp); + fclose(source_fp); return 0; -} // End of main +} void initArrays(sal_Bool* exists, sal_Int32* charArray) { @@ -138,7 +142,7 @@ void printFunctions(FILE* source_fp) fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n"); } -void printDataArea(FILE *sfp, FILE *source_fp, +void printDataArea(FILE *dictionary_fp, FILE *source_fp, sal_Int32 *charArray, vector<sal_Int32>& lenArray, sal_Bool *exists) { @@ -148,7 +152,7 @@ void 
printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 lenArrayCurr = 0; sal_Unicode current = 0; sal_Int32 count = 0; - while (fgets(str, 1024, sfp)) { + while (fgets(str, 1024, dictionary_fp)) { // input file is in UTF-8 encoding // don't convert last new line character to Ostr. OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8); @@ -160,23 +164,23 @@ void printDataArea(FILE *sfp, FILE *source_fp, Ostr.iterateCodePoints(&i, 1); if (len == i) continue; // skip one character word - if (*u != current) { - if (*u < current) - printf("u %x, current %x, count %d, lenArray.size() %d\n", *u, current, - sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size())); - current = *u; - charArray[current] = lenArray.size(); + if (u[0] != current) { + if (u[0] < current) + printf("u %x, current %x, count %d, lenArray.size() %d\n", u[0], current, + sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size())); + current = u[0]; + charArray[current] = lenArray.size(); } lenArray.push_back(lenArrayCurr); exists[u[0]] = sal_True; for (i = 1; i < len; i++) { // start from second character, - exists[u[i]] = sal_True; // since the first character is captured in charArray. - lenArrayCurr++; - if ((count++) % 0x10 == 0) - fprintf(source_fp, "\n\t"); - fprintf(source_fp, "0x%04x, ", u[i]); + exists[u[i]] = sal_True; // since the first character is captured in charArray. + lenArrayCurr++; + if ((count++) % 0x10 == 0) + fprintf(source_fp, "\n\t"); + fprintf(source_fp, "0x%04x, ", u[i]); } } lenArray.push_back( lenArrayCurr ); // store last ending pointer @@ -210,12 +214,12 @@ void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set) sal_Int32 j; for (sal_Int32 i = 0; i < 0x100; i++) { for (j = 0; j < 0x100; j++) - if (charArray[(i*0x100) + j] != 0) - break; + if (charArray[(i*0x100) + j] != 0) + break; fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? 
sal::static_int_cast<sal_Int16>(count++) : 0xff)); if ((i+1) % 0x10 == 0) - fprintf (source_fp, "\n\t"); + fprintf (source_fp, "\n\t"); } fprintf (source_fp, "};\n"); } @@ -231,7 +235,7 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set) if (prev != 0 && charArray[k] == 0) { for (k++; k < 0x10000; k++) if (charArray[k] != 0) - break; + break; } prev = charArray[(i*0x100) + j]; fprintf( @@ -254,12 +258,12 @@ void printExistMark(FILE *source_fp, sal_Bool *exists) for (sal_Int32 i = 0; i < 0x1FFF; i++) { sal_uInt8 bit = 0; for (sal_Int32 j = 0; j < 8; j++) - if (exists[i * 8 + j]) - bit |= 1 << j; + if (exists[i * 8 + j]) + bit |= 1 << j; fprintf(source_fp, "0x%02x, ", bit); if (count == 0xf) { - count = 0; - fprintf(source_fp, "\n\t"); + count = 0; + fprintf(source_fp, "\n\t"); } else count++; } fprintf (source_fp, "\n};\n"); -- 1.7.1
From 4d317d550bde15112ac2e312636146d02b2d3e03 Mon Sep 17 00:00:00 2001 From: Kenneth Venken <kenneth.ven...@gmail.com> Date: Fri, 28 Jan 2011 00:29:14 +0100 Subject: [PATCH 7/8] changed some loop constructs --- i18npool/source/breakiterator/gendict.cxx | 16 +++++++--------- 1 files changed, 7 insertions(+), 9 deletions(-) diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index 3d0b627..90e6f75 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -211,11 +211,10 @@ void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set) { fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t"); sal_Int32 count = 0; - sal_Int32 j; for (sal_Int32 i = 0; i < 0x100; i++) { - for (j = 0; j < 0x100; j++) - if (charArray[(i*0x100) + j] != 0) - break; + sal_Int32 j = 0; + while( j < 0x100 && charArray[(i*0x100) + j] == 0) + j++; fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? sal::static_int_cast<sal_Int16>(count++) : 0xff)); if ((i+1) % 0x10 == 0) @@ -232,11 +231,10 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set) if (set[i] != 0xff) { for (sal_Int32 j = 0; j < 0x100; j++) { sal_Int32 k = (i*0x100) + j; - if (prev != 0 && charArray[k] == 0) { - for (k++; k < 0x10000; k++) - if (charArray[k] != 0) - break; - } + if (prev != 0 ) + while( charArray[k] == 0 && k < 0x10000 ) + k++; + prev = charArray[(i*0x100) + j]; fprintf( source_fp, "0x%lx, ", -- 1.7.1
From 8e9bff87e00f1324588323e3ff0a4e3779f6250f Mon Sep 17 00:00:00 2001 From: Kenneth Venken <kenneth.ven...@gmail.com> Date: Sun, 30 Jan 2011 00:00:38 +0100 Subject: [PATCH 8/8] more comments --- i18npool/source/breakiterator/gendict.cxx | 53 ++++++++++++++++------------- 1 files changed, 29 insertions(+), 24 deletions(-) diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index 90e6f75..1b70f23 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -93,10 +93,10 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) return -1; } - sal_Int32 charArray[0x10000]; - vector<sal_Int32> lenArray; - sal_Bool exist[0x10000]; + vector<sal_Int32> lenArray; // stores the word boundaries in DataArea sal_Int16 set[0x100]; + sal_Bool exist[0x10000]; // true if unicode character exists + sal_Int32 charArray[0x10000]; // keeps track where words beginning with a certain char are stored in DataArea initArrays( exist, charArray ); printIncludes(source_fp); @@ -142,8 +142,7 @@ void printFunctions(FILE* source_fp) fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n"); } -void printDataArea(FILE *dictionary_fp, FILE *source_fp, - sal_Int32 *charArray, +void printDataArea(FILE *dictionary_fp, FILE *source_fp, sal_Int32 *charArray, vector<sal_Int32>& lenArray, sal_Bool *exists) { // generate main dict. 
data array @@ -162,7 +161,8 @@ void printDataArea(FILE *dictionary_fp, FILE *source_fp, sal_Int32 i=0; Ostr.iterateCodePoints(&i, 1); - if (len == i) continue; // skip one character word + if (len == i) + continue; // skip one character word if (u[0] != current) { if (u[0] < current) @@ -184,7 +184,6 @@ void printDataArea(FILE *dictionary_fp, FILE *source_fp, } } lenArray.push_back( lenArrayCurr ); // store last ending pointer - charArray[current+1] = lenArray.size(); fprintf(source_fp, "\n};\n"); } @@ -207,6 +206,9 @@ void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray) fprintf(source_fp, "\n};\n"); } +/* FIXME?: what happens if in every range i there is at least one charArray != 0 + => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff } + => then in index2, the last range will be ignored incorrectly */ void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set) { fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t"); @@ -229,26 +231,28 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set) sal_Int32 prev = 0; for (sal_Int32 i = 0; i < 0x100; i++) { if (set[i] != 0xff) { - for (sal_Int32 j = 0; j < 0x100; j++) { - sal_Int32 k = (i*0x100) + j; - if (prev != 0 ) - while( charArray[k] == 0 && k < 0x10000 ) - k++; - - prev = charArray[(i*0x100) + j]; - fprintf( - source_fp, "0x%lx, ", - sal::static_int_cast< unsigned long >( - k < 0x10000 ? charArray[k] + 1 : 0)); - if ((j+1) % 0x10 == 0) + for (sal_Int32 j = 0; j < 0x100; j++) { + sal_Int32 k = (i*0x100) + j; + if (prev != 0 ) + while( charArray[k] == 0 && k < 0x10000 ) + k++; + + prev = charArray[(i*0x100) + j]; + fprintf( + source_fp, "0x%lx, ", + sal::static_int_cast< unsigned long >( + k < 0x10000 ? 
charArray[k] + 1 : 0)); + if ((j+1) % 0x10 == 0) + fprintf (source_fp, "\n\t"); + } fprintf (source_fp, "\n\t"); } - fprintf (source_fp, "\n\t"); - } } fprintf (source_fp, "\n};\n"); } +/* Generates a bitmask for the existance of sal_Unicode values in dictionary; + it packs 8 sal_Bool values in 1 sal_uInt8 */ void printExistMark(FILE *source_fp, sal_Bool *exists) { sal_Int32 count = 0; @@ -256,13 +260,14 @@ void printExistMark(FILE *source_fp, sal_Bool *exists) for (sal_Int32 i = 0; i < 0x1FFF; i++) { sal_uInt8 bit = 0; for (sal_Int32 j = 0; j < 8; j++) - if (exists[i * 8 + j]) - bit |= 1 << j; + bit |= (exists[i * 8 + j]) << j; + fprintf(source_fp, "0x%02x, ", bit); if (count == 0xf) { count = 0; fprintf(source_fp, "\n\t"); - } else count++; + } else + count++; } fprintf (source_fp, "\n};\n"); } -- 1.7.1
_______________________________________________ LibreOffice mailing list LibreOffice@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice