Hello, I have come across a strange situation when using initcap() on a Unicode string that contains non-normalized characters. The attached script 'initcap.sql' reproduces the problem.
The attached patch fixes the issue. Regards, Juan José Santamaría Flecha
initcap.sql
Description: application/sql
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 755ca6e..9f8becf 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -96,6 +96,7 @@ #include "utils/memutils.h" #include "utils/numeric.h" #include "utils/pg_locale.h" +#include "common/unicode_norm.h" /* ---------- * Routines type @@ -1864,7 +1865,8 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt); else workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt); - wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt); + if (!is_pg_wchar_combining(workspace[curr_char])) + wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt); } else #endif @@ -1873,7 +1875,8 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) workspace[curr_char] = towlower(workspace[curr_char]); else workspace[curr_char] = towupper(workspace[curr_char]); - wasalnum = iswalnum(workspace[curr_char]); + if (!is_pg_wchar_combining(workspace[curr_char])) + wasalnum = iswalnum(workspace[curr_char]); } } diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c index 89c5533..25b149b 100644 --- a/src/common/unicode_norm.c +++ b/src/common/unicode_norm.c @@ -435,3 +435,14 @@ unicode_normalize_kc(const pg_wchar *input) return recomp_chars; } + +bool +is_pg_wchar_combining(const pg_wchar current) +{ + pg_unicode_decomposition *currEntry = get_code_entry(current); + if (currEntry == NULL) + return false; + if (currEntry->comb_class == 0x0) + return false; + return true; +} diff --git a/src/include/common/unicode_norm.h b/src/include/common/unicode_norm.h index 99167d2..bdcf02e 100644 --- a/src/include/common/unicode_norm.h +++ b/src/include/common/unicode_norm.h @@ -17,5 +17,6 @@ #include "mb/pg_wchar.h" extern pg_wchar *unicode_normalize_kc(const pg_wchar *input); +extern bool is_pg_wchar_combining(const pg_wchar current); #endif /* 
UNICODE_NORM_H */