Hello,

I have come across a strange situation when using a Unicode string
that has non-normalized characters. The attached script 'initcap.sql'
reproduces the problem.

The attached patch fixes the issue.

Regards,

Juan José Santamaría Flecha

Attachment: initcap.sql
Description: application/sql

diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 755ca6e..9f8becf 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -96,6 +96,7 @@
 #include "utils/memutils.h"
 #include "utils/numeric.h"
 #include "utils/pg_locale.h"
+#include "common/unicode_norm.h"
 
 /* ----------
  * Routines type
@@ -1864,7 +1865,8 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 							workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt);
 						else
 							workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt);
-						wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt);
+						if (!is_pg_wchar_combining(workspace[curr_char]))
+							wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt);
 					}
 					else
 #endif
@@ -1873,7 +1875,8 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 							workspace[curr_char] = towlower(workspace[curr_char]);
 						else
 							workspace[curr_char] = towupper(workspace[curr_char]);
-						wasalnum = iswalnum(workspace[curr_char]);
+						if (!is_pg_wchar_combining(workspace[curr_char]))
+							wasalnum = iswalnum(workspace[curr_char]);
 					}
 				}
 
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 89c5533..25b149b 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -435,3 +435,14 @@ unicode_normalize_kc(const pg_wchar *input)
 
 	return recomp_chars;
 }
+
+/*
+ * True only if get_code_entry() finds an entry with a non-zero comb_class.
+ */
+bool
+is_pg_wchar_combining(const pg_wchar current)
+{
+	pg_unicode_decomposition *entry = get_code_entry(current);
+
+	return entry != NULL && entry->comb_class != 0x0;
+}
diff --git a/src/include/common/unicode_norm.h b/src/include/common/unicode_norm.h
index 99167d2..bdcf02e 100644
--- a/src/include/common/unicode_norm.h
+++ b/src/include/common/unicode_norm.h
@@ -17,5 +17,6 @@
 #include "mb/pg_wchar.h"
 
 extern pg_wchar *unicode_normalize_kc(const pg_wchar *input);
+extern bool is_pg_wchar_combining(const pg_wchar current);
 
 #endif							/* UNICODE_NORM_H */

Reply via email to