utf-test.c

Evgeny Kotkov Mon, 07 Mar 2016 09:00:07 -0800

Branko Čibej <br...@apache.org> writes:

> The big question here is what we'll use the API for. Currently we have a
> 'normalize' function that's used by svn_fs_verify (IIRC). Since we're
> talking about a function that transforms a UTF-8 string to a shape
> suitable for stuff-insensitive comparison, we could follow the example
> of the standard strxfrm() -> svn_utf__xfrm(); but if that's too ugly, my
> preference is for svn_utf__fold().
>
> However, I'd not add arguments for normalization/case folding/etc; I'd
> just make this function DTRT without any additional flags, because
> otherwise we'll always be second-guessing the correct invocation.


One use case that I keep in mind is doing server-side search or filtering,
where a client tells the server what kind of comparison and matching she
expects to get.

The strxfrm() function doesn't define the transformation in terms of
preserving case or diacritical marks.  Hence, we can't have svn_utf__xfrm()
doing the right thing for svn log --search, as that would mean that a
libsvn_subr function controls the behavior of the command-line client.
And while a private function somewhere around svn.c could be doing that,
hardcoding this kind of behavior in libsvn_subr doesn't sound proper to me.

We can drop the `normalize' argument, since keeping denormalized strings
around is dangerous and unnecessary, but I'd leave the other two and let the
caller specify the wanted behavior:

    svn_error_t *
    svn_utf__xfrm(const char **result,
                  const char *str,
                  apr_size_t len,
                  svn_boolean_t case_insensitive,
                  svn_boolean_t accent_insensitive,
                  svn_membuf_t *buf);

I attached the patch that does that.  What do you think?


Regards,
Evgeny Kotkov

Index: subversion/include/private/svn_utf_private.h
===================================================================
--- subversion/include/private/svn_utf_private.h        (revision 1733434)
+++ subversion/include/private/svn_utf_private.h        (working copy)
@@ -150,22 +150,27 @@ svn_utf__normalize(const char **result,
                    const char *str, apr_size_t len,
                    svn_membuf_t *buf);
 
-/* Normalize the UTF-8 string STR to form C and remove case distinctions
- * with Unicode's Default Caseless Matching algorithm. Use BUF as a
- * temporary storage. If LEN is SVN_UTF__UNKNOWN_LENGTH, assume STR
- * is null-terminated; otherwise, consider the string only up to the
- * given length.
+/* Transform the UTF-8 string to a shape suitable for comparison with
+ * strcmp(). The transformation is defined by CASE_INSENSITIVE and
+ * ACCENT_INSENSITIVE arguments. If CASE_INSENSITIVE is non-zero,
+ * remove case distinctions from the string. If ACCENT_INSENSITIVE
+ * is non-zero, remove diacritical marks from the string.
  *
- * Return the resulting string in *RESULT, which shares storage with
- * BUF and is valid only until the next time BUF is modified.
+ * Use BUF as a temporary storage. If LEN is SVN_UTF__UNKNOWN_LENGTH,
+ * assume STR is null-terminated; otherwise, consider the string only
+ * up to the given length. Place the transformed string in *RESULT, which
+ * shares storage with BUF and is valid only until the next time BUF is
+ * modified.
  *
  * A returned error may indicate that STRING contains invalid UTF-8 or
  * invalid Unicode codepoints.
  */
 svn_error_t *
-svn_utf__casefold(const char **result,
-                  const char *str, apr_size_t len,
-                  svn_membuf_t *buf);
+svn_utf__xfrm(const char **result,
+              const char *str, apr_size_t len,
+              svn_boolean_t case_insensitive,
+              svn_boolean_t accent_insensitive,
+              svn_membuf_t *buf);
 
 /* Check if STRING is a valid, NFC-normalized UTF-8 string.  Note that
  * a FALSE return value may indicate that STRING is not valid UTF-8 at
Index: subversion/libsvn_subr/utf8proc.c
===================================================================
--- subversion/libsvn_subr/utf8proc.c   (revision 1733434)
+++ subversion/libsvn_subr/utf8proc.c   (working copy)
@@ -127,7 +127,8 @@ decompose_normalized(apr_size_t *result_length,
  * of UTF-8 characters.
  *
  * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for
- * case-insensitive string comparison.
+ * case-insensitive string comparison. If STRIPMARK is non-zero, strip
+ * all diacritical marks (e.g., accents) from the string.
  *
  * A returned error may indicate that STRING contains invalid UTF-8 or
  * invalid Unicode codepoints. Any error message comes from utf8proc.
@@ -136,10 +137,19 @@ static svn_error_t *
 normalize_cstring(apr_size_t *result_length,
                   const char *string, apr_size_t length,
                   svn_boolean_t casefold,
+                  svn_boolean_t stripmark,
                   svn_membuf_t *buffer)
 {
-  ssize_t result = unicode_decomposition(casefold ? UTF8PROC_CASEFOLD : 0,
-                                         string, length, buffer);
+  int flags = 0;
+  ssize_t result;
+
+  if (casefold)
+    flags |= UTF8PROC_CASEFOLD;
+
+  if (stripmark)
+    flags |= UTF8PROC_STRIPMARK;
+
+  result = unicode_decomposition(flags, string, length, buffer);
   if (result >= 0)
     {
       svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
@@ -207,18 +217,21 @@ svn_utf__normalize(const char **result,
                    svn_membuf_t *buf)
 {
   apr_size_t result_length;
-  SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, buf));
+  SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, FALSE, buf));
   *result = (const char*)(buf->data);
   return SVN_NO_ERROR;
 }
 
 svn_error_t *
-svn_utf__casefold(const char **result,
-                  const char *str, apr_size_t len,
-                  svn_membuf_t *buf)
+svn_utf__xfrm(const char **result,
+              const char *str, apr_size_t len,
+              svn_boolean_t case_insensitive,
+              svn_boolean_t accent_insensitive,
+              svn_membuf_t *buf)
 {
   apr_size_t result_length;
-  SVN_ERR(normalize_cstring(&result_length, str, len, TRUE, buf));
+  SVN_ERR(normalize_cstring(&result_length, str, len,
+                            case_insensitive, accent_insensitive, buf));
   *result = (const char*)(buf->data);
   return SVN_NO_ERROR;
 }
@@ -375,7 +388,8 @@ svn_utf__is_normalized(const char *string, apr_poo
   apr_size_t result_length;
   const apr_size_t length = strlen(string);
   svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
-  err = normalize_cstring(&result_length, string, length, FALSE, &buffer);
+  err = normalize_cstring(&result_length, string, length,
+                          FALSE, FALSE, &buffer);
   if (err)
     {
       svn_error_clear(err);
Index: subversion/svn/log-cmd.c
===================================================================
--- subversion/svn/log-cmd.c    (revision 1733434)
+++ subversion/svn/log-cmd.c    (working copy)
@@ -112,14 +112,14 @@ display_diff(const svn_log_entry_t *log_entry,
 }
 
 /* Return TRUE if STR matches PATTERN. Else, return FALSE. Assumes that
- * PATTERN is a UTF-8 string normalized to form C with case folding
- * applied. Use BUF for temporary allocations. */
+ * PATTERN is a UTF-8 string prepared for case- and accent-insensitive
+ * comparison via svn_utf__xfrm(). Use BUF for temporary allocations. */
 static svn_boolean_t
 match(const char *pattern, const char *str, svn_membuf_t *buf)
 {
   svn_error_t *err;
 
-  err = svn_utf__casefold(&str, str, strlen(str), buf);
+  err = svn_utf__xfrm(&str, str, strlen(str), TRUE, TRUE, buf);
   if (err)
     {
       /* Can't match invalid data. */
Index: subversion/svn/svn.c
===================================================================
--- subversion/svn/svn.c        (revision 1733434)
+++ subversion/svn/svn.c        (working copy)
@@ -2397,8 +2397,8 @@ sub_main(int *exit_code, int argc, const char *arg
         break;
       case opt_search:
         SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool));
-        SVN_ERR(svn_utf__casefold(&utf8_opt_arg, utf8_opt_arg,
-                                  strlen(utf8_opt_arg), &buf));
+        SVN_ERR(svn_utf__xfrm(&utf8_opt_arg, utf8_opt_arg,
+                              strlen(utf8_opt_arg), TRUE, TRUE, &buf));
         add_search_pattern_group(&opt_state,
                                  apr_pstrdup(pool, utf8_opt_arg),
                                  pool);
@@ -2405,8 +2405,8 @@ sub_main(int *exit_code, int argc, const char *arg
         break;
       case opt_search_and:
         SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool));
-        SVN_ERR(svn_utf__casefold(&utf8_opt_arg, utf8_opt_arg,
-                                  strlen(utf8_opt_arg), &buf));
+        SVN_ERR(svn_utf__xfrm(&utf8_opt_arg, utf8_opt_arg,
+                              strlen(utf8_opt_arg), TRUE, TRUE, &buf));
         add_search_pattern_to_latest_group(&opt_state,
                                            apr_pstrdup(pool, utf8_opt_arg),
                                            pool);
Index: subversion/tests/libsvn_subr/utf-test.c
===================================================================
--- subversion/tests/libsvn_subr/utf-test.c     (revision 1733434)
+++ subversion/tests/libsvn_subr/utf-test.c     (working copy)
@@ -898,87 +898,76 @@ test_utf_normalize(apr_pool_t *pool)
 
 
 static svn_error_t *
-test_utf_casefold(apr_pool_t *pool)
+test_utf_xfrm(apr_pool_t *pool)
 {
-  /* Normalized: NFC */
-  static const char nfc[] =
-    "\xe1\xb9\xa8"              /* S with dot above and below */
-    "\xc5\xaf"                  /* u with ring */
-    "\xe1\xb8\x87"              /* b with macron below */
-    "\xe1\xb9\xbd"              /* v with tilde */
-    "\xe1\xb8\x9d"              /* e with breve and cedilla */
-    "\xc8\x91"                  /* r with double grave */
-    "\xc5\xa1"                  /* s with caron */
-    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
-    "\xe1\xbb\x9d"              /* o with grave and hook */
-    "\xe1\xb9\x8b";             /* n with circumflex below */
+  const char *str;
+  const char *result;
+  svn_membuf_t buf;
 
-  /* Normalized: NFC, case folded */
-  static const char nfc_casefold[] =
-    "\xe1\xb9\xa9"              /* s with dot above and below */
-    "\xc5\xaf"                  /* u with ring */
-    "\xe1\xb8\x87"              /* b with macron below */
-    "\xe1\xb9\xbd"              /* v with tilde */
-    "\xe1\xb8\x9d"              /* e with breve and cedilla */
-    "\xc8\x91"                  /* r with double grave */
-    "\xc5\xa1"                  /* s with caron */
-    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
-    "\xe1\xbb\x9d"              /* o with grave and hook */
-    "\xe1\xb9\x8b";             /* n with circumflex below */
+  svn_membuf__create(&buf, 0, pool);
 
-  /* Normalized: NFD */
-  static const char nfd[] =
-    "S\xcc\xa3\xcc\x87"         /* S with dot above and below */
-    "u\xcc\x8a"                 /* u with ring */
-    "b\xcc\xb1"                 /* b with macron below */
-    "v\xcc\x83"                 /* v with tilde */
-    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
-    "r\xcc\x8f"                 /* r with double grave */
-    "s\xcc\x8c"                 /* s with caron */
-    "i\xcc\x88\xcc\x81"         /* i with diaeresis and acute */
-    "o\xcc\x9b\xcc\x80"         /* o with grave and hook */
-    "n\xcc\xad";                /* n with circumflex below */
+  /* ASCII string */
+  str = "Subversion";
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Subversion");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "subversion");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Subversion");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "subversion");
 
-  /* Mixed, denormalized */
-  static const char mixup[] =
-    "S\xcc\x87\xcc\xa3"         /* S with dot above and below */
-    "\xc5\xaf"                  /* u with ring */
-    "b\xcc\xb1"                 /* b with macron below */
-    "\xe1\xb9\xbd"              /* v with tilde */
-    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
-    "\xc8\x91"                  /* r with double grave */
-    "s\xcc\x8c"                 /* s with caron */
-    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
-    "o\xcc\x80\xcc\x9b"         /* o with grave and hook */
-    "\xe1\xb9\x8b";             /* n with circumflex below */
+  /* M (u with diaeresis) (sharp s) en */
+  str = "M" "\xc3\xbc" "\xc3\x9f" "en";
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "M" "\xc3\xbc" "\xc3\x9f" "en");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "m" "\xc3\xbc" "ssen");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Mu" "\xc3\x9f" "en");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "mussen");
 
-  /* Invalid UTF-8 */
-  static const char invalid[] =
-    "\xe1\xb9\xa8"              /* S with dot above and below */
-    "\xc5\xaf"                  /* u with ring */
-    "\xe1\xb8\x87"              /* b with macron below */
-    "\xe1\xb9\xbd"              /* v with tilde */
-    "\xe1\xb8\x9d"              /* e with breve and cedilla */
-    "\xc8\x91"                  /* r with double grave */
-    "\xc5\xa1"                  /* s with caron */
-    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
-    "\xe6"                      /* Invalid byte */
-    "\xe1\xb9\x8b";             /* n with circumflex below */
+  /* Na (i with diaeresis) vet (e with acute), decomposed */
+  str = "Nai" "\xcc\x88" "vete" "\xcc\x81";
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Na" "\xc3\xaf" "vet" "\xc3\xa9");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "na" "\xc3\xaf" "vet" "\xc3\xa9");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Naivete");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "naivete");
 
-  const char *result;
-  svn_membuf_t buf;
+  /* (I with dot above) stanbul */
+  str = "\xc4\xb0" "stanbul";
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "\xc4\xb0" "stanbul");
 
-  svn_membuf__create(&buf, 0, pool);
-  SVN_ERR(svn_utf__casefold(&result, nfc, strlen(nfc), &buf));
-  SVN_TEST_STRING_ASSERT(result, nfc_casefold);
-  SVN_ERR(svn_utf__casefold(&result, nfd, strlen(nfd), &buf));
-  SVN_TEST_STRING_ASSERT(result, nfc_casefold);
-  SVN_ERR(svn_utf__casefold(&result, mixup, strlen(mixup), &buf));
-  SVN_TEST_STRING_ASSERT(result, nfc_casefold);
+  /* The Latin Capital Letter I with Dot Above (0130) should fold into
+     Latin Small Letter I (0069) with Combining Dot Above (0307) per full
+     mapping in http://www.unicode.org/Public/UNIDATA/CaseFolding.txt */
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "i" "\xcc\x87" "stanbul");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Istanbul");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "istanbul");
 
-  SVN_TEST_ASSERT_ERROR(svn_utf__casefold(&result, invalid, strlen(invalid),
-                                          &buf),
+  /* Invalid UTF-8 */
+  str = "a" "\xe6" "bc";
+  SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+                                      FALSE, FALSE, &buf),
                         SVN_ERR_UTF8PROC_ERROR);
+  SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+                                      TRUE, FALSE, &buf),
+                        SVN_ERR_UTF8PROC_ERROR);
+  SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+                                      FALSE, TRUE, &buf),
+                        SVN_ERR_UTF8PROC_ERROR);
+  SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+                                      TRUE, TRUE, &buf),
+                        SVN_ERR_UTF8PROC_ERROR);
 
   return SVN_NO_ERROR;
 }
@@ -1011,8 +1000,8 @@ static struct svn_test_descriptor_t test_funcs[] =
                    "test svn_utf__utf{16,32}_to_utf8"),
     SVN_TEST_PASS2(test_utf_normalize,
                    "test svn_utf__normalize"),
-    SVN_TEST_PASS2(test_utf_casefold,
-                   "test svn_utf__casefold"),
+    SVN_TEST_PASS2(test_utf_xfrm,
+                   "test svn_utf__xfrm"),
     SVN_TEST_NULL
   };

Re: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Reply via email to