Hi, Alexander! On May 25, Alexander Barkov wrote: > Hi Sergei, > > For simplicity I decided to split: > > MDEV-8036 Fix all collations to compare broken bytes as "greater than > any non-broken character" > > into sub-tasks. > > Please review a patch for the first sub-task MDEV-8214.
I don't know why you wanted me to review it :) Looks pretty ok. Of course there were few comments and questions, but nothing serious. > diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c > index eda81c0..aa06a7a 100644 > --- a/strings/ctype-big5.c > +++ b/strings/ctype-big5.c > @@ -6853,11 +6771,29 @@ my_mb_wc_big5(CHARSET_INFO *cs > __attribute__((unused)), > } > > > -static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler = > +#undef MY_FUNCTION_NAME > +#undef WEIGHT_MB1 > +#undef WEIGHT_MB2 why do you start from undef-ing macros that couldn't be defined here? > +#define MY_FUNCTION_NAME(x) my_ ## x ## _big5_chinese_ci > +#define WEIGHT_MB1(x) (sort_order_big5[(uchar) (x)]) > +#define WEIGHT_MB2(x,y) (big5code(x, y)) > +#include "ctype-strcoll.ic" > + > + > +#undef MY_FUNCTION_NAME > +#undef WEIGHT_MB1 > +#undef WEIGHT_MB2 alternatively you could undef them at the end of ctype-strcoll.ic it'll also be a protection against including this file twice (although it cannot be done now anyway) > +#define MY_FUNCTION_NAME(x) my_ ## x ## _big5_bin > +#define WEIGHT_MB1(x) ((uchar) (x)) > +#define WEIGHT_MB2(x,y) (big5code(x, y)) > +#include "ctype-strcoll.ic" > + > + > +static MY_COLLATION_HANDLER my_collation_handler_big5_chinese_ci= > { > NULL, /* init */ > - my_strnncoll_big5, > - my_strnncollsp_big5, > + my_strnncoll_big5_chinese_ci, > + my_strnncollsp_big5_chinese_ci, > my_strnxfrm_big5, > my_strnxfrmlen_simple, > my_like_range_mb, > diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c > index a2c95bf..db47b3d 100644 > --- a/strings/ctype-euc_kr.c > +++ b/strings/ctype-euc_kr.c > @@ -9938,21 +9940,56 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs > __attribute__((unused)), > } > > > -static MY_COLLATION_HANDLER my_collation_ci_handler = > +#undef MY_FUNCTION_NAME > +#undef WEIGHT_MB1 > +#undef WEIGHT_MB2 > +#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_korean_ci > +#define WEIGHT_MB1(x) (sort_order_euc_kr[(uchar) (x)]) > +#define WEIGHT_MB2(x,y) (euckrcode(x, y)) > +#include "ctype-strcoll.ic" > + > + > +#undef MY_FUNCTION_NAME > +#undef WEIGHT_MB1 > +#undef WEIGHT_MB2 > +#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_bin > +#define WEIGHT_MB1(x) ((uchar) (x)) > +#define WEIGHT_MB2(x,y) (euckrcode(x, y)) > +#include "ctype-strcoll.ic" > + > + > +static MY_COLLATION_HANDLER my_collation_handler_euckr_korean_ci= > { > - NULL, /* init */ > - my_strnncoll_simple, /* strnncoll */ > - my_strnncollsp_simple, > - my_strnxfrm_mb, /* strnxfrm */ > + NULL, /* init */ > + my_strnncoll_euckr_korean_ci, > + my_strnncollsp_euckr_korean_ci, hmm, so is euckr multi-byte or not? wikipedia says it is, my_strnxfrm_mb. but why my_strnncoll_simple? > + my_strnxfrm_mb, > my_strnxfrmlen_simple, > - my_like_range_mb, /* like_range */ > - my_wildcmp_mb, /* wildcmp */ > + my_like_range_mb, > + my_wildcmp_mb, > my_strcasecmp_mb, > my_instr_mb, > my_hash_sort_simple, > my_propagate_simple > }; > > + > +static MY_COLLATION_HANDLER my_collation_handler_euckr_bin= > +{ > + NULL, /* init */ > + my_strnncoll_euckr_bin, > + my_strnncollsp_euckr_bin, > + my_strnxfrm_mb, > + my_strnxfrmlen_simple, > + my_like_range_mb, > + my_wildcmp_mb_bin, > + my_strcasecmp_mb_bin, > + my_instr_mb, > + my_hash_sort_mb_bin, > + my_propagate_simple > +}; > + > + > static MY_CHARSET_HANDLER my_charset_handler= > { > NULL, /* init */ > diff --git a/strings/ctype-strcoll.ic b/strings/ctype-strcoll.ic > new file mode 100644 > index 0000000..7217f99 > --- /dev/null > +++ b/strings/ctype-strcoll.ic > @@ -0,0 +1,208 @@ > +/* > + Copyright (c) 2015, MariaDB Foundation > + > + This program is free software; you can redistribute it and/or modify > + it under the terms of the GNU General Public License as published by > + the Free Software Foundation; version 2 of the License. > + > + This program is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + GNU General Public License for more details. > + > + You should have received a copy of the GNU General Public License > + along with this program; if not, write to the Free Software > + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA > +*/ > + > + > +#ifndef MY_FUNCTION_NAME > +#error MY_FUNCTION_NAME is not defined > +#endif > + > + > +/* > + The weight for automatically padded spaces when comparing strings with > + the PAD SPACE property. > + Should normally be equal to the weight of a regular space. it is not a space for sjis and cp932? what does that mean? padding space is not the same as a regular space? why? > +*/ > +#ifndef WEIGHT_PAD_SPACE > +#define WEIGHT_PAD_SPACE (' ') > +#endif > + > + > +/* > + Weight of an illegal byte. > + Must be greater than weight of any normal character. > + Two bad bytes are compared binary. > +*/ > +#ifndef WEIGHT_ILSEQ why #ifndef? it creates an impression that the caller (includer :) can redefine it. better define WEIGHT_ILSEQ here unconditionally and undefine it at the end > +#define WEIGHT_ILSEQ(x) (0xFF00 + (x)) > +#endif > + > + > +/** > + Scan a valid character, or a bad byte, or an auto-padded space > + from a string and calculate the weight of the scanned sequence. > + > + @param [OUT] weight - the weight is returned here > + @param str - the string > + @param end - the end of the string > + @return - the number of bytes scanned > + > + The including source file must define the following macros: > + IS_MB1_CHAR(x) > + IS_MB2_CHAR(x,y) > + WEIGHT_PAD_SPACE > + WEIGHT_MB1(x) > + WEIGHT_MB2(x,y) > + WEIGHT_ILSEQ(x) > +*/ > +static inline uint > +MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar > *end) > +{ > + if (str >= end) > + { > + *weight= WEIGHT_PAD_SPACE; > + return 0; > + } > + > + if (IS_MB1_CHAR(*str)) > + { > + *weight= WEIGHT_MB1(*str); /* A valid single byte character*/ > + return 1; > + } > + > + if (str + 2 > end) /* The string ended unexpectedly */ > + goto bad; /* Treat as a bad byte */ > + > + if (IS_MB2_CHAR(str[0], str[1])) > + { > + *weight= WEIGHT_MB2(str[0], str[1]); > + return 2; /* A valid two-byte character */ > + } > + > +bad: > + *weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */ > + return 1; > +} > + > + > +/** > + Compare two strings according to the collation, > + without handling the PAD SPACE property. > + > + Note, cs->coll->strnncoll() is usually used to compare identifiers. > + Perhaps we should eventually (in 10.2?) create a new collation > + my_charset_utf8_general_ci_no_pad and have only one comparison function > + in MY_COLLATION_HANDLER. > + > + @param cs - the character set and collation > + @param a - the left string > + @param a_length - the length of the left string > + @param b - the right string > + @param b_length - the length of the right string > + @param b_is_prefix - if the caller wants to check if "b" is a prefix of "a" > + @return - the comparison result > +*/ > +static int > +MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)), > + const uchar *a, size_t a_length, > + const uchar *b, size_t b_length, > + my_bool b_is_prefix) > +{ > + const uchar *a_end= a + a_length; > + const uchar *b_end= b + b_length; > + for ( ; ; ) > + { > + int a_weight, b_weight, res; > + uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); > + uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); > + /* > + a_wlen b_wlen Comment > + ------ ------ ------- > + 0 0 Strings ended simultaneously, "a" and "b" are equal. > + 0 >0 "a" is a prefix of "b", so "a" is smaller. > + >0 0 "b" is a prefix of "a", check b_is_prefix. > + >0 >0 Two weights were scanned, check weight difference. > + */ > + if (!a_wlen) > + return b_wlen ? -b_weight : 0; can weight be negative? if not, the code will be clearer if you declare it unsigned > + > + if (!b_wlen) > + return b_is_prefix ? 0 : a_weight; > + > + if ((res= (a_weight - b_weight))) > + return res; > + /* > + None of the strings has ended yet. > + */ > + DBUG_ASSERT(a < a_end); > + DBUG_ASSERT(b < b_end); > + a+= a_wlen; > + b+= b_wlen; > + } > + DBUG_ASSERT(0); > + return 0; > +} > + > + > +/** > + Compare two strings according to the collation, with PAD SPACE handling. > + > + @param cs - the character set and collation > + @param a - the left string > + @param a_length - the length of the left string > + @param b - the right string > + @param b_length - the length of the right string > + @param diff_if_only_endspace_difference - not used in the code. > + TODO: this should be eventually removed (in 10.2?) why not now (in a separate changeset, but in 10.1 still)? > + @return - the comparison result > +*/ > + > +static int > +MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), > + const uchar *a, size_t a_length, > + const uchar *b, size_t b_length, > + my_bool diff_if_only_endspace_difference > + __attribute__((unused))) > +{ > + const uchar *a_end= a + a_length; > + const uchar *b_end= b + b_length; > + for ( ; ; ) > + { > + int a_weight, b_weight, res; > + uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); > + uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); > + if ((res= (a_weight - b_weight))) > + { > + /* > + Got two different weights. Each weight can be generated by either of: > + - a real character > + - a bad byte sequence or an incomplete byte sequence > + - an auto-generated trailing space (PAD SPACE) > + It does not matter how exactly each weight was generated. > + Just return the weight difference. > + */ > + return res; > + } > + if (!a_wlen && !b_wlen) > + { > + /* > + Got two auto-generated trailing spaces, i.e. > + both strings have now ended, so they are equal. > + */ > + DBUG_ASSERT(a == a_end); > + DBUG_ASSERT(b == b_end); > + return 0; > + } > + /* > + At least one of the strings has not ended yet, continue comparison. > + */ > + DBUG_ASSERT(a < a_end || b < b_end); > + a+= a_wlen; > + b+= b_wlen; > + } > + DBUG_ASSERT(0); > + return 0; > +} Regards, Sergei _______________________________________________ Mailing list: https://launchpad.net/~maria-developers Post to : maria-developers@lists.launchpad.net Unsubscribe : https://launchpad.net/~maria-developers More help : https://help.launchpad.net/ListHelp