The implementation of the Unicode rules (LB15a) and (LB15b) from 2024-01-30
was not correct: it caused several test failures with Unicode.org's
LineBreakTest.txt. This patch fixes it.


2024-09-15  Bruno Haible  <br...@clisp.org>

        unilbrk: Fix bugs in implementation of Unicode rules (LB15a), (LB15b).
        * lib/gen-uni-tables.c (output_lbrk_rules_as_tables): Fix typo in
        comment.
        * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks_loop):
        Do the LBP_QU2 to LBP_QU1 mapping for (LB15a) after the table lookup but
        before the assignment to last_prop. Do the LBP_QU3 to LBP_QU1 mapping
        for (LB15b) in a way that does not influence prev_prop or last_prop.
        * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks_loop):
        Likewise.
        * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks_loop):
        Likewise.

diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index 0ea9de9f28..6ac81d0821 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -9424,7 +9424,7 @@ output_lbrk_rules_as_tables (const char *filename, const char *version)
                        set_table_cell (prohibited_with_sp, true);
     }
 
-  /* (LB15a) Do not break before an ambiguous quotation that is an initial
+  /* (LB15a) Do not break after an ambiguous quotation that is an initial
      punctuation, even after spaces.  */
   for (after = 0; after < NLBP; after++)
     {
diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c
index e327575a34..d2d6343234 100644
--- a/lib/unilbrk/u16-possible-linebreaks.c
+++ b/lib/unilbrk/u16-possible-linebreaks.c
@@ -158,51 +158,6 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,
                   /* This is arbitrary.  */
                   prop = LBP_AL1;
                   break;
-                case LBP_QU2:
-                  /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous
-                     character's line break property was not one of
-                     BK, CR, LF, OP, QU, GL, SP, ZW.  */
-                  switch (prev_prop)
-                    {
-                    case LBP_BK:
-                    case LBP_CR:
-                    case LBP_LF:
-                    case LBP_OP1: case LBP_OP2:
-                    case LBP_QU1: case LBP_QU2: case LBP_QU3:
-                    case LBP_GL:
-                    case LBP_SP:
-                    case LBP_ZW:
-                      break;
-                    default:
-                      prop = LBP_QU1;
-                      break;
-                    }
-                  break;
-                case LBP_QU3:
-                  /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next
-                     character's line break property is not one of
-                     BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
-                  switch (PROP (lookahead1_prop_ea))
-                    {
-                    case LBP_BK:
-                    case LBP_CR:
-                    case LBP_LF:
-                    case LBP_SP:
-                    case LBP_GL:
-                    case LBP_WJ:
-                    case LBP_CL:
-                    case LBP_QU1: case LBP_QU2: case LBP_QU3:
-                    case LBP_CP1: case LBP_CP2:
-                    case LBP_EX:
-                    case LBP_IS:
-                    case LBP_SY:
-                    case LBP_ZW:
-                      break;
-                    default:
-                      prop = LBP_QU1;
-                      break;
-                    }
-                  break;
                 }
 
               /* Deal with spaces and combining characters.  */
@@ -343,7 +298,35 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,
                     }
                   else
                     {
-                      switch (unilbrk_table [last_prop] [prop])
+                      int this_prop = prop;
+                      if (prop == LBP_QU3)
+                        {
+                          /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
+                             next character's line break property is not one of
+                             BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
+                          switch (PROP (lookahead1_prop_ea))
+                            {
+                            case LBP_BK:
+                            case LBP_CR:
+                            case LBP_LF:
+                            case LBP_SP:
+                            case LBP_GL:
+                            case LBP_WJ:
+                            case LBP_CL:
+                            case LBP_QU1: case LBP_QU2: case LBP_QU3:
+                            case LBP_CP1: case LBP_CP2:
+                            case LBP_EX:
+                            case LBP_IS:
+                            case LBP_SY:
+                            case LBP_ZW:
+                              break;
+                            default:
+                              this_prop = LBP_QU1;
+                              break;
+                            }
+                        }
+
+                      switch (unilbrk_table [last_prop] [this_prop])
                         {
                         case D:
                           *p = UC_BREAK_POSSIBLE;
@@ -358,6 +341,29 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,
                           abort ();
                         }
                     }
+
+                  if (prop == LBP_QU2)
+                    {
+                      /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
+                         previous character's line break property was not one of
+                         BK, CR, LF, OP, QU, GL, SP, ZW.  */
+                      switch (prev_prop)
+                        {
+                        case LBP_BK:
+                        case LBP_CR:
+                        case LBP_LF:
+                        case LBP_OP1: case LBP_OP2:
+                        case LBP_QU1: case LBP_QU2: case LBP_QU3:
+                        case LBP_GL:
+                        case LBP_SP:
+                        case LBP_ZW:
+                          break;
+                        default:
+                          prop = LBP_QU1;
+                          break;
+                        }
+                    }
+
                   last_prop = prop;
                   seen_space = NULL;
                 }
diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c
index b351ece0d4..10f39a8d05 100644
--- a/lib/unilbrk/u32-possible-linebreaks.c
+++ b/lib/unilbrk/u32-possible-linebreaks.c
@@ -151,51 +151,6 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
                   /* This is arbitrary.  */
                   prop = LBP_AL1;
                   break;
-                case LBP_QU2:
-                  /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous
-                     character's line break property was not one of
-                     BK, CR, LF, OP, QU, GL, SP, ZW.  */
-                  switch (prev_prop)
-                    {
-                    case LBP_BK:
-                    case LBP_CR:
-                    case LBP_LF:
-                    case LBP_OP1: case LBP_OP2:
-                    case LBP_QU1: case LBP_QU2: case LBP_QU3:
-                    case LBP_GL:
-                    case LBP_SP:
-                    case LBP_ZW:
-                      break;
-                    default:
-                      prop = LBP_QU1;
-                      break;
-                    }
-                  break;
-                case LBP_QU3:
-                  /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next
-                     character's line break property is not one of
-                     BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
-                  switch (PROP (lookahead1_prop_ea))
-                    {
-                    case LBP_BK:
-                    case LBP_CR:
-                    case LBP_LF:
-                    case LBP_SP:
-                    case LBP_GL:
-                    case LBP_WJ:
-                    case LBP_CL:
-                    case LBP_QU1: case LBP_QU2: case LBP_QU3:
-                    case LBP_CP1: case LBP_CP2:
-                    case LBP_EX:
-                    case LBP_IS:
-                    case LBP_SY:
-                    case LBP_ZW:
-                      break;
-                    default:
-                      prop = LBP_QU1;
-                      break;
-                    }
-                  break;
                 }
 
               /* Deal with spaces and combining characters.  */
@@ -336,7 +291,35 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
                     }
                   else
                     {
-                      switch (unilbrk_table [last_prop] [prop])
+                      int this_prop = prop;
+                      if (prop == LBP_QU3)
+                        {
+                          /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
+                             next character's line break property is not one of
+                             BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
+                          switch (PROP (lookahead1_prop_ea))
+                            {
+                            case LBP_BK:
+                            case LBP_CR:
+                            case LBP_LF:
+                            case LBP_SP:
+                            case LBP_GL:
+                            case LBP_WJ:
+                            case LBP_CL:
+                            case LBP_QU1: case LBP_QU2: case LBP_QU3:
+                            case LBP_CP1: case LBP_CP2:
+                            case LBP_EX:
+                            case LBP_IS:
+                            case LBP_SY:
+                            case LBP_ZW:
+                              break;
+                            default:
+                              this_prop = LBP_QU1;
+                              break;
+                            }
+                        }
+
+                      switch (unilbrk_table [last_prop] [this_prop])
                         {
                         case D:
                           *p = UC_BREAK_POSSIBLE;
@@ -351,6 +334,29 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
                           abort ();
                         }
                     }
+
+                  if (prop == LBP_QU2)
+                    {
+                      /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
+                         previous character's line break property was not one of
+                         BK, CR, LF, OP, QU, GL, SP, ZW.  */
+                      switch (prev_prop)
+                        {
+                        case LBP_BK:
+                        case LBP_CR:
+                        case LBP_LF:
+                        case LBP_OP1: case LBP_OP2:
+                        case LBP_QU1: case LBP_QU2: case LBP_QU3:
+                        case LBP_GL:
+                        case LBP_SP:
+                        case LBP_ZW:
+                          break;
+                        default:
+                          prop = LBP_QU1;
+                          break;
+                        }
+                    }
+
                   last_prop = prop;
                   seen_space = NULL;
                 }
diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c
index dd7d7c6fbd..65248e5c40 100644
--- a/lib/unilbrk/u8-possible-linebreaks.c
+++ b/lib/unilbrk/u8-possible-linebreaks.c
@@ -158,51 +158,6 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
                   /* This is arbitrary.  */
                   prop = LBP_AL1;
                   break;
-                case LBP_QU2:
-                  /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous
-                     character's line break property was not one of
-                     BK, CR, LF, OP, QU, GL, SP, ZW.  */
-                  switch (prev_prop)
-                    {
-                    case LBP_BK:
-                    case LBP_CR:
-                    case LBP_LF:
-                    case LBP_OP1: case LBP_OP2:
-                    case LBP_QU1: case LBP_QU2: case LBP_QU3:
-                    case LBP_GL:
-                    case LBP_SP:
-                    case LBP_ZW:
-                      break;
-                    default:
-                      prop = LBP_QU1;
-                      break;
-                    }
-                  break;
-                case LBP_QU3:
-                  /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next
-                     character's line break property is not one of
-                     BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
-                  switch (PROP (lookahead1_prop_ea))
-                    {
-                    case LBP_BK:
-                    case LBP_CR:
-                    case LBP_LF:
-                    case LBP_SP:
-                    case LBP_GL:
-                    case LBP_WJ:
-                    case LBP_CL:
-                    case LBP_QU1: case LBP_QU2: case LBP_QU3:
-                    case LBP_CP1: case LBP_CP2:
-                    case LBP_EX:
-                    case LBP_IS:
-                    case LBP_SY:
-                    case LBP_ZW:
-                      break;
-                    default:
-                      prop = LBP_QU1;
-                      break;
-                    }
-                  break;
                 }
 
               /* Deal with spaces and combining characters.  */
@@ -343,7 +298,35 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
                     }
                   else
                     {
-                      switch (unilbrk_table [last_prop] [prop])
+                      int this_prop = prop;
+                      if (prop == LBP_QU3)
+                        {
+                          /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
+                             next character's line break property is not one of
+                             BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
+                          switch (PROP (lookahead1_prop_ea))
+                            {
+                            case LBP_BK:
+                            case LBP_CR:
+                            case LBP_LF:
+                            case LBP_SP:
+                            case LBP_GL:
+                            case LBP_WJ:
+                            case LBP_CL:
+                            case LBP_QU1: case LBP_QU2: case LBP_QU3:
+                            case LBP_CP1: case LBP_CP2:
+                            case LBP_EX:
+                            case LBP_IS:
+                            case LBP_SY:
+                            case LBP_ZW:
+                              break;
+                            default:
+                              this_prop = LBP_QU1;
+                              break;
+                            }
+                        }
+
+                      switch (unilbrk_table [last_prop] [this_prop])
                         {
                         case D:
                           *p = UC_BREAK_POSSIBLE;
@@ -358,6 +341,29 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
                           abort ();
                         }
                     }
+
+                  if (prop == LBP_QU2)
+                    {
+                      /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
+                         previous character's line break property was not one of
+                         BK, CR, LF, OP, QU, GL, SP, ZW.  */
+                      switch (prev_prop)
+                        {
+                        case LBP_BK:
+                        case LBP_CR:
+                        case LBP_LF:
+                        case LBP_OP1: case LBP_OP2:
+                        case LBP_QU1: case LBP_QU2: case LBP_QU3:
+                        case LBP_GL:
+                        case LBP_SP:
+                        case LBP_ZW:
+                          break;
+                        default:
+                          prop = LBP_QU1;
+                          break;
+                        }
+                    }
+
                   last_prop = prop;
                   seen_space = NULL;
                 }




Reply via email to