Re: uninorm/nfc - Unicode version?

Simon Josefsson Wed, 05 Jan 2011 14:42:46 -0800

Ben Pfaff <b...@cs.stanford.edu> writes:

> Simon Josefsson <si...@josefsson.org> writes:
>
>> The best would be if the process to re-generate the files were
>> documented, then I could generate them on the fly to test my code with a
>> 5.1, 5.2 and 6.0 Unicode library, which would be useful for
>> compatibility and regression testing.
>
> It's documented in the comments at the top and bottom of
> lib/gen-uni-tables.c.  Just compile that one file and then run it
> on the specified files from the unicode database.
>
> (When I ran it on the unicode 6.0 database a few weeks ago, it
> reported an error and failed.  Presumably at least some minor
> updates are needed.)


I have made some changes to gen-uni-tables.c (see below) to get it
past the parsing stage.  I'm not certain mapping the CP line breaking
property into nothing is correct though.  I'm using 6.0.0 files,
although the CP line breaking property was added for 5.2.0.

The code now crashes when generating output because of three distinct
issues:

1) It crashes in output_properties for the PROPERTY(alphabetic) call,
triggering this abort:

/* Return true if CH has the property Alphabetic.
   The answer is computed twice: once derived from is_category_L, the
   PROP_OTHER_ALPHABETIC bit and a table of exceptional ranges, and once
   read directly from the PROP_ALPHABETIC bit of unicode_properties[CH].
   If the two answers disagree, the program aborts.  */
static bool
is_property_alphabetic (unsigned int ch)
{
  /* For some reason, the following are listed as having property
     Alphabetic but not as having property Other_Alphabetic.
     Each entry is an inclusive range of code points.  */
  static const struct { unsigned int first; unsigned int last; } extras[] =
    {
      { 0x16EE, 0x16F0 },   /* RUNIC SYMBOLS */
      { 0x2160, 0x2182 },   /* ROMAN NUMERALS */
      { 0x2185, 0x2188 },   /* ROMAN NUMERALS */
      { 0x24D0, 0x24E9 },   /* CIRCLED LATIN SMALL LETTER */
      { 0x3007, 0x3007 },   /* IDEOGRAPHIC NUMBER ZERO */
      { 0x3021, 0x3029 },   /* HANGZHOU NUMERAL */
      { 0x3038, 0x303A },   /* HANGZHOU NUMERAL */
      { 0x10140, 0x10174 }, /* GREEK ACROPHONICS */
      { 0x10341, 0x10341 }, /* GOTHIC LETTER NINETY */
      { 0x1034A, 0x1034A }, /* GOTHIC LETTER NINE HUNDRED */
      { 0x103D1, 0x103D5 }, /* OLD PERSIAN NUMBERS */
      { 0x12400, 0x12462 }  /* CUNEIFORM NUMERIC SIGNS */
    };
  unsigned int i;
  bool derived;
  bool listed;

  derived =
    (is_category_L (ch)
     || (unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);

  /* Consult the exception table only while no earlier rule has matched;
     the loop stops at the first range that contains CH.  */
  for (i = 0; !derived && i < sizeof (extras) / sizeof (extras[0]); i++)
    derived = (ch >= extras[i].first && ch <= extras[i].last);

  listed = (unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0;

  /* Consistency check between the derived and the listed value.  */
  if (derived != listed)
    abort ();
  return derived;
}

2) Similarly it crashes for the PROPERTY(default_ignorable_code_point)
call, triggering this abort:

/* Return true if CH has the property Default_Ignorable_Code_Point.
   The answer is computed twice: once derived from is_category_Cf (minus
   a few excluded code points), the PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT
   bit and the PROP_VARIATION_SELECTOR bit, and once read directly from
   the PROP_DEFAULT_IGNORABLE_CODE_POINT bit of unicode_properties[CH].
   If the two answers disagree, the program aborts.  */
static bool
is_property_default_ignorable_code_point (unsigned int ch)
{
  /* Code points for which is_category_Cf alone does not make the
     derived value true.  */
  bool cf_exception =
    (ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
    || (ch >= 0x0600 && ch <= 0x0603)
    || ch == 0x06DD
    || ch == 0x070F;
  bool derived;
  bool listed;

  if (is_category_Cf (ch) && !cf_exception)
    derived = true;
  else if ((unicode_properties[ch]
            & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
    derived = true;
  else
    derived =
      (unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0;

  listed =
    (unicode_properties[ch]
     & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0;

  /* Consistency check between the derived and the listed value.  */
  if (derived != listed)
    abort ();
  return derived;
}

3) It crashes in output_composition_tables triggering this abort:

          /* Exclude decompositions where the first part is not a starter,
             i.e. is not of canonical combining class 0.  */
          if (strcmp (unicode_attributes[code1].combining, "0") == 0
              /* Exclude characters listed in CompositionExclusions.txt.  */
              && !unicode_composition_exclusions[combined])
            {
              /* The combined character must now also be a starter.
                 Verify this.  */
              if (strcmp (unicode_attributes[combined].combining, "0") != 0)
                abort ();

              if (!(code1 < 0x10000))
                abort ();

Any ideas?

Commenting out these three output function calls results in an exit code
of 0, so hopefully this isn't too difficult to fix.  The difficult part
is likely to be reviewing the code to see if anything in the Unicode
standard affects any of the algorithms.  But that is what good self tests
and real world usage are for...

Thanks,
/Simon

diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index 170e807..64a4f80 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -2534,6 +2534,13 @@ enum
   PROP_ALPHABETIC,
   PROP_LOWERCASE,
   PROP_UPPERCASE,
+  PROP_CASED,
+  PROP_CASE_IGNORABLE,
+  PROP_CHANGES_WHEN_LOWERCASED,
+  PROP_CHANGES_WHEN_UPPERCASED,
+  PROP_CHANGES_WHEN_TITLECASED,
+  PROP_CHANGES_WHEN_CASEFOLDED,
+  PROP_CHANGES_WHEN_CASEMAPPED,
   PROP_ID_START,
   PROP_ID_CONTINUE,
   PROP_XID_START,
@@ -2632,6 +2639,13 @@ fill_properties (const char *proplist_filename)
       PROP ("Alphabetic", PROP_ALPHABETIC)
       PROP ("Lowercase", PROP_LOWERCASE)
       PROP ("Uppercase", PROP_UPPERCASE)
+      PROP ("Cased", PROP_CASED)
+      PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
+      PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
+      PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
+      PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
+      PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
+      PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
       PROP ("ID_Start", PROP_ID_START)
       PROP ("ID_Continue", PROP_ID_CONTINUE)
       PROP ("XID_Start", PROP_XID_START)
@@ -5944,6 +5958,7 @@ fill_org_lbp (const char *linebreak_filename)
       else if (strcmp (field1, "CR") == 0) value = LBP_BK;
       else if (strcmp (field1, "NL") == 0) value = LBP_BK;
       else if (strcmp (field1, "SG") == 0) value = LBP_XX;
+      else if (strcmp (field1, "CP") == 0) value = LBP_XX;
       else
         {
           fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",

Re: uninorm/nfc - Unicode version?

Reply via email to