Hello, I looked at this more closely. Attached is the revised version of this patch.
At Mon, 05 Dec 2016 19:29:54 +0900 (Tokyo Standard Time), Kyotaro HORIGUCHI <horiguchi.kyot...@lab.ntt.co.jp> wrote in <20161205.192954.121855559.horiguchi.kyot...@lab.ntt.co.jp> > Apart from the above, I have some trivial comments on the new > version. > > > 1. If we decide not to use old-style maps, UtfToLocal no longer > needs to take void * as map data. (Patch 0001) > 2. "use Data::Dumper" doesn't seem necessary. (Patch 0002) > 3. A comment contains a superfluous comma. (Patch 0002) (The last > byte of the first line below) > 4. The following code doesn't seem so perl'ish. > 5. download_srctxts.sh is no longer needed. (No patch) 6. Fixed some inconsistent indentation/folding. 7. Fix handling of $verbose. 8. Sort segments using leading bytes. regards, -- Kyotaro Horiguchi NTT Open Source Software Center
diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile index 0345a36..f184f65 100644 --- a/src/backend/utils/mb/Unicode/Makefile +++ b/src/backend/utils/mb/Unicode/Makefile @@ -158,9 +158,6 @@ gb-18030-2000.xml windows-949-2000.xml: euc-jis-2004-std.txt sjis-0213-2004-std.txt: $(DOWNLOAD) http://x0213.org/codetable/$(@F) -gb-18030-2000.xml windows-949-2000.xml: - $(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F) - GB2312.TXT: $(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt' @@ -176,7 +173,7 @@ KOI8-R.TXT KOI8-U.TXT: $(ISO8859TEXTS): $(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F) -$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT: +$(filter-out CP8%,$(WINTEXTS)) $(filter CP9%, $(SPECIALTEXTS)): $(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F) $(filter CP8%,$(WINTEXTS)): diff --git a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl index 822ab28..62e5145 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl @@ -51,7 +51,9 @@ foreach my $i (@$cp950txt) { code => $code, ucs => $ucs, comment => $i->{comment}, - direction => "both" }; + direction => "both", + f => $i->{f}, + l => $i->{l} }; } } @@ -70,6 +72,6 @@ foreach my $i (@$all) } # Output -print_tables($this_script, "BIG5", $all); +print_tables($this_script, "BIG5", $all, 1); print_radix_trees($this_script, "BIG5", $all); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl index a933c12..299beec 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl @@ -72,9 +72,11 @@ while (<$in>) push @mapping, { ucs => $ucs, code => $code, - direction => 'both' }; + direction => 'both', + f => $in_file, + l => $. 
}; } close($in); -print_tables($this_script, "EUC_CN", \@mapping); +print_tables($this_script, "EUC_CN", \@mapping, 1); print_radix_trees($this_script, "EUC_CN", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl index 1bf7f2e..fea03df 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl @@ -31,12 +31,14 @@ while (my $line = <$in>) my $ucs1 = hex($u1); my $ucs2 = hex($u2); - push @all, { direction => 'both', - ucs => $ucs1, - ucs_second => $ucs2, - code => $code, - comment => $rest }; - next; + push @all, + { direction => 'both', + ucs => $ucs1, + ucs_second => $ucs2, + code => $code, + comment => $rest, + f => $in_file, + l => $. }; } elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) { @@ -47,7 +49,13 @@ while (my $line = <$in>) next if ($code < 0x80 && $ucs < 0x80); - push @all, { direction => 'both', ucs => $ucs, code => $code, comment => $rest }; + push @all, + { direction => 'both', + ucs => $ucs, + code => $code, + comment => $rest, + f => $in_file, + l => $. 
}; } } close($in); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl index 5ac3542..9dcb9e2 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl @@ -108,98 +108,98 @@ foreach my $i (@mapping) } push @mapping, ( - {direction => 'both', ucs => 0x4efc, code => 0x8ff4af, comment => '# CJK(4EFC)'}, - {direction => 'both', ucs => 0x50f4, code => 0x8ff4b0, comment => '# CJK(50F4)'}, - {direction => 'both', ucs => 0x51EC, code => 0x8ff4b1, comment => '# CJK(51EC)'}, - {direction => 'both', ucs => 0x5307, code => 0x8ff4b2, comment => '# CJK(5307)'}, - {direction => 'both', ucs => 0x5324, code => 0x8ff4b3, comment => '# CJK(5324)'}, - {direction => 'both', ucs => 0x548A, code => 0x8ff4b5, comment => '# CJK(548A)'}, - {direction => 'both', ucs => 0x5759, code => 0x8ff4b6, comment => '# CJK(5759)'}, - {direction => 'both', ucs => 0x589E, code => 0x8ff4b9, comment => '# CJK(589E)'}, - {direction => 'both', ucs => 0x5BEC, code => 0x8ff4ba, comment => '# CJK(5BEC)'}, - {direction => 'both', ucs => 0x5CF5, code => 0x8ff4bb, comment => '# CJK(5CF5)'}, - {direction => 'both', ucs => 0x5D53, code => 0x8ff4bc, comment => '# CJK(5D53)'}, - {direction => 'both', ucs => 0x5FB7, code => 0x8ff4be, comment => '# CJK(5FB7)'}, - {direction => 'both', ucs => 0x6085, code => 0x8ff4bf, comment => '# CJK(6085)'}, - {direction => 'both', ucs => 0x6120, code => 0x8ff4c0, comment => '# CJK(6120)'}, - {direction => 'both', ucs => 0x654E, code => 0x8ff4c1, comment => '# CJK(654E)'}, - {direction => 'both', ucs => 0x663B, code => 0x8ff4c2, comment => '# CJK(663B)'}, - {direction => 'both', ucs => 0x6665, code => 0x8ff4c3, comment => '# CJK(6665)'}, - {direction => 'both', ucs => 0x6801, code => 0x8ff4c6, comment => '# CJK(6801)'}, - {direction => 'both', ucs => 0x6A6B, code => 0x8ff4c9, comment => '# CJK(6A6B)'}, - {direction => 'both', ucs => 0x6AE2, code => 0x8ff4ca, comment => '# 
CJK(6AE2)'}, - {direction => 'both', ucs => 0x6DF2, code => 0x8ff4cc, comment => '# CJK(6DF2)'}, - {direction => 'both', ucs => 0x6DF8, code => 0x8ff4cb, comment => '# CJK(6DF8)'}, - {direction => 'both', ucs => 0x7028, code => 0x8ff4cd, comment => '# CJK(7028)'}, - {direction => 'both', ucs => 0x70BB, code => 0x8ff4ae, comment => '# CJK(70BB)'}, - {direction => 'both', ucs => 0x7501, code => 0x8ff4d0, comment => '# CJK(7501)'}, - {direction => 'both', ucs => 0x7682, code => 0x8ff4d1, comment => '# CJK(7682)'}, - {direction => 'both', ucs => 0x769E, code => 0x8ff4d2, comment => '# CJK(769E)'}, - {direction => 'both', ucs => 0x7930, code => 0x8ff4d4, comment => '# CJK(7930)'}, - {direction => 'both', ucs => 0x7AE7, code => 0x8ff4d9, comment => '# CJK(7AE7)'}, - {direction => 'both', ucs => 0x7DA0, code => 0x8ff4dc, comment => '# CJK(7DA0)'}, - {direction => 'both', ucs => 0x7DD6, code => 0x8ff4dd, comment => '# CJK(7DD6)'}, - {direction => 'both', ucs => 0x8362, code => 0x8ff4df, comment => '# CJK(8362)'}, - {direction => 'both', ucs => 0x85B0, code => 0x8ff4e1, comment => '# CJK(85B0)'}, - {direction => 'both', ucs => 0x8807, code => 0x8ff4e4, comment => '# CJK(8807)'}, - {direction => 'both', ucs => 0x8B7F, code => 0x8ff4e6, comment => '# CJK(8B7F)'}, - {direction => 'both', ucs => 0x8CF4, code => 0x8ff4e7, comment => '# CJK(8CF4)'}, - {direction => 'both', ucs => 0x8D76, code => 0x8ff4e8, comment => '# CJK(8D76)'}, - {direction => 'both', ucs => 0x90DE, code => 0x8ff4ec, comment => '# CJK(90DE)'}, - {direction => 'both', ucs => 0x9115, code => 0x8ff4ee, comment => '# CJK(9115)'}, - {direction => 'both', ucs => 0x9592, code => 0x8ff4f1, comment => '# CJK(9592)'}, - {direction => 'both', ucs => 0x973B, code => 0x8ff4f4, comment => '# CJK(973B)'}, - {direction => 'both', ucs => 0x974D, code => 0x8ff4f5, comment => '# CJK(974D)'}, - {direction => 'both', ucs => 0x9751, code => 0x8ff4f6, comment => '# CJK(9751)'}, - {direction => 'both', ucs => 0x999E, code => 
0x8ff4fa, comment => '# CJK(999E)'}, - {direction => 'both', ucs => 0x9AD9, code => 0x8ff4fb, comment => '# CJK(9AD9)'}, - {direction => 'both', ucs => 0x9B72, code => 0x8ff4fc, comment => '# CJK(9B72)'}, - {direction => 'both', ucs => 0x9ED1, code => 0x8ff4fe, comment => '# CJK(9ED1)'}, - {direction => 'both', ucs => 0xF929, code => 0x8ff4c5, comment => '# CJK COMPATIBILITY IDEOGRAPH-F929'}, - {direction => 'both', ucs => 0xF9DC, code => 0x8ff4f2, comment => '# CJK COMPATIBILITY IDEOGRAPH-F9DC'}, - {direction => 'both', ucs => 0xFA0E, code => 0x8ff4b4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0E'}, - {direction => 'both', ucs => 0xFA0F, code => 0x8ff4b7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0F'}, - {direction => 'both', ucs => 0xFA10, code => 0x8ff4b8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA10'}, - {direction => 'both', ucs => 0xFA11, code => 0x8ff4bd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA11'}, - {direction => 'both', ucs => 0xFA12, code => 0x8ff4c4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA12'}, - {direction => 'both', ucs => 0xFA13, code => 0x8ff4c7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA13'}, - {direction => 'both', ucs => 0xFA14, code => 0x8ff4c8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA14'}, - {direction => 'both', ucs => 0xFA15, code => 0x8ff4ce, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA15'}, - {direction => 'both', ucs => 0xFA16, code => 0x8ff4cf, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA16'}, - {direction => 'both', ucs => 0xFA17, code => 0x8ff4d3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA17'}, - {direction => 'both', ucs => 0xFA18, code => 0x8ff4d5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA18'}, - {direction => 'both', ucs => 0xFA19, code => 0x8ff4d6, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA19'}, - {direction => 'both', ucs => 0xFA1A, code => 0x8ff4d7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1A'}, - {direction => 'both', ucs => 0xFA1B, code => 0x8ff4d8, comment => '# CJK COMPATIBILITY 
IDEOGRAPH-FA1B'}, - {direction => 'both', ucs => 0xFA1C, code => 0x8ff4da, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1C'}, - {direction => 'both', ucs => 0xFA1D, code => 0x8ff4db, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1D'}, - {direction => 'both', ucs => 0xFA1E, code => 0x8ff4de, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1E'}, - {direction => 'both', ucs => 0xFA1F, code => 0x8ff4e0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1F'}, - {direction => 'both', ucs => 0xFA20, code => 0x8ff4e2, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA20'}, - {direction => 'both', ucs => 0xFA21, code => 0x8ff4e3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA21'}, - {direction => 'both', ucs => 0xFA22, code => 0x8ff4e5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA22'}, - {direction => 'both', ucs => 0xFA23, code => 0x8ff4e9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA23'}, - {direction => 'both', ucs => 0xFA24, code => 0x8ff4ea, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA24'}, - {direction => 'both', ucs => 0xFA25, code => 0x8ff4eb, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA25'}, - {direction => 'both', ucs => 0xFA26, code => 0x8ff4ed, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA26'}, - {direction => 'both', ucs => 0xFA27, code => 0x8ff4ef, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA27'}, - {direction => 'both', ucs => 0xFA28, code => 0x8ff4f0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA28'}, - {direction => 'both', ucs => 0xFA29, code => 0x8ff4f3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA29'}, - {direction => 'both', ucs => 0xFA2A, code => 0x8ff4f7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2A'}, - {direction => 'both', ucs => 0xFA2B, code => 0x8ff4f8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2B'}, - {direction => 'both', ucs => 0xFA2C, code => 0x8ff4f9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2C'}, - {direction => 'both', ucs => 0xFA2D, code => 0x8ff4fd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2D'}, - {direction => 'both', ucs => 0xFF07, code => 
0x8ff4a9, comment => '# FULLWIDTH APOSTROPHE'}, - {direction => 'both', ucs => 0xFFE4, code => 0x8fa2c3, comment => '# FULLWIDTH BROKEN BAR'}, + {direction => 'both', ucs => 0x4efc, code => 0x8ff4af, comment => '# CJK(4EFC)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x50f4, code => 0x8ff4b0, comment => '# CJK(50F4)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x51EC, code => 0x8ff4b1, comment => '# CJK(51EC)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x5307, code => 0x8ff4b2, comment => '# CJK(5307)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x5324, code => 0x8ff4b3, comment => '# CJK(5324)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x548A, code => 0x8ff4b5, comment => '# CJK(548A)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x5759, code => 0x8ff4b6, comment => '# CJK(5759)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x589E, code => 0x8ff4b9, comment => '# CJK(589E)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x5BEC, code => 0x8ff4ba, comment => '# CJK(5BEC)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x5CF5, code => 0x8ff4bb, comment => '# CJK(5CF5)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x5D53, code => 0x8ff4bc, comment => '# CJK(5D53)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x5FB7, code => 0x8ff4be, comment => '# CJK(5FB7)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x6085, code => 0x8ff4bf, comment => '# CJK(6085)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x6120, code => 0x8ff4c0, comment => '# CJK(6120)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x654E, code => 0x8ff4c1, comment => '# CJK(654E)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x663B, code => 0x8ff4c2, comment => '# CJK(663B)', f => $this_script, 
l=> __LINE__}, + {direction => 'both', ucs => 0x6665, code => 0x8ff4c3, comment => '# CJK(6665)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x6801, code => 0x8ff4c6, comment => '# CJK(6801)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x6A6B, code => 0x8ff4c9, comment => '# CJK(6A6B)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x6AE2, code => 0x8ff4ca, comment => '# CJK(6AE2)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x6DF2, code => 0x8ff4cc, comment => '# CJK(6DF2)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x6DF8, code => 0x8ff4cb, comment => '# CJK(6DF8)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x7028, code => 0x8ff4cd, comment => '# CJK(7028)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x70BB, code => 0x8ff4ae, comment => '# CJK(70BB)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x7501, code => 0x8ff4d0, comment => '# CJK(7501)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x7682, code => 0x8ff4d1, comment => '# CJK(7682)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x769E, code => 0x8ff4d2, comment => '# CJK(769E)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x7930, code => 0x8ff4d4, comment => '# CJK(7930)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x7AE7, code => 0x8ff4d9, comment => '# CJK(7AE7)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x7DA0, code => 0x8ff4dc, comment => '# CJK(7DA0)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x7DD6, code => 0x8ff4dd, comment => '# CJK(7DD6)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x8362, code => 0x8ff4df, comment => '# CJK(8362)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x85B0, code => 0x8ff4e1, comment => '# CJK(85B0)', f => $this_script, l=> 
__LINE__}, + {direction => 'both', ucs => 0x8807, code => 0x8ff4e4, comment => '# CJK(8807)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x8B7F, code => 0x8ff4e6, comment => '# CJK(8B7F)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x8CF4, code => 0x8ff4e7, comment => '# CJK(8CF4)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x8D76, code => 0x8ff4e8, comment => '# CJK(8D76)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x90DE, code => 0x8ff4ec, comment => '# CJK(90DE)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x9115, code => 0x8ff4ee, comment => '# CJK(9115)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x9592, code => 0x8ff4f1, comment => '# CJK(9592)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x973B, code => 0x8ff4f4, comment => '# CJK(973B)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x974D, code => 0x8ff4f5, comment => '# CJK(974D)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x9751, code => 0x8ff4f6, comment => '# CJK(9751)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x999E, code => 0x8ff4fa, comment => '# CJK(999E)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x9AD9, code => 0x8ff4fb, comment => '# CJK(9AD9)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x9B72, code => 0x8ff4fc, comment => '# CJK(9B72)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0x9ED1, code => 0x8ff4fe, comment => '# CJK(9ED1)', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xF929, code => 0x8ff4c5, comment => '# CJK COMPATIBILITY IDEOGRAPH-F929', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xF9DC, code => 0x8ff4f2, comment => '# CJK COMPATIBILITY IDEOGRAPH-F9DC', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA0E, code => 0x8ff4b4, comment => '# 
CJK COMPATIBILITY IDEOGRAPH-FA0E', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA0F, code => 0x8ff4b7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0F', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA10, code => 0x8ff4b8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA10', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA11, code => 0x8ff4bd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA11', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA12, code => 0x8ff4c4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA12', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA13, code => 0x8ff4c7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA13', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA14, code => 0x8ff4c8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA14', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA15, code => 0x8ff4ce, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA15', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA16, code => 0x8ff4cf, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA16', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA17, code => 0x8ff4d3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA17', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA18, code => 0x8ff4d5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA18', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA19, code => 0x8ff4d6, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA19', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA1A, code => 0x8ff4d7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1A', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA1B, code => 0x8ff4d8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1B', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA1C, code => 0x8ff4da, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1C', f => 
$this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA1D, code => 0x8ff4db, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1D', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA1E, code => 0x8ff4de, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1E', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA1F, code => 0x8ff4e0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1F', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA20, code => 0x8ff4e2, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA20', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA21, code => 0x8ff4e3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA21', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA22, code => 0x8ff4e5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA22', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA23, code => 0x8ff4e9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA23', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA24, code => 0x8ff4ea, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA24', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA25, code => 0x8ff4eb, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA25', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA26, code => 0x8ff4ed, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA26', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA27, code => 0x8ff4ef, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA27', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA28, code => 0x8ff4f0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA28', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA29, code => 0x8ff4f3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA29', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA2A, code => 0x8ff4f7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2A', f => $this_script, l=> __LINE__}, + 
{direction => 'both', ucs => 0xFA2B, code => 0x8ff4f8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2B', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA2C, code => 0x8ff4f9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2C', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFA2D, code => 0x8ff4fd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2D', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFF07, code => 0x8ff4a9, comment => '# FULLWIDTH APOSTROPHE', f => $this_script, l=> __LINE__}, + {direction => 'both', ucs => 0xFFE4, code => 0x8fa2c3, comment => '# FULLWIDTH BROKEN BAR', f => $this_script, l=> __LINE__}, # additional conversions for EUC_JP -> UTF-8 conversion - {direction => 'to_unicode', ucs => 0x2116, code => 0x8ff4ac, comment => '# NUMERO SIGN'}, - {direction => 'to_unicode', ucs => 0x2121, code => 0x8ff4ad, comment => '# TELEPHONE SIGN'}, - {direction => 'to_unicode', ucs => 0x3231, code => 0x8ff4ab, comment => '# PARENTHESIZED IDEOGRAPH STOCK'} + {direction => 'to_unicode', ucs => 0x2116, code => 0x8ff4ac, comment => '# NUMERO SIGN', f => $this_script, l=> __LINE__}, + {direction => 'to_unicode', ucs => 0x2121, code => 0x8ff4ad, comment => '# TELEPHONE SIGN', f => $this_script, l=> __LINE__}, + {direction => 'to_unicode', ucs => 0x3231, code => 0x8ff4ab, comment => '# PARENTHESIZED IDEOGRAPH STOCK', f => $this_script, l=> __LINE__} ); #>>> -print_tables($this_script, "EUC_JP", \@mapping); +print_tables($this_script, "EUC_JP", \@mapping, 1); print_radix_trees($this_script, "EUC_JP", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl index d17d777..baa3f9c 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl @@ -33,11 +33,11 @@ foreach my $i (@$mapping) # Some extra characters that are not in KSX1001.TXT #<<< do not let perltidy touch this push @$mapping,( - {direction => 
'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN'}, - {direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN'}, - {direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U'} + {direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN', f => $this_script, l => __LINE__}, + {direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN', f => $this_script, l => __LINE__ }, + {direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U', f => $this_script, l => __LINE__ } ); #>>> -print_tables($this_script, "EUC_KR", $mapping); +print_tables($this_script, "EUC_KR", $mapping, 1); print_radix_trees($this_script, "EUC_KR", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl index 603edc4..0407575 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl @@ -56,11 +56,13 @@ foreach my $i (@$mapping) { ucs => $i->{ucs}, code => ($i->{code} + 0x8ea10000), rest => $i->{rest}, - direction => 'to_unicode' }; + direction => 'to_unicode', + f => $i->{f}, + l => $i->{l} }; } } push @$mapping, @extras; -print_tables($this_script, "EUC_TW", $mapping); +print_tables($this_script, "EUC_TW", $mapping, 1); print_radix_trees($this_script, "EUC_TW", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl index e20b4a8..922f206 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl @@ -38,10 +38,12 @@ while (<$in>) push @mapping, { ucs => $ucs, code => $code, - direction => 'both' }; + direction => 'both', + f => $in_file, + l => $. 
}; } } close($in); -print_tables($this_script, "GB18030", \@mapping); +print_tables($this_script, "GB18030", \@mapping, 1); print_radix_trees($this_script, "GB18030", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl old mode 100755 new mode 100644 index 2dc9fb3..ab6bebf --- a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl @@ -27,11 +27,11 @@ my $mapping = &read_source("JOHAB.TXT"); # Some extra characters that are not in JOHAB.TXT #<<< do not let perltidy touch this push @$mapping, ( - {direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN'}, - {direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN'}, - {direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U'} + {direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN', f => $this_script, l => __LINE__ }, + {direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN', f => $this_script, l => __LINE__ }, + {direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U', f => $this_script, l => __LINE__ } ); #>>> -print_tables($this_script, "JOHAB", $mapping); +print_tables($this_script, "JOHAB", $mapping, 1); print_radix_trees($this_script, "JOHAB", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl index b1ab307..557fc62 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl @@ -24,7 +24,6 @@ while (my $line = <$in>) { if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) { - # combined characters my ($c, $u1, $u2) = ($1, $2, $3); my $rest = "U+" . $u1 . "+" . $u2 . 
$4; my $code = hex($c); @@ -36,15 +35,15 @@ while (my $line = <$in>) ucs => $ucs1, ucs_second => $ucs2, comment => $rest, - direction => 'both' }; + direction => 'both', + f => $in_file, + l => $. }; next; } elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) { - # non-combined characters my ($c, $u, $rest) = ($1, $2, "U+" . $2 . $3); - my $ucs = hex($u); - my $code = hex($c); + my ($ucs, $code) = (hex($u), hex($c)); my $direction; if ($code < 0x80 && $ucs < 0x80) @@ -64,12 +63,13 @@ while (my $line = <$in>) $direction = 'both'; } - push @mapping, { - code => $code, - ucs => $ucs, - comment => $rest, - direction => $direction - }; + push @mapping, + { code => $code, + ucs => $ucs, + comment => $rest, + direction => $direction, + f => $in_file, + l => $. }; } } close($in); diff --git a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl index ffeb65f..e1978f7 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl @@ -40,16 +40,16 @@ foreach my $i (@$mapping) # Add these UTF8->SJIS pairs to the table. 
#<<< do not let perltidy touch this push @$mapping, ( - {direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN'}, - {direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN'}, - {direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN'}, - {direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN'}, - {direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE'}, - {direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE'}, - {direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN'}, - {direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH'} + {direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH', f => $this_script, l => __LINE__ } ); #>>> -print_tables($this_script, "SJIS", $mapping); +print_tables($this_script, "SJIS", $mapping, 1); print_radix_trees($this_script, "SJIS", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl 
b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl old mode 100755 new mode 100644 index 2905b95..26cf5a2 --- a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl @@ -41,7 +41,9 @@ while (<$in>) push @mapping, { ucs => $ucs, code => $code, - direction => 'both' }; + direction => 'both', + f => $in_file, + l => $. }; } } close($in); @@ -51,7 +53,9 @@ push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, - comment => 'CIRCLED HANGUL IEUNG U' }; + comment => 'CIRCLED HANGUL IEUNG U', + f => $this_script, + l => __LINE__ }; -print_tables($this_script, "UHC", \@mapping); +print_tables($this_script, "UHC", \@mapping, 1); print_radix_trees($this_script, "UHC", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_most.pl b/src/backend/utils/mb/Unicode/UCS_to_most.pl index 55ef873..8cc3eb7 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_most.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_most.pl @@ -56,6 +56,6 @@ foreach my $charset (@charsets) { my $mapping = &read_source($filename{$charset}); - print_tables($this_script, $charset, $mapping); + print_tables($this_script, $charset, $mapping, 1); print_radix_trees($this_script, $charset, $mapping); } diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm index 3ab461b..f4e6917 100644 --- a/src/backend/utils/mb/Unicode/convutils.pm +++ b/src/backend/utils/mb/Unicode/convutils.pm @@ -19,20 +19,24 @@ sub ucs2utf } elsif ($ucs > 0x007f && $ucs <= 0x07ff) { - $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); + $utf = + (($ucs & 0x003f) | 0x80) | + ((($ucs >> 6) | 0xc0) << 8); } elsif ($ucs > 0x07ff && $ucs <= 0xffff) { $utf = ((($ucs >> 12) | 0xe0) << 16) | - (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | + (($ucs & 0x003f) | 0x80); } else { $utf = ((($ucs >> 18) | 0xf0) << 24) | (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | - (((($ucs & 0x0fc0) >> 6) | 
0x80) << 8) | (($ucs & 0x003f) | 0x80); + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | + (($ucs & 0x003f) | 0x80); } return ($utf); } @@ -72,7 +76,9 @@ sub read_source code => hex($1), ucs => hex($2), comment => $4, - direction => "both" }; + direction => "both", + f => $fname, + l => $. }; # Ignore pure ASCII mappings. PostgreSQL character conversion code # never even passes these to the conversion code. @@ -134,8 +140,7 @@ sub print_tables { push @to_unicode_combined, $entry; } - if ( $i->{direction} eq "both" - || $i->{direction} eq "from_unicode") + if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") { push @from_unicode_combined, $entry; } @@ -152,8 +157,7 @@ sub print_tables { push @to_unicode, $entry; } - if ( $i->{direction} eq "both" - || $i->{direction} eq "from_unicode") + if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") { push @from_unicode, $entry; } @@ -183,15 +187,16 @@ sub print_from_utf8_map my $fname = lc("utf8_to_${charset}.map"); print "- Writing UTF8=>${charset} conversion table: $fname\n"; open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". - "static const pg_utf_to_local ULmap${charset}[ %d ] = {", - scalar(@$table)); + printf $out "/* src/backend/utils/mb/Unicode/$fname */\n" + . "/* This file is generated by $this_script */\n\n" + . 
"static const pg_utf_to_local ULmap${charset}[ %d ] = {", + scalar(@$table); my $first = 1; foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table) { print $out "," if (!$first); $first = 0; - print $out "\t/* $last_comment */" if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); printf $out "\n {0x%04x, 0x%04x}", $i->{utf8}, $i->{code}; if ($verbose >= 2) @@ -199,12 +204,12 @@ sub print_from_utf8_map $last_comment = sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); } - else + elsif ($verbose >= 1) { $last_comment = $i->{comment}; } } - print $out "\t/* $last_comment */" if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; close($out); } @@ -218,21 +223,30 @@ sub print_from_utf8_combined_map my $fname = lc("utf8_to_${charset}_combined.map"); print "- Writing UTF8=>${charset} conversion table: $fname\n"; open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". - "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", - scalar(@$table)); + printf $out "/* src/backend/utils/mb/Unicode/$fname */\n" + . "/* This file is generated by $this_script */\n\n" + . 
"static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", + scalar(@$table); my $first = 1; foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table) { print $out "," if (!$first); $first = 0; - print $out "\t/* $last_comment */" if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); printf $out "\n {0x%08x, 0x%08x, 0x%04x}", $i->{utf8}, $i->{utf8_second}, $i->{code}; - $last_comment = $i->{comment}; + if ($verbose >= 2) + { + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); + } + elsif ($verbose >= 1) + { + $last_comment = $i->{comment}; + } } - print $out "\t/* $last_comment */" if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; close($out); } @@ -247,15 +261,16 @@ sub print_to_utf8_map print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". - "static const pg_local_to_utf LUmap${charset}[ %d ] = {", - scalar(@$table)); + printf $out "/* src/backend/utils/mb/Unicode/$fname */\n" + . "/* This file is generated by $this_script */\n\n" + . 
"static const pg_local_to_utf LUmap${charset}[ %d ] = {", + scalar(@$table); my $first = 1; foreach my $i (sort { $a->{code} <=> $b->{code} } @$table) { print $out "," if (!$first); $first = 0; - print $out "\t/* $last_comment */" if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); printf $out "\n {0x%04x, 0x%x}", $i->{code}, $i->{utf8}; if ($verbose >= 2) @@ -263,12 +278,12 @@ sub print_to_utf8_map $last_comment = sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); } - else + elsif ($verbose >= 1) { $last_comment = $i->{comment}; } } - print $out "\t/* $last_comment */" if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; close($out); } @@ -283,21 +298,31 @@ sub print_to_utf8_combined_map print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". - "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", - scalar(@$table)); + printf $out "/* src/backend/utils/mb/Unicode/$fname */\n" + . "/* This file is generated by $this_script */\n\n" + . 
"static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", + scalar(@$table); my $first = 1; foreach my $i (sort { $a->{code} <=> $b->{code} } @$table) { print $out "," if (!$first); $first = 0; - print $out "\t/* $last_comment */" if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); printf $out "\n {0x%04x, 0x%08x, 0x%08x}", $i->{code}, $i->{utf8}, $i->{utf8_second}; - $last_comment = $i->{comment}; + + if ($verbose >= 2) + { + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); + } + elsif ($verbose >= 1) + { + $last_comment = $i->{comment}; + } } - print $out "\t/* $last_comment */" if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; close($out); } @@ -305,711 +330,442 @@ sub print_to_utf8_combined_map ############################################################################# # RADIX TREE STUFF -# C struct type names : see wchar.h -my $radix_type = "pg_mb_radix_tree"; -my $radix_node_type = "pg_mb_radix_index"; - -######################################### -# read_maptable(<map file name>) -# -# extract data from map files and returns a character map table. -# returns a reference to a hash <in code> => <out code> -sub read_maptable -{ - my ($fname) = @_; - my %c; - - open(my $in, '<', $fname) || die("cannot open $fname"); - - while (<$in>) - { - if (/^[ \t]*{0x([0-9a-f]+), *0x([0-9a-f]+)},?/) - { - $c{ hex($1) } = hex($2); - } - } - - close($in); - return \%c; -} - ######################################### -# generate_index(<charmap hash ref>) +# print_radix_table(<charmap hash ref>) # -# generate a radix tree data from a character table -# returns a hashref to an index data. 
-# { -# csegs => <character segment index> -# b2idx => [<tree index of 1st byte of 2-byte code>] -# b3idx => [<idx for 1st byte for 3-byte code>, <2nd>] -# b4idx => [<idx for 1st byte for 4-byte code>, <2nd>, <3rd>] -# } +# Input: A hash, mapping an input character to an output character. # -# Tables are in two forms, flat and segmented. a segmented table is -# logically a two-dimentional table but physically a sequence of -# segments, fixed length block of items. This structure allows us to -# shrink table size by overlapping a shared sequence of zeros between -# successive two segments. overlap_segments does that step. +# Constructs a radix tree from the hash, and prints it out as a C-struct. # -# A flat table is simple set of key and value pairs. The value is a -# segment id of the next segmented table. The next table is referenced -# using the segment id and the next byte of a code. -# -# flat table (b2idx, b3idx1, b4idx1) -# { -# attr => { -# segmented => true(1) if this index is segmented> -# min => <minimum value of index key> -# max => <maximum value of index key> -# nextidx => <hash reference to the next level table> -# } -# i => { # index data -# <byte> => <pointer value> # pointer to the next index -# ... -# } -# -# Each segments in segmented table is equivalent to a flat table -# above. -# -# segmented table (csegs, b3idx2, b4idx2, b4idx3) -# { -# attr => { -# segmented => true(1) if this index is segmented> -# min => <minimum value of index key> -# max => <maximum value of index key> -# width => <required hex width only for cseg table> -# is32bit => true if values are 32bit width, false means 16bit. -# has0page => only for cseg. 
true if 0 page is for single byte chars -# next => <hash reference to the next level table, if any> -# } -# i => { # segment data -# <segid> => { # key for this segment -# lower => <minimum value> -# upper => <maximum value> -# offset => <position of this segment in the whole table> -# label => <label string of this segment> -# d => { # segment data -# <byte> => { # pointer to the next index -# label => <label string for this item> -# segid => <target segid of next level> -# segoffset => <offset of the target segid> -# } -# ... -# } -# } -# } -# } - -sub generate_index -{ - my ($c) = @_; - my (%csegs, %b2idx, %b3idx1, %b3idx2, %b4idx1, %b4idx2, %b4idx3); - my @all_tables = - (\%csegs, \%b2idx, \%b3idx1, \%b3idx2, \%b4idx1, \%b4idx2, \%b4idx3); - my $si; - - # initialize attributes of index tables - #<<< do not let perltidy touch this - $csegs{attr} = {name => "csegs", chartbl => 1, segmented => 1, - is32bit => 0, has0page => 0}; - #>>> - $csegs{attr} = { - name => "csegs", - chartbl => 1, - segmented => 1, - is32bit => 0, - has0page => 0 }; - $b2idx{attr} = { name => "b2idx", segmented => 0, nextidx => \%csegs }; - $b3idx1{attr} = { name => "b3idx1", segmented => 0, nextidx => \%b3idx2 }; - $b3idx2{attr} = { name => "b3idx2", segmented => 1, nextidx => \%csegs }; - $b4idx1{attr} = { name => "b4idx1", segmented => 0, nextidx => \%b4idx2 }; - $b4idx2{attr} = { name => "b4idx2", segmented => 1, nextidx => \%b4idx3 }; - $b4idx3{attr} = { name => "b4idx3", segmented => 1, nextidx => \%csegs }; +sub print_radix_table +{ + my ($out, $tblname, $c) = @_; + + ### + ### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each + ### radix tree is represented as a nested hash, each hash indexed by + ### input byte + ### + my %b1map; + my %b2map; + my %b3map; + my %b4map; foreach my $in (keys %$c) { + my $out = $c->{$in}; + if ($in < 0x100) { - my $b1 = $in; - - # 1 byte code doesn't have index. 
the first segment #0 of - # character table stores them - $csegs{attr}{has0page} = 1; - $si = { - segid => 0, - off => $in, - label => "1byte-", - char => $c->{$in} }; + $b1map{$in} = $out; } elsif ($in < 0x10000) { - # 2-byte code index consists of just one flat table my $b1 = $in >> 8; my $b2 = $in & 0xff; - my $csegid = $in >> 8; - if (!defined $b2idx{i}{$b1}) - { - &set_min_max($b2idx{attr}, $b1); - $b2idx{i}{$b1}{segid} = $csegid; - } - $si = { - segid => $csegid, - off => $b2, - label => sprintf("%02x", $b1), - char => $c->{$in} }; + $b2map{$b1}{$b2} = $out; } elsif ($in < 0x1000000) { - # 3-byte code index consists of one flat table and one - # segmented table my $b1 = $in >> 16; my $b2 = ($in >> 8) & 0xff; my $b3 = $in & 0xff; - my $l1id = $in >> 16; - my $csegid = $in >> 8; - - if (!defined $b3idx1{i}{$b1}) - { - &set_min_max($b3idx1{attr}, $b1); - $b3idx1{i}{$b1}{segid} = $l1id; - } - if (!defined $b3idx2{i}{$l1id}{d}{$b2}) - { - &set_min_max($b3idx2{attr}, $b2); - $b3idx2{i}{$l1id}{label} = sprintf("%02x", $b1); - $b3idx2{i}{$l1id}{d}{$b2} = { - segid => $csegid, - label => sprintf("%02x%02x", $b1, $b2) }; - } - $si = { - segid => $csegid, - off => $b3, - label => sprintf("%02x%02x", $b1, $b2), - char => $c->{$in} }; + $b3map{$b1}{$b2}{$b3} = $out; } elsif ($in < 0x100000000) { - # 4-byte code index consists of one flat table, and two - # segmented tables my $b1 = $in >> 24; my $b2 = ($in >> 16) & 0xff; my $b3 = ($in >> 8) & 0xff; my $b4 = $in & 0xff; - my $l1id = $in >> 24; - my $l2id = $in >> 16; - my $csegid = $in >> 8; - - if (!defined $b4idx1{i}{$b1}) - { - &set_min_max($b4idx1{attr}, $b1); - $b4idx1{i}{$b1}{segid} = $l1id; - } - - if (!defined $b4idx2{i}{$l1id}{d}{$b2}) - { - &set_min_max($b4idx2{attr}, $b2); - $b4idx2{i}{$l1id}{d}{$b2} = { - segid => $l2id, - label => sprintf("%02x", $b1) }; - } - if (!defined $b4idx3{i}{$l2id}{d}{$b3}) - { - &set_min_max($b4idx3{attr}, $b3); - $b4idx3{i}{$l2id}{d}{$b3} = { - segid => $csegid, - label => 
sprintf("%02x%02x", $b1, $b2) }; - } - $si = { - segid => $csegid, - off => $b4, - label => sprintf("%02x%02x%02x", $b1, $b2, $b3), - char => $c->{$in} }; + $b4map{$b1}{$b2}{$b3}{$b4} = $out; } else { die sprintf("up to 4 byte code is supported: %x", $in); } - - &set_min_max($csegs{attr}, $si->{off}); - $csegs{i}{ $si->{segid} }{d}{ $si->{off} } = $si->{char}; - $csegs{i}{ $si->{segid} }{label} = $si->{label}; - $csegs{attr}{is32bit} = 1 if ($si->{char} >= 0x10000); - &update_width($csegs{attr}, $si->{char}); - if ($si->{char} >= 0x100000000) - { - die "character width is over 32bit. abort."; - } } - # calcualte segment attributes - foreach my $t (@all_tables) - { - next if (!defined $t->{i} || !$t->{attr}{segmented}); - - # segments are to be aligned in the numerical order of segment id - my @keylist = sort { $a <=> $b } keys $t->{i}; - next if ($#keylist < 0); - my $offset = 1; - my $segsize = $t->{attr}{max} - $t->{attr}{min} + 1; - - for my $k (@keylist) + my @segments; + + ### + ### Build a linear list of "segments", from the nested hashes. + ### + ### Each segment is a lookup table, keyed by the next byte in the input. + ### The segments are written out physically to one big array in the final + ### step, but logically, they form a radix tree. Or rather, four radix + ### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte + ### inputs, and 4-byte inputs. + ### + ### Each segment is represented by a hash with following fields: + ### + ### comment => <string to output as a comment> + ### label => <label that can be used to refer to this segment from elsewhere> + ### values => <a hash, keyed by byte, 0-0xff> + ### + ### Entries in 'values' can be integers (for leaf-level segments), or + ### string labels, pointing to a segment with that label. Any missing + ### values are treated as zeros. If 'values' hash is missing altogether, + ### it's treated as all-zeros. + ### + ### Subsequent steps will enrich the segments with more fields. 
+ ### + + # Add the segments for the radix trees themselves. + push @segments, build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map); + push @segments, build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map); + push @segments, build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map); + push @segments, build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map); + + ### + ### Find min and max index used in each level of each tree. + ### + ### These are stored separately, and we can then leave out the unused + ### parts of every segment. (When using the resulting tree, you must + ### check each input byte against the min and max.) + ### + my %min_idx; + my %max_idx; + foreach my $seg (@segments) + { + my $this_min = $min_idx{$seg->{depth}}->{$seg->{level}}; + my $this_max = $max_idx{$seg->{depth}}->{$seg->{level}}; + + foreach my $i (keys $seg->{values}) { - my $seg = $t->{i}{$k}; - $seg->{lower} = $t->{attr}{min}; - $seg->{upper} = $t->{attr}{max}; - $seg->{offset} = $offset; - $offset += $segsize; + $this_min = $i if (!defined $this_min || $i < $this_min); + $this_max = $i if (!defined $this_max || $i > $this_max); } - # overlapping successive zeros between segments - &overlap_segments($t); + $min_idx{$seg->{depth}}{$seg->{level}} = $this_min; + $max_idx{$seg->{depth}}{$seg->{level}} = $this_max; } - - # make link among tables - foreach my $t (@all_tables) + # Copy the mins and max's back to every segment, for convenience + foreach my $seg (@segments) { - &make_index_link($t, $t->{attr}{nextidx}); + $seg->{min_idx} = $min_idx{$seg->{depth}}{$seg->{level}}; + $seg->{max_idx} = $max_idx{$seg->{depth}}{$seg->{level}}; } - return { - name_prefix => "", - csegs => \%csegs, - b2idx => [ \%b2idx ], - b3idx => [ \%b3idx1, \%b3idx2 ], - b4idx => [ \%b4idx1, \%b4idx2, \%b4idx3 ], - all => \@all_tables }; -} - - -######################################### -# set_min_max - internal routine to maintain min and max value of a table -sub 
set_min_max -{ - my ($a, $v) = @_; - - $a->{min} = $v if (!defined $a->{min} || $v < $a->{min}); - $a->{max} = $v if (!defined $a->{max} || $v > $a->{max}); -} - -######################################### -# set_maxval - internal routine to maintain mixval -sub update_width -{ - my ($a, $v) = @_; + ### + ### Prepend a dummy all-zeros map to the beginning. + ### + ### A 0 is an invalid value anywhere in the table, and this allows us to + ### point to 0 offset anywhere else in the tables, to get a 0 result. - my $nnibbles = int((int(log($v) / log(16)) + 1) / 2) * 2; - if (!defined $a->{width} || $nnibbles > $a->{width}) + # Find the max range between min and max indexes in any of the segments. + my $widest_range = 0; + foreach my $seg (@segments) { - $a->{width} = $nnibbles; + my $this_range = $seg->{max_idx} - $seg->{min_idx}; + $widest_range = $this_range if ($this_range > $widest_range); } -} - -######################################### -# overlap_segments -# -# removes duplicate regeion between two successive segments. - -sub overlap_segments -{ - my ($h) = @_; - # don't touch if undefined - return if (!defined $h->{i} || !$h->{attr}{segmented}); - my $index = $h->{i}; - my ($min, $max) = ($h->{attr}{min}, $h->{attr}{max}); - my ($prev, $first); - my @segids = sort { $a <=> $b } keys $index; - return if ($#segids < 1); + unshift @segments, { + header => "Dummy map, for invalid values", + min_idx => 0, + max_idx => $widest_range + }; - $first = 1; - undef $prev; - - for my $segid (@segids) + ### + ### Eliminate overlapping zeros + ### + ### For each segment, if there are zero values at the end of it, and there + ### are also zero values at the beginning of the next segment, we can + ### overlay the tail of this segment with the head of next segment, to + ### save space. + ### + ### To achieve that, we reduce the 'max_idx' of each segment by the + ### amount of zeros that can be overlaid. 
+ ### + for (my $j = 0; $j < $#segments - 1; $j++) { - my $seg = $index->{$segid}; - - # smin and smax is range excluded preceeding and trailing zeros - my @keys = sort { $a <=> $b } keys $seg->{d}; - my $smin = $keys[0]; - my $smax = $keys[-1]; + my $seg = $segments[$j]; + my $nextseg = $segments[$j + 1]; - if ($first) + # Count the number of zero values at the end of this segment. + my $this_trail_zeros = 0; + for (my $i = $seg->{max_idx}; $i >= $seg->{min_idx} && !$seg->{values}->{$i}; $i--) { - # first segment doesn't have a preceding segment - $seg->{offset} = 1; - $seg->{lower} = $min; - $seg->{upper} = $smax; + $this_trail_zeros++; } - else + + # Count the number of zeros at the beginning of next segment. + my $next_lead_zeros = 0; + for (my $i = $nextseg->{min_idx}; $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; $i++) { - # calculate overlap and shift segment location - my $prefix = $smin - $min; - my $postfix = $max - $smax; - my $prevpostfix = $max - $prev->{upper}; - my $overlap = $prevpostfix < $prefix ? $prevpostfix : $prefix; - - $seg->{lower} = $min + $overlap; - $seg->{upper} = $smax; - $seg->{offset} = $prev->{offset} + ($max - $min + 1) - $overlap; - $prev->{upper} = $max; + $next_lead_zeros++; } - $prev = $seg; - $first = 0; - } - return $h; -} + # How many zeros in common? + my $overlaid_trail_zeros = + ($this_trail_zeros > $next_lead_zeros) ? $next_lead_zeros : $this_trail_zeros; -###################################################### -# make_index_link(from_table, to_table) -# -# Fills out target pointers in non-leaf index tables. -# -# from_table - table to set links -# to_table - target table of from_table + $seg->{overlaid_trail_zeros} = $overlaid_trail_zeros; + $seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros; + } -sub make_index_link -{ - my ($s, $t) = @_; - return if (!defined $s->{i} || !defined $t->{i}); + ### + ### Replace label references with real offsets. 
+ ### + ### So far, the non-leaf segments have referred to other segments by + ### their labels. Replace them with numerical offsets from the beginning + ### of the final array. You cannot move, add, or remove segments after + ### this step, as that would invalidate the offsets calculated here! + ### + my $flatoff = 0; + my %segmap; - my @tkeys = sort { $a <=> $b } keys $t->{i}; + # First pass: assign offsets to each segment, and build hash + # of label => offset. + foreach my $seg (@segments) + { + $seg->{offset} = $flatoff; + $segmap{$seg->{label}} = $flatoff; + $flatoff += $seg->{max_idx} - $seg->{min_idx} + 1; + } + my $tblsize = $flatoff; - if ($s->{attr}{segmented}) + # Second pass: look up the offset of each label reference in the hash. + foreach my $seg (@segments) { - foreach my $k1 (keys $s->{i}) + while (my ($i, $val) = each %{$seg->{values}}) { - foreach my $k2 (keys $s->{i}{$k1}{d}) + if (!($val =~ /^[0-9,.E]+$/ )) { - my $tsegid = $s->{i}{$k1}{d}{$k2}{segid}; - if (!defined $tsegid) + my $segoff = $segmap{$val}; + if ($segoff) + { + $seg->{values}->{$i} = $segoff; + } + else { - die sprintf( - "segid is not set in %s{i}{%x}{d}{%x}{segid}", - $s->{attr}{name}, - $k1, $k2); + die "no segment with label $val"; } - $s->{i}{$k1}{d}{$k2}{segoffset} = $t->{i}{$tsegid}{offset}; } } } - else - { - foreach my $k (keys $s->{i}) + + # Also look up the positions of the roots in the table. + my $b1root = $segmap{"1-byte"}; + my $b2root = $segmap{"2-byte"}; + my $b3root = $segmap{"3-byte"}; + my $b4root = $segmap{"4-byte"}; + + # And the lower-upper values of each level in each radix tree. 
+ my $b1_lower = $min_idx{1}{1}; + my $b1_upper = $max_idx{1}{1}; + + my $b2_1_lower = $min_idx{2}{1}; + my $b2_1_upper = $max_idx{2}{1}; + my $b2_2_lower = $min_idx{2}{2}; + my $b2_2_upper = $max_idx{2}{2}; + + my $b3_1_lower = $min_idx{3}{1}; + my $b3_1_upper = $max_idx{3}{1}; + my $b3_2_lower = $min_idx{3}{2}; + my $b3_2_upper = $max_idx{3}{2}; + my $b3_3_lower = $min_idx{3}{3}; + my $b3_3_upper = $max_idx{3}{3}; + + my $b4_1_lower = $min_idx{4}{1}; + my $b4_1_upper = $max_idx{4}{1}; + my $b4_2_lower = $min_idx{4}{2}; + my $b4_2_upper = $max_idx{4}{2}; + my $b4_3_lower = $min_idx{4}{3}; + my $b4_3_upper = $max_idx{4}{3}; + my $b4_4_lower = $min_idx{4}{4}; + my $b4_4_upper = $max_idx{4}{4}; + + ### + ### Find the maximum value in the whole table, to determine if we can + ### use uint16 or if we need to use uint32. + ### + my $max_val = 0; + foreach my $seg (@segments) + { + foreach my $val (values $seg->{values}) { - my $tsegid = $s->{i}{$k}{segid}; - if (!defined $tsegid) - { - die sprintf("segid is not set in %s{i}{%x}{segid}", - $s->{attr}{name}, $k); - } - $s->{i}{$k}{segoffset} = $t->{i}{$tsegid}{offset}; + $max_val = $val if ($val > $max_val); } } -} - -############################################### -# print_radix_table - output index table as C-struct -# -# print_radix_table(hd, table, tblname, width) -# returns 1 if the table is written -# -# hd - file handle to write -# table - ref to an index table -# tblname - C symbol name for the table -# width - width in characters of this table -sub print_radix_table -{ - my ($hd, $table, $tblname, $width) = @_; + my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32"; - return 0 if (!defined $table->{i}); + # For formatting, determine how many values we can fit on a single + # line, and how wide each value needs to be to align nicely. 
+ my $vals_per_line; + my $colwidth; - if ($table->{attr}{chartbl}) + if ($max_val <= 0xffff) { - &print_chars_table($hd, $table, $tblname, $width); + $vals_per_line = 8; + $colwidth = 4; } - elsif ($table->{attr}{segmented}) + elsif ($max_val <= 0xffffff) { - &print_segmented_table($hd, $table, $tblname, $width); + $vals_per_line = 4; + $colwidth = 6; } else { - &print_flat_table($hd, $table, $tblname, $width); - } - return 1; -} - -######################################### -# print_chars_table -# -# print_chars_table(hd, table, tblname, width) -# this is usually called via writ_table -# -# hd - file handle to write -# table - ref to an index table -# tblname - C symbol name for the table -# tblwidth- width in characters of this table - -sub print_chars_table -{ - my ($hd, $table, $tblname, $width) = @_; - my ($st, $ed) = ($table->{attr}{min}, $table->{attr}{max}); - my ($type) = $table->{attr}{is32bit} ? "uint32" : "uint16"; - - printf $hd "static const %s %s[] =\n{", $type, $tblname; - printf $hd " /* chars content - index range = [%02x, %02x] */", $st, $ed; - - # values in character table are written in fixedwidth - # hexadecimals. calculate the number of columns in a line. 13 is - # the length of line header. - - my $colwidth = $table->{attr}{width}; - my $colseplen = 4; # the length of ", 0x" - my $headerlength = 13; - my $colnum = int(($width - $headerlength) / ($colwidth + $colseplen)); - - # round down to multiples of 4. 
don't bother by too small table width - my $colnum = int($colnum / 4) * 4; - my $line = ""; - my $first0 = 1; - - # output all segments in segment id order - foreach my $k (sort { $a <=> $b } keys $table->{i}) - { - my $s = $table->{i}{$k}; - if (!$first0) - { - $line =~ s/\s+$//; # remove trailing space - print $hd $line, ",\n"; - $line = ""; - } - $first0 = 0; - - # write segment header - printf $hd "\n /*** %4sxx - offset 0x%05x ***/", - $s->{label}, $s->{offset}; - - # write segment content - my $first1 = 1; - my ($segstart, $segend) = ($s->{lower}, $s->{upper}); - my ($xpos, $nocomma) = (0, 0); - - foreach my $j (($segstart - ($segstart % $colnum)) .. $segend) + $vals_per_line = 4; + $colwidth = 8; + } + + ### + ### Print the struct and array. + ### + printf $out "static const $datatype ${tblname}_table[];\n"; + printf $out "\n"; + printf $out "static const pg_mb_radix_tree $tblname =\n"; + printf $out "{\n"; + if ($datatype eq "uint16") + { + print $out " ${tblname}_table,\n"; + print $out " NULL, /* 32-bit table not used */\n"; + } + if ($datatype eq "uint32") + { + print $out " NULL, /* 16-bit table not used */\n"; + print $out " ${tblname}_table,\n"; + } + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n", $b1root; + printf $out " 0x%02x, /* b1_lower */\n", $b1_lower; + printf $out " 0x%02x, /* b1_upper */\n", $b1_upper; + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n", $b2root; + printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower; + printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper; + printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower; + printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper; + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b3root; + printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower; + printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper; + printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower; + printf 
$out " 0x%02x, /* b3_2_upper */\n", $b3_2_upper; + printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower; + printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper; + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b4root; + printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower; + printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper; + printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower; + printf $out " 0x%02x, /* b4_2_upper */\n", $b4_2_upper; + printf $out " 0x%02x, /* b4_3_lower */\n", $b4_3_lower; + printf $out " 0x%02x, /* b4_3_upper */\n", $b4_3_upper; + printf $out " 0x%02x, /* b4_4_lower */\n", $b4_4_lower; + printf $out " 0x%02x /* b4_4_upper */\n", $b4_4_upper; + print $out "};\n"; + print $out "\n"; + print $out "static const $datatype ${tblname}_table[$tblsize] =\n"; + print $out "{"; + my $off = 0; + foreach my $seg (@segments) + { + printf $out "\n"; + printf $out " /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off; + printf $out "\n"; + + for (my $i=$seg->{min_idx}; $i <= $seg->{max_idx};) { - $line .= "," if (!$first1 && !$nocomma); - - # write the previous line and put a line header for the - # new line if this is the first time or this line is full - if ($xpos >= $colnum || $first1) - { - $line =~ s/\s+$//; # remove trailing space - print $hd $line, "\n"; - $line = sprintf(" /* %02x */ ", $j); - $xpos = 0; - } - else + # Print the next line's worth of values. 
+ # XXX pad to begin at a nice boundary + printf $out " /* %02x */ ", $i; + for (my $j = 0; $j < $vals_per_line && $i <= $seg->{max_idx}; $j++) { - $line .= " "; - } - $first1 = 0; + my $val = $seg->{values}->{$i}; - # write each column - if ($j >= $segstart) - { - $line .= sprintf("0x%0*x", $colwidth, $s->{d}{$j}); - $nocomma = 0; - } - else - { - # adjust column position - $line .= " " x ($colwidth + 3); - $nocomma = 1; + printf $out " 0x%0*x", $colwidth, $val; + $off++; + if ($off != $tblsize) + { + print $out ","; + } + $i++; } - $xpos++; + print $out "\n"; + } + if ($seg->{overlaid_trail_zeros}) + { + printf $out " /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n"; } - } - $line =~ s/\s+$//; - print $hd $line, "\n};\n"; -} + # Sanity check. + if ($off != $tblsize) { die "table size didn't match!"; } -###################################################### -# print_flat_table - output nonsegmented index table -# -# print_flat_table(hd, table, tblname, width) -# this is usually called via writ_table -# -# hd - file handle to write -# table - ref to an index table -# tblname - C symbol name for the table -# width - width in characters of this table + print $out "};\n"; +} -sub print_flat_table +### +sub build_segments_from_tree { - my ($hd, $table, $tblname, $width) = @_; - my ($st, $ed) = ($table->{attr}{min}, $table->{attr}{max}); - - print $hd "static const $radix_node_type $tblname =\n{"; - printf $hd "\n 0x%x, 0x%x, /* table range */\n", $st, $ed; - print $hd " {"; + my ($header, $rootlabel, $depth, $map) = @_; - my $first = 1; - my $line = ""; + my @segments; - foreach my $i ($st .. $ed) + if (%{$map}) { - $line .= "," if (!$first); - my $newitem = sprintf("%d", - defined $table->{i}{$i} ? $table->{i}{$i}{segoffset} : 0); + @segments = build_segments_recurse($header, $rootlabel, "", 1, $depth, $map); - # flush current line and feed a line if the current line - # exceeds a limit - if ($first || length($line . 
$newitem) > $width) - { - $line =~ s/\s+$//; # remove trailing space - print $hd "$line\n"; - $line = " "; - } - else - { - $line .= " "; - } - $line .= $newitem; - $first = 0; + # Sort the segments into "breadth-first" order. Not strictly required, + # but makes the maps nicer to read. + @segments = sort { $a->{level} cmp $b->{level} or + $a->{path} cmp $b->{path}} + @segments; } - print $hd $line; - print $hd "\n }\n};\n"; -} -###################################################### -# print_segmented_table - output segmented index table -# -# print_segmented_table(hd, table, tblname, width) -# this is usually called via writ_table -# -# hd - file handle to write -# table - ref to an index table -# tblname - C symbol name for the table -# width - width in characters of this table + return @segments; +} -sub print_segmented_table +### +sub build_segments_recurse { - my ($hd, $table, $tblname, $width) = @_; - my ($st, $ed) = ($table->{attr}{min}, $table->{attr}{max}); + my ($header, $label, $path, $level, $depth, $map) = @_; - # write the variable definition - print $hd "static const $radix_node_type $tblname =\n{"; - printf $hd "\n 0x%02x, 0x%02x, /*index range */\n {", $st, $ed; + my @segments; - my $first0 = 1; - foreach my $k (sort { $a <=> $b } keys $table->{i}) + if ($level == $depth) { - print $hd ",\n" if (!$first0); - $first0 = 0; - printf $hd "\n /*** %sxxxx - offset 0x%05x ****/", - $table->{i}{$k}{label}, $table->{i}{$k}{offset}; - - my $segstart = $table->{i}{$k}{lower}; - my $segend = $table->{i}{$k}{upper}; - - my $line = ""; - my $first1 = 1; - my $newitem = ""; + push @segments, { + header => $header . ", leaf: ${path}xx", + label => $label, + level => $level, + depth => $depth, + path => $path, + values => $map + }; + } + else + { + my %children; - foreach my $j ($segstart .. $segend) + while (my ($i, $val) = each $map) { - $line .= "," if (!$first1); - $newitem = sprintf("%d", - $table->{i}{$k}{d}{$j} - ? 
$table->{i}{$k}{d}{$j}{segoffset} - : 0); + my $childpath = $path . sprintf("%02x", $i); + my $childlabel = "$depth-level-$level-$childpath"; - if ($first1 || length($line . $newitem) > $width) - { - $line =~ s/\s+$//; - print $hd "$line\n"; - $line = - sprintf(" /* %2s%02x */ ", $table->{i}{$k}{label}, $j); - } - else - { - $line .= " "; - } - $line .= $newitem; - $first1 = 0; + push @segments, build_segments_recurse($header, $childlabel, $childpath, + $level + 1, $depth, $val); + $children{$i} = $childlabel; } - print $hd $line; - } - print $hd "\n }\n};\n"; -} - -######################################### -# make_table_refname(table, prefix) -# -# internal routine to make C reference notation for tables - -sub make_table_refname -{ - my ($table, $prefix) = @_; - - return "NULL" if (!defined $table->{i}); - return "&" . $prefix . $table->{attr}{name}; -} - -######################################### -# print_radix_main(hd, tblname, trie, name_prefix) -# -# write main radix tree table -# -# hd - file handle to write this table -# tblname - variable name of this struct -# trie - ref to a radix tree -# name_prefix- prefix for subtables. -sub print_radix_main -{ - my ($hd, $tblname, $trie, $name_prefix) = @_; - my $ctblname = $name_prefix . $trie->{csegs}{attr}{name}; - my ($ctbl16name, $ctbl32name); - if ($trie->{csegs}{attr}{is32bit}) - { - $ctbl16name = "NULL"; - $ctbl32name = $ctblname; - } - else - { - $ctbl16name = $ctblname; - $ctbl32name = "NULL"; + push @segments, { + header => $header . 
", byte #$level: ${path}xx", + label => $label, + level => $level, + depth => $depth, + path => $path, + values => \%children + }; } - - my $b2iname = make_table_refname($trie->{b2idx}[0], $name_prefix); - my $b3i1name = make_table_refname($trie->{b3idx}[0], $name_prefix); - my $b3i2name = make_table_refname($trie->{b3idx}[1], $name_prefix); - my $b4i1name = make_table_refname($trie->{b4idx}[0], $name_prefix); - my $b4i2name = make_table_refname($trie->{b4idx}[1], $name_prefix); - my $b4i3name = make_table_refname($trie->{b4idx}[2], $name_prefix); - - #<<< do not let perltidy touch this - print $hd "static const $radix_type $tblname =\n{\n"; - print $hd " /* final character table offset and body */\n"; - printf $hd " 0x%x, 0x%x, %s, %s, %s,\n", - $trie->{csegs}{attr}{min}, $trie->{csegs}{attr}{max}, - $trie->{csegs}{attr}{has0page} ? 'true' : 'false', - $ctbl16name, $ctbl32name; - - print $hd " /* 2-byte code table */\n"; - print $hd " $b2iname,\n"; - print $hd " /* 3-byte code tables */\n"; - print $hd " {$b3i1name, $b3i2name},\n"; - print $hd " /* 4-byte code table */\n"; - print $hd " {$b4i1name, $b4i2name, $b4i3name},\n"; - print $hd "};\n"; - #>>> + return @segments; } ###################################################### @@ -1078,7 +834,6 @@ sub print_radix_map my ($this_script, $csname, $direction, $charset, $tblwidth) = @_; my $charmap = &make_charmap($charset, $direction); - my $trie = &generate_index($charmap); my $fname = $direction eq "to_unicode" ? lc("${csname}_to_utf8_radix.map") @@ -1101,17 +856,8 @@ sub print_radix_map print $out "/* src/backend/utils/mb/Unicode/$fname */\n" . "/* This file is generated by $this_script */\n\n"; - foreach my $t (@{ $trie->{all} }) - { - my $table_name = $name_prefix . 
$t->{attr}{name}; - - if (&print_radix_table($out, $t, $table_name, $tblwidth)) - { - print $out "\n"; - } - } + print_radix_table($out, $tblname, $charmap); - &print_radix_main($out, $tblname, $trie, $name_prefix); close($out); } diff --git a/src/backend/utils/mb/Unicode/download_srctxts.sh b/src/backend/utils/mb/Unicode/download_srctxts.sh deleted file mode 100755 index 572d57e..0000000 --- a/src/backend/utils/mb/Unicode/download_srctxts.sh +++ /dev/null @@ -1,127 +0,0 @@ -#! /bin/bash - -# This script downloads conversion source files from URLs as of 2016/10/27 -# These source files may removed or changed without notice -if [ ! -e CP932.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT -fi -if [ ! -e JIS0201.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0201.TXT -fi -if [ ! -e JIS0208.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT -fi -if [ ! -e JIS0212.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT -fi -if [ ! -e SHIFTJIS.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT -fi -if [ ! -e CP866.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT -fi -if [ ! -e CP874.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP874.TXT -fi -if [ ! -e CP936.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT -fi -if [ ! -e CP950.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT -fi -if [ ! -e CP1250.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT -fi -if [ ! -e CP1251.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT -fi -if [ ! -e CP1252.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT -fi -if [ ! 
-e CP1253.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1253.TXT -fi -if [ ! -e CP1254.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1254.TXT -fi -if [ ! -e CP1255.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1255.TXT -fi -if [ ! -e CP1256.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT -fi -if [ ! -e CP1257.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1257.TXT -fi -if [ ! -e CP1258.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1258.TXT -fi -if [ ! -e 8859-2.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT -fi -if [ ! -e 8859-3.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-3.TXT -fi -if [ ! -e 8859-4.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-4.TXT -fi -if [ ! -e 8859-5.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-5.TXT -fi -if [ ! -e 8859-6.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-6.TXT -fi -if [ ! -e 8859-7.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT -fi -if [ ! -e 8859-8.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT -fi -if [ ! -e 8859-9.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-9.TXT -fi -if [ ! -e 8859-10.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-10.TXT -fi -if [ ! -e 8859-13.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-13.TXT -fi -if [ ! -e 8859-14.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-14.TXT -fi -if [ ! -e 8859-15.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-15.TXT -fi -if [ ! -e 8859-16.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT -fi -if [ ! 
-e KOI8-R.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT -fi -if [ ! -e KOI8-U.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT -fi -if [ ! -e CNS11643.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/CNS11643.TXT -fi -if [ ! -e KSX1001.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT -fi -if [ ! -e JOHAB.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/JOHAB.TXT -fi -if [ ! -e BIG5.TXT ]; then - wget ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT -fi -if [ ! -e windows-949-2000.xml ]; then - wget http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/windows-949-2000.xml -fi -if [ ! -e gb-18030-2000.xml ]; then - wget http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml -fi -if [ ! -e sjis-0213-2004-std.txt ]; then - wget http://x0213.org/codetable/sjis-0213-2004-std.txt -fi -if [ ! 
-e euc-jis-2004-std.txt ]; then - wget http://x0213.org/codetable/euc-jis-2004-std.txt -fi diff --git a/src/backend/utils/mb/Unicode/make_mapchecker.pl b/src/backend/utils/mb/Unicode/make_mapchecker.pl old mode 100755 new mode 100644 diff --git a/src/backend/utils/mb/Unicode/map_checker.c b/src/backend/utils/mb/Unicode/map_checker.c index 643ac10..65e33ea 100644 --- a/src/backend/utils/mb/Unicode/map_checker.c +++ b/src/backend/utils/mb/Unicode/map_checker.c @@ -9,98 +9,109 @@ * radix tree conversion function - this should be identical to the function in * ../conv.c with the same name */ -const uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, const uint32 c) +static inline uint32 +pg_mb_radix_conv(const pg_mb_radix_tree *rt, + int l, + unsigned char b1, + unsigned char b2, + unsigned char b3, + unsigned char b4) { - uint32 off = 0; - uint32 b1 = c >> 24; - uint32 b2 = (c >> 16) & 0xff; - uint32 b3 = (c >> 8) & 0xff; - uint32 b4 = c & 0xff; - - if (b1 > 0) + if (l == 4) { /* 4-byte code */ - uint32 idx; - - /* check code validity - fist byte */ - if (rt->b4idx[0] == NULL || - b1 < rt->b4idx[0]->lower || b1 > rt->b4idx[0]->upper) - return 0; - - idx = b1 - rt->b4idx[0]->lower; - off = rt->b4idx[0]->idx[idx]; - if (off == 0) - return 0; - /* check code validity - second byte */ - if (b2 < rt->b4idx[1]->lower || b2 > rt->b4idx[1]->upper) + /* check code validity */ + if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper || + b2 < rt->b4_2_lower || b2 > rt->b4_2_upper || + b3 < rt->b4_3_lower || b3 > rt->b4_3_upper || + b4 < rt->b4_4_lower || b4 > rt->b4_4_upper) return 0; - idx = b2 - rt->b4idx[1]->lower; - off = (rt->b4idx[1]->idx + off - 1)[idx]; - if (off == 0) - return 0; + if (rt->chars32) + { + uint32 idx = rt->b4root; - /* check code validity - third byte */ - if (b3 < rt->b4idx[2]->lower || b3 > rt->b4idx[2]->upper) - return 0; + idx = rt->chars32[b1 + idx - rt->b4_1_lower]; + idx = rt->chars32[b2 + idx - rt->b4_2_lower]; + idx = rt->chars32[b3 + idx - 
rt->b4_3_lower]; + return rt->chars32[b4 + idx - rt->b4_4_lower]; + } + else + { + uint16 idx = rt->b4root; - idx = b3 - rt->b4idx[2]->lower; - off = (rt->b4idx[2]->idx + off - 1)[idx]; + idx = rt->chars16[b1 + idx - rt->b4_1_lower]; + idx = rt->chars16[b2 + idx - rt->b4_2_lower]; + idx = rt->chars16[b3 + idx - rt->b4_3_lower]; + return rt->chars16[b4 + idx - rt->b4_4_lower]; + } } - else if (b2 > 0) + else if (l == 3) { /* 3-byte code */ - uint32 idx; - - /* check code validity - first byte */ - if (rt->b3idx[0] == NULL || - b2 < rt->b3idx[0]->lower || b2 > rt->b3idx[0]->upper) + /* check code validity */ + if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper || + b3 < rt->b3_2_lower || b3 > rt->b3_2_upper || + b4 < rt->b3_3_lower || b4 > rt->b3_3_upper) return 0; - idx = b2 - rt->b3idx[0]->lower; - off = rt->b3idx[0]->idx[idx]; - if (off == 0) - return 0; + if (rt->chars32) + { + uint32 idx = rt->b3root; - /* check code validity - second byte */ - if (b3 < rt->b3idx[1]->lower || b3 > rt->b3idx[1]->upper) - return 0; + idx = rt->chars32[b2 + idx - rt->b3_1_lower]; + idx = rt->chars32[b3 + idx - rt->b3_2_lower]; + return rt->chars32[b4 + idx - rt->b3_3_lower]; + } + else + { + uint16 idx = rt->b3root; - idx = b3 - rt->b3idx[1]->lower; - off = (rt->b3idx[1]->idx + off - 1)[idx]; + idx = rt->chars16[b2 + idx - rt->b3_1_lower]; + idx = rt->chars16[b3 + idx - rt->b3_2_lower]; + return rt->chars16[b4 + idx - rt->b3_3_lower]; + } } - else if (b3 > 0) + else if (l == 2) { /* 2-byte code */ - uint32 idx; /* check code validity - first byte */ - if (rt->b2idx == NULL || - b3 < rt->b2idx->lower || b3 > rt->b2idx->upper) + if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper || + b4 < rt->b2_2_lower || b4 > rt->b2_2_upper) return 0; - idx = b3 - rt->b2idx->lower; - off = rt->b2idx->idx[idx]; + if (rt->chars32) + { + uint32 idx = rt->b2root; + + idx = rt->chars32[b3 + idx - rt->b2_1_lower]; + return rt->chars32[b4 + idx - rt->b2_2_lower]; + } + else + { + uint16 idx = rt->b2root; + + idx 
= rt->chars16[b3 + idx - rt->b2_1_lower]; + return rt->chars16[b4 + idx - rt->b2_2_lower]; + } } - else + else if (l == 1) { - if (rt->single_byte) - off = 1; - } - - if (off == 0) - return 0; + /* 1-byte code */ - /* check code validity - last byte */ - if (b4 < rt->chars_lower || b4 > rt->chars_upper) - return 0; + /* check code validity - first byte */ + if (b4 < rt->b1_lower || b4 > rt->b1_upper) + return 0; - if (rt->chars32) - return (rt->chars32 + off - 1)[b4 - rt->chars_lower]; - else - return (rt->chars16 + off - 1)[b4 - rt->chars_lower]; + if (rt->chars32) + return rt->chars32[b4 + rt->b1root - rt->b1_lower]; + else + return rt->chars16[b4 + rt->b1root - rt->b1_lower]; + } + return 0; /* shouldn't happen */ } int main(void) @@ -116,6 +127,12 @@ int main(void) { uint32 s, c, d; + unsigned char b1; + unsigned char b2; + unsigned char b3; + unsigned char b4; + int l; + if (mp->ul) { s = mp->ul[i].utf; @@ -132,7 +149,20 @@ int main(void) exit(1); } - c = pg_mb_radix_conv(mp->rt, s); + b1 = s >> 24; + b2 = s >> 16; + b3 = s >> 8; + b4 = s; + if (b1 != 0) + l = 4; + else if (b2 != 0) + l = 3; + else if (b3 != 0) + l = 2; + else + l = 1; + + c = pg_mb_radix_conv(mp->rt, l, b1, b2, b3, b4); if (c != d) { diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index d4fab1f..f850487 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -284,36 +284,6 @@ mic2latin_with_table(const unsigned char *mic, /* * comparison routine for bsearch() - * this routine is intended for UTF8 -> local code - */ -static int -compare1(const void *p1, const void *p2) -{ - uint32 v1, - v2; - - v1 = *(const uint32 *) p1; - v2 = ((const pg_utf_to_local *) p2)->utf; - return (v1 > v2) ? 1 : ((v1 == v2) ? 
0 : -1); -} - -/* - * comparison routine for bsearch() - * this routine is intended for local code -> UTF8 - */ -static int -compare2(const void *p1, const void *p2) -{ - uint32 v1, - v2; - - v1 = *(const uint32 *) p1; - v2 = ((const pg_local_to_utf *) p2)->code; - return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); -} - -/* - * comparison routine for bsearch() * this routine is intended for combined UTF8 -> local code */ static int @@ -366,98 +336,109 @@ store_coded_char(unsigned char *dest, uint32 code) /* * radix tree conversion function */ -const uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, const uint32 c) +static inline uint32 +pg_mb_radix_conv(const pg_mb_radix_tree *rt, + int l, + unsigned char b1, + unsigned char b2, + unsigned char b3, + unsigned char b4) { - uint32 off = 0; - uint32 b1 = c >> 24; - uint32 b2 = (c >> 16) & 0xff; - uint32 b3 = (c >> 8) & 0xff; - uint32 b4 = c & 0xff; - - if (b1 > 0) + if (l == 4) { /* 4-byte code */ - uint32 idx; - - /* check code validity - fist byte */ - if (rt->b4idx[0] == NULL || - b1 < rt->b4idx[0]->lower || b1 > rt->b4idx[0]->upper) - return 0; - - idx = b1 - rt->b4idx[0]->lower; - off = rt->b4idx[0]->idx[idx]; - if (off == 0) - return 0; - /* check code validity - second byte */ - if (b2 < rt->b4idx[1]->lower || b2 > rt->b4idx[1]->upper) + /* check code validity */ + if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper || + b2 < rt->b4_2_lower || b2 > rt->b4_2_upper || + b3 < rt->b4_3_lower || b3 > rt->b4_3_upper || + b4 < rt->b4_4_lower || b4 > rt->b4_4_upper) return 0; - idx = b2 - rt->b4idx[1]->lower; - off = (rt->b4idx[1]->idx + off - 1)[idx]; - if (off == 0) - return 0; + if (rt->chars32) + { + uint32 idx = rt->b4root; - /* check code validity - third byte */ - if (b3 < rt->b4idx[2]->lower || b3 > rt->b4idx[2]->upper) - return 0; + idx = rt->chars32[b1 + idx - rt->b4_1_lower]; + idx = rt->chars32[b2 + idx - rt->b4_2_lower]; + idx = rt->chars32[b3 + idx - rt->b4_3_lower]; + return rt->chars32[b4 + idx - 
rt->b4_4_lower]; + } + else + { + uint16 idx = rt->b4root; - idx = b3 - rt->b4idx[2]->lower; - off = (rt->b4idx[2]->idx + off - 1)[idx]; + idx = rt->chars16[b1 + idx - rt->b4_1_lower]; + idx = rt->chars16[b2 + idx - rt->b4_2_lower]; + idx = rt->chars16[b3 + idx - rt->b4_3_lower]; + return rt->chars16[b4 + idx - rt->b4_4_lower]; + } } - else if (b2 > 0) + else if (l == 3) { /* 3-byte code */ - uint32 idx; - - /* check code validity - first byte */ - if (rt->b3idx[0] == NULL || - b2 < rt->b3idx[0]->lower || b2 > rt->b3idx[0]->upper) + /* check code validity */ + if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper || + b3 < rt->b3_2_lower || b3 > rt->b3_2_upper || + b4 < rt->b3_3_lower || b4 > rt->b3_3_upper) return 0; - idx = b2 - rt->b3idx[0]->lower; - off = rt->b3idx[0]->idx[idx]; - if (off == 0) - return 0; + if (rt->chars32) + { + uint32 idx = rt->b3root; - /* check code validity - second byte */ - if (b3 < rt->b3idx[1]->lower || b3 > rt->b3idx[1]->upper) - return 0; + idx = rt->chars32[b2 + idx - rt->b3_1_lower]; + idx = rt->chars32[b3 + idx - rt->b3_2_lower]; + return rt->chars32[b4 + idx - rt->b3_3_lower]; + } + else + { + uint16 idx = rt->b3root; - idx = b3 - rt->b3idx[1]->lower; - off = (rt->b3idx[1]->idx + off - 1)[idx]; + idx = rt->chars16[b2 + idx - rt->b3_1_lower]; + idx = rt->chars16[b3 + idx - rt->b3_2_lower]; + return rt->chars16[b4 + idx - rt->b3_3_lower]; + } } - else if (b3 > 0) + else if (l == 2) { /* 2-byte code */ - uint32 idx; /* check code validity - first byte */ - if (rt->b2idx == NULL || - b3 < rt->b2idx->lower || b3 > rt->b2idx->upper) + if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper || + b4 < rt->b2_2_lower || b4 > rt->b2_2_upper) return 0; - idx = b3 - rt->b2idx->lower; - off = rt->b2idx->idx[idx]; + if (rt->chars32) + { + uint32 idx = rt->b2root; + + idx = rt->chars32[b3 + idx - rt->b2_1_lower]; + return rt->chars32[b4 + idx - rt->b2_2_lower]; + } + else + { + uint16 idx = rt->b2root; + + idx = rt->chars16[b3 + idx - rt->b2_1_lower]; + 
return rt->chars16[b4 + idx - rt->b2_2_lower]; + } } - else + else if (l == 1) { - if (rt->single_byte) - off = 1; - } + /* 1-byte code */ - if (off == 0) - return 0; - - /* check code validity - last byte */ - if (b4 < rt->chars_lower || b4 > rt->chars_upper) - return 0; + /* check code validity - first byte */ + if (b4 < rt->b1_lower || b4 > rt->b1_upper) + return 0; - if (rt->chars32) - return (rt->chars32 + off - 1)[b4 - rt->chars_lower]; - else - return (rt->chars16 + off - 1)[b4 - rt->chars_lower]; + if (rt->chars32) + return rt->chars32[b4 + rt->b1root - rt->b1_lower]; + else + return rt->chars16[b4 + rt->b1root - rt->b1_lower]; + } + return 0; /* shouldn't happen */ } /* @@ -468,7 +449,6 @@ const uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, const uint32 c) * iso: pointer to the output area (must be large enough!) (output string will be null-terminated) * map: conversion map for single characters - * mapsize: number of entries in the conversion map * cmap: conversion map for combined characters * (optional, pass NULL if none) * cmapsize: number of entries in the conversion map for combined characters @@ -486,14 +466,13 @@ const uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, const uint32 c) void UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, - const void *map, int mapsize, - const void *cmap, int cmapsize, + const pg_mb_radix_tree *map, + const pg_utf_to_local *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding) { uint32 iutf; int l; - const pg_utf_to_local *p; const pg_utf_to_local_combined *cp; if (!PG_VALID_ENCODING(encoding)) @@ -503,6 +482,11 @@ UtfToLocal(const unsigned char *utf, int len, for (; len > 0; len -= l) { + unsigned char b1 = 0; + unsigned char b2 = 0; + unsigned char b3 = 0; + unsigned char b4 = 0; + /* "break" cases all represent errors */ if (*utf == '\0') break; @@ -524,27 +508,28 @@ UtfToLocal(const unsigned char *utf, int len, /* collect coded char of length l */ if (l == 2) { - iutf = 
*utf++ << 8; - iutf |= *utf++; + b3 = *utf++; + b4 = *utf++; } else if (l == 3) { - iutf = *utf++ << 16; - iutf |= *utf++ << 8; - iutf |= *utf++; + b2 = *utf++; + b3 = *utf++; + b4 = *utf++; } else if (l == 4) { - iutf = *utf++ << 24; - iutf |= *utf++ << 16; - iutf |= *utf++ << 8; - iutf |= *utf++; + b1 = *utf++; + b2 = *utf++; + b3 = *utf++; + b4 = *utf++; } else { elog(ERROR, "unsupported character length %d", l); iutf = 0; /* keep compiler quiet */ } + iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4); /* First, try with combined map if possible */ if (cmap && len > l) @@ -613,21 +598,9 @@ UtfToLocal(const unsigned char *utf, int len, } /* Now check ordinary map */ - if (mapsize > 0) - { - p = bsearch(&iutf, map, mapsize, - sizeof(pg_utf_to_local), compare1); - - if (p) - { - iso = store_coded_char(iso, p->code); - continue; - } - } - else if (map) + if (map) { - uint32 converted = pg_mb_radix_conv((pg_mb_radix_tree *)map, - iutf); + uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); if (converted) { iso = store_coded_char(iso, converted); @@ -667,7 +640,6 @@ UtfToLocal(const unsigned char *utf, int len, * utf: pointer to the output area (must be large enough!) 
(output string will be null-terminated) * map: conversion map for single characters - * mapsize: number of entries in the conversion map * cmap: conversion map for combined characters * (optional, pass NULL if none) * cmapsize: number of entries in the conversion map for combined characters @@ -685,14 +657,13 @@ UtfToLocal(const unsigned char *utf, int len, void LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, - const void *map, int mapsize, - const void *cmap, int cmapsize, + const pg_mb_radix_tree *map, + const pg_local_to_utf *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding) { uint32 iiso; int l; - const pg_local_to_utf *p; const pg_local_to_utf_combined *cp; if (!PG_VALID_ENCODING(encoding)) @@ -702,6 +673,11 @@ LocalToUtf(const unsigned char *iso, int len, for (; len > 0; len -= l) { + unsigned char b1 = 0; + unsigned char b2 = 0; + unsigned char b3 = 0; + unsigned char b4 = 0; + /* "break" cases all represent errors */ if (*iso == '\0') break; @@ -720,40 +696,39 @@ LocalToUtf(const unsigned char *iso, int len, /* collect coded char of length l */ if (l == 1) - iiso = *iso++; + b4 = *iso++; else if (l == 2) { - iiso = *iso++ << 8; - iiso |= *iso++; + b3 = *iso++; + b4 = *iso++; } else if (l == 3) { - iiso = *iso++ << 16; - iiso |= *iso++ << 8; - iiso |= *iso++; + b2 = *iso++; + b3 = *iso++; + b4 = *iso++; } else if (l == 4) { - iiso = *iso++ << 24; - iiso |= *iso++ << 16; - iiso |= *iso++ << 8; - iiso |= *iso++; + b1 = *iso++; + b2 = *iso++; + b3 = *iso++; + b4 = *iso++; } else { elog(ERROR, "unsupported character length %d", l); iiso = 0; /* keep compiler quiet */ } + iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4); - if (mapsize > 0) + if (map) { - /* First check ordinary map */ - p = bsearch(&iiso, map, mapsize, - sizeof(pg_local_to_utf), compare2); + uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); - if (p) + if (converted) { - utf = store_coded_char(utf, p->utf); + utf = store_coded_char(utf, converted); 
continue; } @@ -771,16 +746,6 @@ LocalToUtf(const unsigned char *iso, int len, } } } - else if (map) - { - uint32 converted = pg_mb_radix_conv((pg_mb_radix_tree*)map, iiso); - - if (converted) - { - utf = store_coded_char(utf, converted); - continue; - } - } /* if there's a conversion function, try that */ if (conv_func) diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c index 2857228..6ca7191 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c @@ -42,7 +42,7 @@ big5_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_UTF8); LocalToUtf(src, len, dest, - &big5_to_unicode_tree, 0, + &big5_to_unicode_tree, NULL, 0, NULL, PG_BIG5); @@ -60,7 +60,7 @@ utf8_to_big5(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_BIG5); UtfToLocal(src, len, dest, - &big5_from_unicode_tree, 0, + &big5_from_unicode_tree, NULL, 0, NULL, PG_BIG5); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c index f61f86b..6580243 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c @@ -48,7 +48,7 @@ utf8_to_koi8r(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_KOI8R); UtfToLocal(src, len, dest, - &koi8r_from_unicode_tree, 0, + &koi8r_from_unicode_tree, NULL, 0, NULL, PG_KOI8R); @@ -66,7 +66,7 @@ koi8r_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8R, PG_UTF8); LocalToUtf(src, len, dest, - &koi8r_to_unicode_tree, 0, + &koi8r_to_unicode_tree, NULL, 0, NULL, PG_KOI8R); @@ -84,7 +84,7 @@ utf8_to_koi8u(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_KOI8U); UtfToLocal(src, len, dest, - 
&koi8u_from_unicode_tree, 0, + &koi8u_from_unicode_tree, NULL, 0, NULL, PG_KOI8U); @@ -102,7 +102,7 @@ koi8u_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8U, PG_UTF8); LocalToUtf(src, len, dest, - &koi8u_to_unicode_tree, 0, + &koi8u_to_unicode_tree, NULL, 0, NULL, PG_KOI8U); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c index 1ad3d03..8676618 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c @@ -44,7 +44,7 @@ euc_jis_2004_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JIS_2004, PG_UTF8); LocalToUtf(src, len, dest, - &euc_jis_2004_to_unicode_tree, 0, + &euc_jis_2004_to_unicode_tree, LUmapEUC_JIS_2004_combined, lengthof(LUmapEUC_JIS_2004_combined), NULL, PG_EUC_JIS_2004); @@ -62,7 +62,7 @@ utf8_to_euc_jis_2004(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_JIS_2004); UtfToLocal(src, len, dest, - &euc_jis_2004_from_unicode_tree, 0, + &euc_jis_2004_from_unicode_tree, ULmapEUC_JIS_2004_combined, lengthof(ULmapEUC_JIS_2004_combined), NULL, PG_EUC_JIS_2004); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c index be1a036..1dea26e 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c @@ -42,7 +42,7 @@ euc_cn_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_CN, PG_UTF8); LocalToUtf(src, len, dest, - &euc_cn_to_unicode_tree, 0, + &euc_cn_to_unicode_tree, NULL, 0, NULL, PG_EUC_CN); @@ -60,7 +60,7 @@ utf8_to_euc_cn(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_CN); UtfToLocal(src, len, dest, - &euc_cn_from_unicode_tree, 0, + 
&euc_cn_from_unicode_tree, NULL, 0, NULL, PG_EUC_CN); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c index cc46003..0f65f44 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c @@ -42,7 +42,7 @@ euc_jp_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JP, PG_UTF8); LocalToUtf(src, len, dest, - &euc_jp_to_unicode_tree, 0, + &euc_jp_to_unicode_tree, NULL, 0, NULL, PG_EUC_JP); @@ -60,7 +60,7 @@ utf8_to_euc_jp(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_JP); UtfToLocal(src, len, dest, - &euc_jp_from_unicode_tree, 0, + &euc_jp_from_unicode_tree, NULL, 0, NULL, PG_EUC_JP); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c index 5e83522..d7d2d78 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c @@ -42,7 +42,7 @@ euc_kr_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_KR, PG_UTF8); LocalToUtf(src, len, dest, - &euc_kr_to_unicode_tree, 0, + &euc_kr_to_unicode_tree, NULL, 0, NULL, PG_EUC_KR); @@ -60,7 +60,7 @@ utf8_to_euc_kr(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_KR); UtfToLocal(src, len, dest, - &euc_kr_from_unicode_tree, 0, + &euc_kr_from_unicode_tree, NULL, 0, NULL, PG_EUC_KR); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c index dd3d791..94d9bee 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c @@ -42,7 +42,7 @@ 
euc_tw_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_UTF8); LocalToUtf(src, len, dest, - &euc_tw_to_unicode_tree, 0, + &euc_tw_to_unicode_tree, NULL, 0, NULL, PG_EUC_TW); @@ -60,7 +60,7 @@ utf8_to_euc_tw(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_TW); UtfToLocal(src, len, dest, - &euc_tw_from_unicode_tree, 0, + &euc_tw_from_unicode_tree, NULL, 0, NULL, PG_EUC_TW); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c index 3e3c74d..0dca5eb 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c @@ -197,7 +197,7 @@ gb18030_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8); LocalToUtf(src, len, dest, - &gb18030_to_unicode_tree, 0, + &gb18030_to_unicode_tree, NULL, 0, conv_18030_to_utf8, PG_GB18030); @@ -215,7 +215,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030); UtfToLocal(src, len, dest, - &gb18030_from_unicode_tree, 0, + &gb18030_from_unicode_tree, NULL, 0, conv_utf8_to_18030, PG_GB18030); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c index 872f353..06234de 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c @@ -42,7 +42,7 @@ gbk_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_GBK, PG_UTF8); LocalToUtf(src, len, dest, - &gbk_to_unicode_tree, 0, + &gbk_to_unicode_tree, NULL, 0, NULL, PG_GBK); @@ -60,7 +60,7 @@ utf8_to_gbk(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GBK); UtfToLocal(src, len, dest, - &gbk_from_unicode_tree, 0, + &gbk_from_unicode_tree, NULL, 0, NULL, PG_GBK); diff --git 
a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c index 2361528..98cd3c7 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c @@ -109,7 +109,7 @@ iso8859_to_utf8(PG_FUNCTION_ARGS) if (encoding == maps[i].encoding) { LocalToUtf(src, len, dest, - maps[i].map1, 0, + maps[i].map1, NULL, 0, NULL, encoding); @@ -141,7 +141,7 @@ utf8_to_iso8859(PG_FUNCTION_ARGS) if (encoding == maps[i].encoding) { UtfToLocal(src, len, dest, - maps[i].map2, 0, + maps[i].map2, NULL, 0, NULL, encoding); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c index 2d8ca18..4036fd1 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c @@ -42,7 +42,7 @@ johab_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_JOHAB, PG_UTF8); LocalToUtf(src, len, dest, - &johab_to_unicode_tree, 0, + &johab_to_unicode_tree, NULL, 0, NULL, PG_JOHAB); @@ -60,7 +60,7 @@ utf8_to_johab(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_JOHAB); UtfToLocal(src, len, dest, - &johab_from_unicode_tree, 0, + &johab_from_unicode_tree, NULL, 0, NULL, PG_JOHAB); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c index 0a4802d..2a4245a 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c @@ -42,7 +42,7 @@ sjis_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_SJIS, PG_UTF8); LocalToUtf(src, len, dest, - &sjis_to_unicode_tree, 0, + &sjis_to_unicode_tree, NULL, 0, NULL, PG_SJIS); @@ -60,7 
+60,7 @@ utf8_to_sjis(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_SJIS); UtfToLocal(src, len, dest, - &sjis_from_unicode_tree, 0, + &sjis_from_unicode_tree, NULL, 0, NULL, PG_SJIS); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c b/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c index 7160741..c83c5da 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c @@ -44,7 +44,7 @@ shift_jis_2004_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_SHIFT_JIS_2004, PG_UTF8); LocalToUtf(src, len, dest, - &shift_jis_2004_to_unicode_tree, 0, + &shift_jis_2004_to_unicode_tree, LUmapSHIFT_JIS_2004_combined, lengthof(LUmapSHIFT_JIS_2004_combined), NULL, PG_SHIFT_JIS_2004); @@ -62,7 +62,7 @@ utf8_to_shift_jis_2004(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_SHIFT_JIS_2004); UtfToLocal(src, len, dest, - &shift_jis_2004_from_unicode_tree, 0, + &shift_jis_2004_from_unicode_tree, ULmapSHIFT_JIS_2004_combined, lengthof(ULmapSHIFT_JIS_2004_combined), NULL, PG_SHIFT_JIS_2004); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c index fb66a8a..d06a19b 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c @@ -42,7 +42,7 @@ uhc_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UHC, PG_UTF8); LocalToUtf(src, len, dest, - &uhc_to_unicode_tree, 0, + &uhc_to_unicode_tree, NULL, 0, NULL, PG_UHC); @@ -60,7 +60,7 @@ utf8_to_uhc(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_UHC); UtfToLocal(src, len, dest, - &uhc_from_unicode_tree, 0, + &uhc_from_unicode_tree, NULL, 0, NULL, PG_UHC); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c 
b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c index d213927..9f55307 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c @@ -90,7 +90,7 @@ win_to_utf8(PG_FUNCTION_ARGS) if (encoding == maps[i].encoding) { LocalToUtf(src, len, dest, - maps[i].map1, 0, + maps[i].map1, NULL, 0, NULL, encoding); @@ -122,7 +122,7 @@ utf8_to_win(PG_FUNCTION_ARGS) if (encoding == maps[i].encoding) { UtfToLocal(src, len, dest, - maps[i].map2, 0, + maps[i].map2, NULL, 0, NULL, encoding); diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 38edbff..7efa600 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -383,26 +383,56 @@ typedef struct uint32 code; /* local code */ } pg_utf_to_local; -/* - * radix tree structer for faster conversion - */ typedef struct pg_mb_radix_index { - uint8 lower, upper; /* index range of b2idx */ - uint32 idx[FLEXIBLE_ARRAY_MEMBER]; /* index body */ + uint8 lower; + uint8 upper; /* index range of b2idx */ } pg_mb_radix_index; +/* + * Radix tree structs for faster conversion + */ typedef struct { - const uint8 chars_lower, chars_upper; /* index range of chars* */ - const bool single_byte; /* true if the first segment is - * for single byte characters*/ - const uint16 *chars16; /* 16 bit character table */ + /* + * Array containing all the values. Only one of chars16 or chars32 is + * used, depending on how wide the values we need to represent are. 
+ */ + const uint16 *chars16; /* 16 bit */ const uint32 *chars32; /* 32 bit character table */ - const pg_mb_radix_index *b2idx; - const pg_mb_radix_index *b3idx[2]; - const pg_mb_radix_index *b4idx[3]; + /* Radix tree for 1-byte inputs */ + uint32 b1root; /* offset of table in the chars[16|32] array */ + uint8 b1_lower; /* min allowed value for a single byte input */ + uint8 b1_upper; /* max allowed value for a single byte input */ + + /* Radix tree for 2-byte inputs */ + uint32 b2root; /* offset of 1st byte's table */ + uint8 b2_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b2_1_upper; + uint8 b2_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b2_2_upper; + + /* Radix tree for 3-byte inputs */ + uint32 b3root; /* offset of 1st byte's table */ + uint8 b3_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b3_1_upper; + uint8 b3_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b3_2_upper; + uint8 b3_3_lower; /* min/max allowed value for 3rd input byte */ + uint8 b3_3_upper; + + /* Radix tree for 4-byte inputs */ + uint32 b4root; /* offset of 1st byte's table */ + uint8 b4_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b4_1_upper; + uint8 b4_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b4_2_upper; + uint8 b4_3_lower; /* min/max allowed value for 3rd input byte */ + uint8 b4_3_upper; + uint8 b4_4_lower; /* min/max allowed value for 4th input byte */ + uint8 b4_4_upper; + } pg_mb_radix_tree; /* @@ -532,14 +562,14 @@ extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); extern void UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, - const void *map, int mapsize, - const void *combined_map, int cmapsize, + const pg_mb_radix_tree *map, + const pg_utf_to_local *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding); extern void LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, - const void *map, int mapsize, - 
const void *combined_cmap, int cmapsize, + const pg_mb_radix_tree *map, + const pg_local_to_utf *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding); @@ -573,7 +603,6 @@ extern void latin2mic_with_table(const unsigned char *l, unsigned char *p, extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab); -extern const uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, const uint32 c); extern bool pg_utf8_islegal(const unsigned char *source, int length);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers