Thanks to Heikki having pushed the first two patches and a part of the third, only one patch remains now.
# Sorry for not separating the KOI8 stuff. At Tue, 31 Jan 2017 19:06:09 +0900 (Tokyo Standard Time), Kyotaro HORIGUCHI <horiguchi.kyot...@lab.ntt.co.jp> wrote in <20170131.190609.254672218.horiguchi.kyot...@lab.ntt.co.jp> > > Thanks for the new version, I'll look at it once I am done with the > > cleanup of the current CF. For now I have moved it to the CF 2017-03. > > Agreed. Thank you. Attached is the latest version on the current master (555494d). Note: since this patch was created with git diff --irreversible-delete, the three files mb/Unicode/*.(txt|xml) that are to be deleted are left alone. regards, -- Kyotaro Horiguchi NTT Open Source Software Center
>From 68d75100b7e8aaab7706ea780a1e23557c676c87 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp> Date: Tue, 10 Jan 2017 20:02:00 +0900 Subject: [PATCH] Use radix tree for character conversion This patch adds multibyte character converter based using radix tree based on Heikki's rework of my previous patch. --- src/backend/utils/mb/Makefile | 2 + src/backend/utils/mb/Unicode/.gitignore | 11 + src/backend/utils/mb/Unicode/Makefile | 72 +- src/backend/utils/mb/Unicode/UCS_to_BIG5.pl | 9 +- src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl | 9 +- .../utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl | 19 +- src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl | 6 +- src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl | 13 +- src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl | 9 +- src/backend/utils/mb/Unicode/UCS_to_GB18030.pl | 9 +- src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl | 11 +- .../utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl | 14 +- src/backend/utils/mb/Unicode/UCS_to_SJIS.pl | 29 +- src/backend/utils/mb/Unicode/UCS_to_UHC.pl | 11 +- src/backend/utils/mb/Unicode/UCS_to_most.pl | 5 +- src/backend/utils/mb/Unicode/convutils.pm | 679 +- src/backend/utils/mb/Unicode/euc-jis-2004-std.txt | 11549 ------- src/backend/utils/mb/Unicode/gb-18030-2000.xml | 30916 ------------------- src/backend/utils/mb/Unicode/make_mapchecker.pl | 78 + src/backend/utils/mb/Unicode/map_checker.c | 94 + .../utils/mb/Unicode/sjis-0213-2004-std.txt | 11549 ------- src/backend/utils/mb/char_converter.c | 116 + src/backend/utils/mb/conv.c | 137 +- .../conversion_procs/utf8_and_big5/utf8_and_big5.c | 8 +- .../utf8_and_cyrillic/utf8_and_cyrillic.c | 16 +- .../utf8_and_euc2004/utf8_and_euc2004.c | 8 +- .../utf8_and_euc_cn/utf8_and_euc_cn.c | 8 +- .../utf8_and_euc_jp/utf8_and_euc_jp.c | 8 +- .../utf8_and_euc_kr/utf8_and_euc_kr.c | 8 +- .../utf8_and_euc_tw/utf8_and_euc_tw.c | 8 +- .../utf8_and_gb18030/utf8_and_gb18030.c | 8 +- .../conversion_procs/utf8_and_gbk/utf8_and_gbk.c | 8 +- 
.../utf8_and_iso8859/utf8_and_iso8859.c | 127 +- .../utf8_and_johab/utf8_and_johab.c | 8 +- .../conversion_procs/utf8_and_sjis/utf8_and_sjis.c | 8 +- .../utf8_and_sjis2004/utf8_and_sjis2004.c | 8 +- .../conversion_procs/utf8_and_uhc/utf8_and_uhc.c | 8 +- .../conversion_procs/utf8_and_win/utf8_and_win.c | 98 +- src/include/mb/pg_wchar.h | 56 +- 39 files changed, 1355 insertions(+), 54385 deletions(-) create mode 100644 src/backend/utils/mb/Unicode/.gitignore delete mode 100644 src/backend/utils/mb/Unicode/euc-jis-2004-std.txt delete mode 100644 src/backend/utils/mb/Unicode/gb-18030-2000.xml create mode 100755 src/backend/utils/mb/Unicode/make_mapchecker.pl create mode 100644 src/backend/utils/mb/Unicode/map_checker.c delete mode 100644 src/backend/utils/mb/Unicode/sjis-0213-2004-std.txt create mode 100644 src/backend/utils/mb/char_converter.c diff --git a/src/backend/utils/mb/Makefile b/src/backend/utils/mb/Makefile index 89bec21..d48e729 100644 --- a/src/backend/utils/mb/Makefile +++ b/src/backend/utils/mb/Makefile @@ -14,6 +14,8 @@ include $(top_builddir)/src/Makefile.global OBJS = encnames.o conv.o mbutils.o wchar.o wstrcmp.o wstrncmp.o +conv.o: conv.c char_converter.c + include $(top_srcdir)/src/backend/common.mk clean distclean maintainer-clean: diff --git a/src/backend/utils/mb/Unicode/.gitignore b/src/backend/utils/mb/Unicode/.gitignore new file mode 100644 index 0000000..3908cc3 --- /dev/null +++ b/src/backend/utils/mb/Unicode/.gitignore @@ -0,0 +1,11 @@ +# ignore backup files of editors +/*[~#] + +# ignore authority files +/*.TXT +/*.txt +/*.xml + +# ignore generated files +/map_checker +/map_checker.h diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile index 10708b3..6706157 100644 --- a/src/backend/utils/mb/Unicode/Makefile +++ b/src/backend/utils/mb/Unicode/Makefile @@ -52,12 +52,17 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \ big5_to_utf8.map utf8_to_big5.map \ johab_to_utf8.map utf8_to_johab.map \ 
uhc_to_utf8.map utf8_to_uhc.map \ - euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \ - utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \ - shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \ - utf8_to_shift_jis_2004.map utf8_to_shift_jis_2004_combined.map + euc_jis_2004_to_utf8.map utf8_to_euc_jis_2004.map \ + shift_jis_2004_to_utf8.map utf8_to_shift_jis_2004.map -MAPS = $(GENERICMAPS) $(SPECIALMAPS) +COMBINEDMAPS = euc_jis_2004_to_utf8_combined.map \ + utf8_to_euc_jis_2004_combined.map \ + shift_jis_2004_to_utf8_combined.map \ + utf8_to_shift_jis_2004_combined.map + +RADIXGENERICMAPS = $(subst .map,_radix.map,$(GENERICMAPS)) +RADIXMAPS = $(subst .map,_radix.map,$(GENERICMAPS) $(SPECIALMAPS)) +MAPS = $(GENERICMAPS) $(SPECIALMAPS) $(COMBINEDMAPS) ISO8859TEXTS = 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT \ 8859-6.TXT 8859-7.TXT 8859-8.TXT 8859-9.TXT \ @@ -69,53 +74,76 @@ WINTEXTS = CP866.TXT CP874.TXT CP936.TXT \ CP1252.TXT CP1253.TXT CP1254.TXT CP1255.TXT \ CP1256.TXT CP1257.TXT CP1258.TXT +SPECIALTEXTS = BIG5.TXT CNS11643.TXT \ + CP932.TXT CP950.TXT \ + JIS0201.TXT JIS0208.TXT JIS0212.TXT SHIFTJIS.TXT \ + JOHAB.TXT KSX1001.TXT windows-949-2000.xml \ + euc-jis-2004-std.txt sjis-0213-2004-std.txt \ + gb-18030-2000.xml + GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \ KOI8-R.TXT KOI8-U.TXT -all: $(MAPS) +TEXTS = $(GENERICTEXTS) $(WINTEXTS) $(ISO8859TEXTS) $(SPECIALTEXTS) + +OBJS = map_checker.o + +BINS = map_checker + +all: $(MAPS) $(RADIXMAPS) $(BINS) + +map_checker.h: make_mapchecker.pl $(MAPS) $(RADIXMAPS) + $(PERL) $< + +map_checker.o: map_checker.c map_checker.h ../char_converter.c + +map_checker: map_checker.o -$(GENERICMAPS): UCS_to_most.pl $(GENERICTEXTS) +$(GENERICMAPS) $(RADIXGENERICMAPS): UCS_to_most.pl $(GENERICTEXTS) $(PERL) $< -johab_to_utf8.map utf8_to_johab.map: UCS_to_JOHAB.pl JOHAB.TXT +johab_to_utf8.map utf8_to_johab.map johab_to_utf8_radix.map utf8_to_johab_radix.map: UCS_to_JOHAB.pl JOHAB.TXT $(PERL) $< 
-uhc_to_utf8.map utf8_to_uhc.map: UCS_to_UHC.pl windows-949-2000.xml +uhc_to_utf8.map utf8_to_uhc.map uhc_to_utf8_radix.map utf8_to_uhc_radix.map: UCS_to_UHC.pl windows-949-2000.xml $(PERL) $< -euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT +euc_jp_to_utf8.map utf8_to_euc_jp.map euc_jp_to_utf8_radix.map utf8_to_euc_jp_radix.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT $(PERL) $< -euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl gb-18030-2000.xml +euc_cn_to_utf8.map utf8_to_euc_cn.map euc_cn_to_utf8_radix.map utf8_to_euc_cn_radix.map: UCS_to_EUC_CN.pl gb-18030-2000.xml $(PERL) $< -euc_kr_to_utf8.map utf8_to_euc_kr.map: UCS_to_EUC_KR.pl KSX1001.TXT +euc_kr_to_utf8.map utf8_to_euc_kr.map euc_kr_to_utf8_radix.map utf8_to_euc_kr_radix.map: UCS_to_EUC_KR.pl KSX1001.TXT $(PERL) $< -euc_tw_to_utf8.map utf8_to_euc_tw.map: UCS_to_EUC_TW.pl CNS11643.TXT +euc_tw_to_utf8.map utf8_to_euc_tw.map euc_tw_to_utf8_radix.map utf8_to_euc_tw_radix.map: UCS_to_EUC_TW.pl CNS11643.TXT $(PERL) $< -sjis_to_utf8.map utf8_to_sjis.map: UCS_to_SJIS.pl CP932.TXT +sjis_to_utf8.map utf8_to_sjis.map sjis_to_utf8_radix.map utf8_to_sjis_radix.map: UCS_to_SJIS.pl CP932.TXT $(PERL) $< -gb18030_to_utf8.map utf8_to_gb18030.map: UCS_to_GB18030.pl gb-18030-2000.xml +gb18030_to_utf8.map utf8_to_gb18030.map gb18030_to_utf8_radix.map utf8_to_gb18030_radix.map: UCS_to_GB18030.pl gb-18030-2000.xml $(PERL) $< -big5_to_utf8.map utf8_to_big5.map: UCS_to_BIG5.pl BIG5.TXT CP950.TXT +big5_to_utf8.map utf8_to_big5.map big5_to_utf8_radix.map utf8_to_big5_radix.map: UCS_to_BIG5.pl BIG5.TXT CP950.TXT $(PERL) $< -euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map: UCS_to_EUC_JIS_2004.pl euc-jis-2004-std.txt +euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_radix.map euc_jis_2004_to_utf8_combined.map utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_radix.map utf8_to_euc_jis_2004_combined.map: UCS_to_EUC_JIS_2004.pl 
euc-jis-2004-std.txt $(PERL) $< -shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map utf8_to_shift_jis_2004.map utf8_to_shift_jis_2004_combined.map: UCS_to_SHIFT_JIS_2004.pl sjis-0213-2004-std.txt +shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_radix.map shift_jis_2004_to_utf8_combined.map utf8_to_shift_jis_2004.map utf8_to_shift_jis_2004_radix.map utf8_to_shift_jis_2004_combined.map: UCS_to_SHIFT_JIS_2004.pl sjis-0213-2004-std.txt $(PERL) $< -distclean: clean - rm -f $(TEXTS) +distclean: + rm -f $(TEXTS) $(GENERICMAPS) $(SPECIALMAPS) $(OBJS) $(BINS) map_checker.h -maintainer-clean: distclean - rm -f $(MAPS) +# maintainer-clean intentionally leaves $(TEXTS) +maintainer-clean: + rm -f $(MAPS) $(RADIXMAPS) $(GENERICMAPS) $(SPECIALMAPS) $(OBJS) $(BINS) map_checker.h +mapcheck: $(MAPS) $(RADIXMAPS) map_checker + ./map_checker DOWNLOAD = wget -O $@ --no-use-server-timestamps #DOWNLOAD = curl -o $@ diff --git a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl index 20f6c70..65c6955 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl @@ -27,6 +27,8 @@ use strict; require convutils; +my $this_script = $0; + # Load BIG5.TXT my $all = &read_source("BIG5.TXT"); @@ -47,7 +49,9 @@ foreach my $i (@$cp950txt) { push @$all, {code => $code, ucs => $ucs, comment => $i->{comment}, - direction => "both"}; + direction => "both", + f => $i->{f}, + l => $i->{l} }; } } @@ -65,4 +69,5 @@ foreach my $i (@$all) { } # Output -print_tables("BIG5", $all); +print_tables($this_script, "BIG5", $all, 1); +print_radix_trees($this_script, "BIG5", $all); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl index 4f4375e..600d3ce 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl @@ -16,6 +16,8 @@ use strict; require convutils; +my $this_script = $0; + # Read the input my $in_file = 
"gb-18030-2000.xml"; @@ -68,9 +70,12 @@ while (<$in>) push @mapping, { ucs => $ucs, code => $code, - direction => 'both' + direction => 'both', + f => $in_file, + l => $. }; } close($in); -print_tables("EUC_CN", \@mapping); +print_tables($this_script, "EUC_CN", \@mapping, 1); +print_radix_trees($this_script, "EUC_CN", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl index cbe2a1e..6066139 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl @@ -10,6 +10,8 @@ use strict; require convutils; +my $this_script = $0; + # first generate UTF-8 --> EUC_JIS_2004 table my $in_file = "euc-jis-2004-std.txt"; @@ -33,8 +35,10 @@ while (my $line = <$in>) ucs => $ucs1, ucs_second => $ucs2, code => $code, - comment => $rest }; - next; + comment => $rest, + f => $in_file, + l => $. + }; } elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) { @@ -45,9 +49,16 @@ while (my $line = <$in>) next if ($code < 0x80 && $ucs < 0x80); - push @all, { direction => 'both', ucs => $ucs, code => $code, comment => $rest }; + push @all, { direction => 'both', + ucs => $ucs, + code => $code, + comment => $rest, + f => $in_file, + l => $. 
+ }; } } close($in); -print_tables("EUC_JIS_2004", \@all, 1); +print_tables($this_script, "EUC_JIS_2004", \@all, 1); +print_radix_trees($this_script, "EUC_JIS_2004", \@all); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl index 926d2d8..681b0d9 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl @@ -14,6 +14,8 @@ use strict; require convutils; +my $this_script = $0; + # Load JIS0212.TXT my $jis0212 = &read_source("JIS0212.TXT"); @@ -191,7 +193,9 @@ push @mapping, ( {direction => 'to_unicode', ucs => 0x3231, code => 0x8ff4ab, comment => '# PARENTHESIZED IDEOGRAPH STOCK'} ); -print_tables("EUC_JP", \@mapping); +print_tables($this_script, "EUC_JP", \@mapping, 1); +print_radix_trees($this_script, "EUC_JP", \@mapping); + ####################################################################### # sjis2jis ; SJIS => JIS conversion diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl index 228fc4d..a032a27 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl @@ -19,6 +19,8 @@ use strict; require convutils; +my $this_script = $0; + # Load the source file. 
my $mapping = &read_source("KSX1001.TXT"); @@ -29,10 +31,11 @@ foreach my $i (@$mapping) } # Some extra characters that are not in KSX1001.TXT -push @$mapping, ( - {direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN'}, - {direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN'}, - {direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U'} +push @$mapping,( + {direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN', f => $this_script, l => __LINE__}, + {direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN', f => $this_script, l => __LINE__ }, + {direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U', f => $this_script, l => __LINE__ } ); -print_tables("EUC_KR", $mapping); +print_tables($this_script, "EUC_KR", $mapping, 1); +print_radix_trees($this_script, "EUC_KR", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl index 296ed2b..0b73218 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl @@ -20,6 +20,8 @@ use strict; require convutils; +my $this_script = $0; + my $mapping = &read_source("CNS11643.TXT"); my @extras; @@ -54,11 +56,14 @@ foreach my $i (@$mapping) ucs => $i->{ucs}, code => ($i->{code} + 0x8ea10000), rest => $i->{rest}, - direction => 'to_unicode' + direction => 'to_unicode', + f => $i->{f}, + l => $i->{l} }; } } push @$mapping, @extras; -print_tables("EUC_TW", $mapping); +print_tables($this_script, "EUC_TW", $mapping, 1); +print_radix_trees($this_script, "EUC_TW", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl index f754611..3c57fd6 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl @@ -16,6 +16,8 @@ use strict; require 
convutils; +my $this_script = $0; + # Read the input my $in_file = "gb-18030-2000.xml"; @@ -36,10 +38,13 @@ while (<$in>) push @mapping, { ucs => $ucs, code => $code, - direction => 'both' + direction => 'both', + f => $in_file, + l => $. }; } } close($in); -print_tables("GB18030", \@mapping); +print_tables($this_script, "GB18030", \@mapping, 1); +print_radix_trees($this_script, "GB18030", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl index b84d589..b3447ff 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl @@ -18,15 +18,18 @@ use strict; require convutils; +my $this_script = $0; + # Load the source file. my $mapping = &read_source("JOHAB.TXT"); # Some extra characters that are not in JOHAB.TXT push @$mapping, ( - {direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN'}, - {direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN'}, - {direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U'} + {direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN', f => $this_script, l => __LINE__ }, + {direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN', f => $this_script, l => __LINE__ }, + {direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U', f => $this_script, l => __LINE__ } ); -print_tables("JOHAB", $mapping); +print_tables($this_script, "JOHAB", $mapping, 1); +print_radix_trees($this_script, "JOHAB", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl index 67b6ef6..a6d5483 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl @@ -12,6 +12,8 @@ require convutils; # first generate UTF-8 --> SHIFT_JIS_2004 table +my $this_script 
= $0; + my $in_file = "sjis-0213-2004-std.txt"; open(my $in, '<', $in_file) || die("cannot open $in_file"); @@ -34,9 +36,10 @@ while (my $line = <$in>) ucs => $ucs1, ucs_second => $ucs2, comment => $rest, - direction => 'both' + direction => 'both', + f => $in_file, + l => $. }; - next; } elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) { @@ -67,10 +70,13 @@ while (my $line = <$in>) code => $code, ucs => $ucs, comment => $rest, - direction => $direction + direction => $direction, + f => $in_file, + l => $. }; } } close($in); -print_tables("SHIFT_JIS_2004", \@mapping, 1); +print_tables($this_script, "SHIFT_JIS_2004", \@mapping, 1); +print_radix_trees($this_script, "SHIFT_JIS_2004", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl index 74e206f..0dd9798 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl @@ -13,7 +13,9 @@ use strict; require convutils; -my $charset = read_source("CP932.TXT"); +my $this_script = $0; + +my $mapping = read_source("CP932.TXT"); # Drop these SJIS codes from the source for UTF8=>SJIS conversion my @reject_sjis =( @@ -22,7 +24,7 @@ my @reject_sjis =( 0x879a..0x879c ); -foreach my $i (@$charset) +foreach my $i (@$mapping) { my $code = $i->{code}; my $ucs = $i->{ucs}; @@ -34,15 +36,16 @@ foreach my $i (@$charset) } # Add these UTF8->SJIS pairs to the table. 
-push @$charset, ( - {direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN'}, - {direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN'}, - {direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN'}, - {direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN'}, - {direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE'}, - {direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE'}, - {direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN'}, - {direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH'} -); +push @$mapping, ( + {direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN', f => $this_script, l => __LINE__ }, + {direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH', f => $this_script, l => __LINE__ } + ); -print_tables("SJIS", $charset); +print_tables($this_script, "SJIS", $mapping, 1); +print_radix_trees($this_script, "SJIS", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl 
b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl index a65c537..d1297b8 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl @@ -16,6 +16,8 @@ use strict; require convutils; +my $this_script = $0; + # Read the input my $in_file = "windows-949-2000.xml"; @@ -39,13 +41,16 @@ while (<$in>) push @mapping, { ucs => $ucs, code => $code, - direction => 'both' + direction => 'both', + f => $in_file, + l => $. }; } } close($in); # One extra character that's not in the source file. -push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U' }; +push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U', f => $this_script, l => __LINE__ }; -print_tables("UHC", \@mapping); +print_tables($this_script, "UHC", \@mapping, 1); +print_radix_trees($this_script, "UHC", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_most.pl b/src/backend/utils/mb/Unicode/UCS_to_most.pl index acc03e3..799a0a1 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_most.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_most.pl @@ -18,6 +18,8 @@ use strict; require convutils; +my $this_script = $0; + my %filename = ( 'WIN866' => 'CP866.TXT', 'WIN874' => 'CP874.TXT', @@ -54,5 +56,6 @@ foreach my $charset (@charsets) { my $mapping = &read_source($filename{$charset}); - print_tables($charset, $mapping); + print_tables($this_script, $charset, $mapping, 1); + print_radix_trees($this_script, $charset, $mapping); } diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm index 0ae79a2..7808c32 100644 --- a/src/backend/utils/mb/Unicode/convutils.pm +++ b/src/backend/utils/mb/Unicode/convutils.pm @@ -67,7 +67,9 @@ sub read_source code => hex($1), ucs => hex($2), comment => $4, - direction => "both" + direction => "both", + f => $fname, + l => $. }; # Ignore pure ASCII mappings. 
PostgreSQL character conversion code @@ -85,6 +87,7 @@ sub read_source # print_tables : output mapping tables # # Arguments: +# this_script - the name of the *caller script* of this feature # charset - string name of the character set. # table - mapping table (see format below) # verbose - if 1, output comment on each line, @@ -106,7 +109,7 @@ sub read_source # sub print_tables { - my ($charset, $table, $verbose) = @_; + my ($this_script, $charset, $table, $verbose) = @_; # Build an array with only the to-UTF8 direction mappings my @to_unicode; @@ -149,76 +152,96 @@ sub print_tables } } - print_to_utf8_map($charset, \@to_unicode, $verbose); - print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0); - print_from_utf8_map($charset, \@from_unicode, $verbose); - print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0); + print_to_utf8_map($this_script, $charset, \@to_unicode, $verbose); + if (scalar @to_unicode_combined > 0) + { + print_to_utf8_combined_map($this_script, $charset, + \@to_unicode_combined, $verbose); + } + print_from_utf8_map($this_script, $charset, \@from_unicode, $verbose); + if (scalar @from_unicode_combined > 0) + { + print_from_utf8_combined_map($this_script, $charset, + \@from_unicode_combined, $verbose); + } } sub print_from_utf8_map { - my ($charset, $table, $verbose) = @_; + my ($this_script, $charset, $table, $verbose) = @_; my $last_comment = ""; my $fname = lc("utf8_to_${charset}.map"); print "- Writing UTF8=>${charset} conversion table: $fname\n"; open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". - "static const pg_utf_to_local ULmap${charset}[ %d ] = {", - scalar(@$table)); + printf $out "/* src/backend/utils/mb/Unicode/$fname */\n" + . "/* This file is generated by $this_script */\n\n" + . 
"static const pg_utf_to_local ULmap${charset}[ %d ] = {", + scalar(@$table); my $first = 1; foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table) { print($out ",") if (!$first); $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); printf($out "\n {0x%04x, 0x%04x}", $i->{utf8}, $i->{code}); if ($verbose >= 2) { - $last_comment = "$i->{f}:$i->{l} $i->{comment}"; + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); } - else + elsif ($verbose >= 1) { $last_comment = $i->{comment}; } } - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; close($out); } sub print_from_utf8_combined_map { - my ($charset, $table, $verbose) = @_; + my ($this_script, $charset, $table, $verbose) = @_; my $last_comment = ""; my $fname = lc("utf8_to_${charset}_combined.map"); print "- Writing UTF8=>${charset} conversion table: $fname\n"; open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". - "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", - scalar(@$table)); + printf $out "/* src/backend/utils/mb/Unicode/$fname */\n" + . "/* This file is generated by $this_script */\n\n" + . 
"static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", + scalar(@$table); my $first = 1; foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table) { print($out ",") if (!$first); $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); - printf($out "\n {0x%08x, 0x%08x, 0x%04x}", $i->{utf8}, $i->{utf8_second}, $i->{code}); - $last_comment = "$i->{comment}"; + printf $out "\n {0x%08x, 0x%08x, 0x%04x}", + $i->{utf8}, $i->{utf8_second}, $i->{code}; + if ($verbose >= 2) + { + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); + } + elsif ($verbose >= 1) + { + $last_comment = $i->{comment}; + } } - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; close($out); } sub print_to_utf8_map { - my ($charset, $table, $verbose) = @_; + my ($this_script, $charset, $table, $verbose) = @_; my $last_comment = ""; @@ -226,34 +249,37 @@ sub print_to_utf8_map print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". - "static const pg_local_to_utf LUmap${charset}[ %d ] = {", - scalar(@$table)); + printf $out "/* src/backend/utils/mb/Unicode/$fname */\n" + . "/* This file is generated by $this_script */\n\n" + . 
"static const pg_local_to_utf LUmap${charset}[ %d ] = {", + scalar(@$table); + my $first = 1; foreach my $i (sort {$a->{code} <=> $b->{code}} @$table) { print($out ",") if (!$first); $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); printf($out "\n {0x%04x, 0x%x}", $i->{code}, $i->{utf8}); if ($verbose >= 2) { - $last_comment = "$i->{f}:$i->{l} $i->{comment}"; + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); } - else + elsif ($verbose >= 1) { $last_comment = $i->{comment}; } } - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; close($out); } sub print_to_utf8_combined_map { - my ($charset, $table, $verbose) = @_; + my ($this_script, $charset, $table, $verbose) = @_; my $last_comment = ""; @@ -261,22 +287,599 @@ sub print_to_utf8_combined_map print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". - "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", - scalar(@$table)); + printf $out "/* src/backend/utils/mb/Unicode/$fname */\n" + . "/* This file is generated by $this_script */\n\n" + . 
"static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", + scalar(@$table); + my $first = 1; foreach my $i (sort {$a->{code} <=> $b->{code}} @$table) { print($out ",") if (!$first); $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); + + printf $out "\n {0x%04x, 0x%08x, 0x%08x}", + $i->{code}, $i->{utf8}, $i->{utf8_second}; - printf($out "\n {0x%04x, 0x%08x, 0x%08x}", $i->{code}, $i->{utf8}, $i->{utf8_second}); - $last_comment = "$i->{comment}"; + if ($verbose >= 2) + { + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); + } + elsif ($verbose >= 1) + { + $last_comment = $i->{comment}; + } } - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; close($out); } +############################################################################# +# RADIX TREE STUFF + +######################################### +# print_radix_table(<charmap hash ref>) +# +# Input: A hash, mapping an input character to an output character. +# +# Constructs a radix tree from the hash, and prints it out as a C-struct. +# + +sub print_radix_table +{ + my ($out, $tblname, $c) = @_; + + ### + ### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. 
Each + ### radix tree is represented as a nested hash, each hash indexed by + ### input byte + ### + my %b1map; + my %b2map; + my %b3map; + my %b4map; + foreach my $in (keys %$c) + { + my $out = $c->{$in}; + + if ($in < 0x100) + { + $b1map{$in} = $out; + } + elsif ($in < 0x10000) + { + my $b1 = $in >> 8; + my $b2 = $in & 0xff; + + $b2map{$b1}{$b2} = $out; + } + elsif ($in < 0x1000000) + { + my $b1 = $in >> 16; + my $b2 = ($in >> 8) & 0xff; + my $b3 = $in & 0xff; + + $b3map{$b1}{$b2}{$b3} = $out; + } + elsif ($in < 0x100000000) + { + my $b1 = $in >> 24; + my $b2 = ($in >> 16) & 0xff; + my $b3 = ($in >> 8) & 0xff; + my $b4 = $in & 0xff; + + $b4map{$b1}{$b2}{$b3}{$b4} = $out; + } + else + { + die sprintf("up to 4 byte code is supported: %x", $in); + } + } + + my @segments; + + ### + ### Build a linear list of "segments", from the nested hashes. + ### + ### Each segment is a lookup table, keyed by the next byte in the input. + ### The segments are written out physically to one big array in the final + ### step, but logically, they form a radix tree. Or rather, four radix + ### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte + ### inputs, and 4-byte inputs. + ### + ### Each segment is represented by a hash with following fields: + ### + ### comment => <string to output as a comment> + ### label => <label that can be used to refer to this segment from elsewhere> + ### values => <a hash, keyed by byte, 0-0xff> + ### + ### Entries in 'values' can be integers (for leaf-level segments), or + ### string labels, pointing to a segment with that label. Any missing + ### values are treated as zeros. If 'values' hash is missing altogether, + ### it's treated as all-zeros. + ### + ### Subsequent steps will enrich the segments with more fields. + ### + + # Add the segments for the radix trees themselves. 
+ push @segments, build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map); + push @segments, build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map); + push @segments, build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map); + push @segments, build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map); + + ### + ### Find min and max index used in each level of each tree. + ### + ### These are stored separately, and we can then leave out the unused + ### parts of every segment. (When using the resulting tree, you must + ### check each input byte against the min and max.) + ### + my %min_idx; + my %max_idx; + foreach my $seg (@segments) + { + my $this_min = $min_idx{$seg->{depth}}->{$seg->{level}}; + my $this_max = $max_idx{$seg->{depth}}->{$seg->{level}}; + + foreach my $i (keys %{$seg->{values}}) + { + $this_min = $i if (!defined $this_min || $i < $this_min); + $this_max = $i if (!defined $this_max || $i > $this_max); + } + + $min_idx{$seg->{depth}}{$seg->{level}} = $this_min; + $max_idx{$seg->{depth}}{$seg->{level}} = $this_max; + } + # Copy the mins and max's back to every segment, for convenience + foreach my $seg (@segments) + { + $seg->{min_idx} = $min_idx{$seg->{depth}}{$seg->{level}}; + $seg->{max_idx} = $max_idx{$seg->{depth}}{$seg->{level}}; + } + + ### + ### Prepend a dummy all-zeros map to the beginning. + ### + ### A 0 is an invalid value anywhere in the table, and this allows us to + ### point to 0 offset anywhere else in the tables, to get a 0 result. + + # Find the max range between min and max indexes in any of the segments. 
+ my $widest_range = 0; + foreach my $seg (@segments) + { + my $this_range = $seg->{max_idx} - $seg->{min_idx}; + $widest_range = $this_range if ($this_range > $widest_range); + } + + unshift @segments, { + header => "Dummy map, for invalid values", + min_idx => 0, + max_idx => $widest_range + }; + + ### + ### Eliminate overlapping zeros + ### + ### For each segment, if there are zero values at the end of it, and there + ### are also zero values at the beginning of the next segment, we can + ### overlay the tail of this segment with the head of next segment, to + ### save space. + ### + ### To achieve that, we reduce the 'max_idx' of each segment by the + ### number of zeros that can be overlaid. + ### + for (my $j = 0; $j < $#segments - 1; $j++) + { + my $seg = $segments[$j]; + my $nextseg = $segments[$j + 1]; + + # Count the number of zero values at the end of this segment. + my $this_trail_zeros = 0; + for (my $i = $seg->{max_idx}; $i >= $seg->{min_idx} && !$seg->{values}->{$i}; $i--) + { + $this_trail_zeros++; + } + + # Count the number of zeros at the beginning of next segment. + my $next_lead_zeros = 0; + for (my $i = $nextseg->{min_idx}; $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; $i++) + { + $next_lead_zeros++; + } + + # How many zeros in common? + my $overlaid_trail_zeros = + ($this_trail_zeros > $next_lead_zeros) ? $next_lead_zeros : $this_trail_zeros; + + $seg->{overlaid_trail_zeros} = $overlaid_trail_zeros; + $seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros; + } + + ### + ### Replace label references with real offsets. + ### + ### So far, the non-leaf segments have referred to other segments by + ### their labels. Replace them with numerical offsets from the beginning + ### of the final array. You cannot move, add, or remove segments after + ### this step, as that would invalidate the offsets calculated here!
+ ### + my $flatoff = 0; + my %segmap; + + # First pass: assign offsets to each segment, and build hash + # of label => offset. + foreach my $seg (@segments) + { + $seg->{offset} = $flatoff; + $segmap{$seg->{label}} = $flatoff; + $flatoff += $seg->{max_idx} - $seg->{min_idx} + 1; + } + my $tblsize = $flatoff; + + # Second pass: look up the offset of each label reference in the hash. + foreach my $seg (@segments) + { + while (my ($i, $val) = each %{$seg->{values}}) + { + if (!($val =~ /^[0-9,.E]+$/ )) + { + my $segoff = $segmap{$val}; + if ($segoff) + { + $seg->{values}->{$i} = $segoff; + } + else + { + die "no segment with label $val"; + } + } + } + } + + # Also look up the positions of the roots in the table. + my $b1root = $segmap{"1-byte"}; + my $b2root = $segmap{"2-byte"}; + my $b3root = $segmap{"3-byte"}; + my $b4root = $segmap{"4-byte"}; + + # And the lower-upper values of each level in each radix tree. + my $b1_lower = $min_idx{1}{1}; + my $b1_upper = $max_idx{1}{1}; + + my $b2_1_lower = $min_idx{2}{1}; + my $b2_1_upper = $max_idx{2}{1}; + my $b2_2_lower = $min_idx{2}{2}; + my $b2_2_upper = $max_idx{2}{2}; + + my $b3_1_lower = $min_idx{3}{1}; + my $b3_1_upper = $max_idx{3}{1}; + my $b3_2_lower = $min_idx{3}{2}; + my $b3_2_upper = $max_idx{3}{2}; + my $b3_3_lower = $min_idx{3}{3}; + my $b3_3_upper = $max_idx{3}{3}; + + my $b4_1_lower = $min_idx{4}{1}; + my $b4_1_upper = $max_idx{4}{1}; + my $b4_2_lower = $min_idx{4}{2}; + my $b4_2_upper = $max_idx{4}{2}; + my $b4_3_lower = $min_idx{4}{3}; + my $b4_3_upper = $max_idx{4}{3}; + my $b4_4_lower = $min_idx{4}{4}; + my $b4_4_upper = $max_idx{4}{4}; + + ### + ### Find the maximum value in the whole table, to determine if we can + ### use uint16 or if we need to use uint32. + ### + my $max_val = 0; + foreach my $seg (@segments) + { + foreach my $val (values %{$seg->{values}}) + { + $max_val = $val if ($val > $max_val); + } + } + + my $datatype = ($max_val <= 0xffff) ? 
"uint16" : "uint32"; + + # For formatting, determine how many values we can fit on a single + # line, and how wide each value needs to be to align nicely. + my $vals_per_line; + my $colwidth; + + if ($max_val <= 0xffff) + { + $vals_per_line = 8; + $colwidth = 4; + } + elsif ($max_val <= 0xffffff) + { + $vals_per_line = 4; + $colwidth = 6; + } + else + { + $vals_per_line = 4; + $colwidth = 8; + } + + ### + ### Print the struct and array. + ### + printf $out "static const $datatype ${tblname}_table[];\n"; + printf $out "\n"; + printf $out "static const pg_mb_radix_tree $tblname =\n"; + printf $out "{\n"; + if ($datatype eq "uint16") + { + print $out " ${tblname}_table,\n"; + print $out " NULL, /* 32-bit table not used */\n"; + } + if ($datatype eq "uint32") + { + print $out " NULL, /* 16-bit table not used */\n"; + print $out " ${tblname}_table,\n"; + } + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n", $b1root; + printf $out " 0x%02x, /* b1_lower */\n", $b1_lower; + printf $out " 0x%02x, /* b1_upper */\n", $b1_upper; + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n", $b2root; + printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower; + printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper; + printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower; + printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper; + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b3root; + printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower; + printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper; + printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower; + printf $out " 0x%02x, /* b3_2_upper */\n", $b3_2_upper; + printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower; + printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper; + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b4root; + printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower; + 
printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper; + printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower; + printf $out " 0x%02x, /* b4_2_upper */\n", $b4_2_upper; + printf $out " 0x%02x, /* b4_3_lower */\n", $b4_3_lower; + printf $out " 0x%02x, /* b4_3_upper */\n", $b4_3_upper; + printf $out " 0x%02x, /* b4_4_lower */\n", $b4_4_lower; + printf $out " 0x%02x /* b4_4_upper */\n", $b4_4_upper; + print $out "};\n"; + print $out "\n"; + print $out "static const $datatype ${tblname}_table[$tblsize] =\n"; + print $out "{"; + my $off = 0; + foreach my $seg (@segments) + { + printf $out "\n"; + printf $out " /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off; + printf $out "\n"; + + for (my $i=$seg->{min_idx}; $i <= $seg->{max_idx};) + { + # Print the next line's worth of values. + # XXX pad to begin at a nice boundary + printf $out " /* %02x */ ", $i; + for (my $j = 0; $j < $vals_per_line && $i <= $seg->{max_idx}; $j++) + { + my $val = $seg->{values}->{$i}; + + printf $out " 0x%0*x", $colwidth, $val; + $off++; + if ($off != $tblsize) + { + print $out ","; + } + $i++; + } + print $out "\n"; + } + if ($seg->{overlaid_trail_zeros}) + { + printf $out " /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n"; + } + } + + # Sanity check. + if ($off != $tblsize) { die "table size didn't match!"; } + + print $out "};\n"; +} + +### +sub build_segments_from_tree +{ + my ($header, $rootlabel, $depth, $map) = @_; + + my @segments; + + if (%{$map}) + { + @segments = build_segments_recurse($header, $rootlabel, "", 1, $depth, $map); + + # Sort the segments into "breadth-first" order. Not strictly required, + # but makes the maps nicer to read. + @segments = sort { $a->{level} cmp $b->{level} or + $a->{path} cmp $b->{path}} + @segments; + } + + return @segments; +} + +### +sub build_segments_recurse +{ + my ($header, $label, $path, $level, $depth, $map) = @_; + + my @segments; + + if ($level == $depth) + { + push @segments, { + header => $header . 
", leaf: ${path}xx", + label => $label, + level => $level, + depth => $depth, + path => $path, + values => $map + }; + } + else + { + my %children; + + while (my ($i, $val) = each %$map) + { + my $childpath = $path . sprintf("%02x", $i); + my $childlabel = "$depth-level-$level-$childpath"; + + push @segments, build_segments_recurse($header, $childlabel, $childpath, + $level + 1, $depth, $val); + $children{$i} = $childlabel; + } + + push @segments, { + header => $header . ", byte #$level: ${path}xx", + label => $label, + level => $level, + depth => $depth, + path => $path, + values => \%children + }; + } + return @segments; +} + +###################################################### +# make_charmap - convert charset table to charmap hash +# with checking duplicate source code +# +# make_charmap(\@charset, $direction) +# charset - ref to charset table : see print_tables +# direction - conversion direction + +sub make_charmap +{ + my ($charset, $direction) = @_; + + die "unacceptable direction : $direction" + if ($direction ne "to_unicode" && $direction ne "from_unicode"); + + my %charmap; + foreach my $c (@$charset) + { + next if ($c->{direction} ne $direction && $c->{direction} ne "both"); + + # don't generate entries for combined characters + next if (defined $c->{ucs_second}); + + my ($src, $dst) = + $direction eq "to_unicode" + ? 
($c->{code}, $c->{ucs}) + : ($c->{ucs}, $c->{code}); + + if (defined $c->{$src}) + { + printf STDERR + "Error: duplicate source code: 0x%04x => 0x%04x, 0x%04x\n", + $src, $c->{$src}, $dst; + exit; + } + if ($direction eq "to_unicode") + { + $charmap{$src} = ucs2utf($dst); + } + else + { + $charmap{ ucs2utf($src) } = $dst; + } + + } + + return \%charmap; +} + + +######################################### +# print_radix_map - write the whole C source of a radix tree map +# +# print_radix_map($this_script, $csname, $direction, \%charset, $tblwidth) +# +# this_script - the name of the *caller script* of this feature +# csname - character set name other than ucs +# direction - desired direction "to_unicode" or "from_unicode" +# charset - ref to character set array +# tblwidth - width in characters of output source file + +sub print_radix_map +{ + my ($this_script, $csname, $direction, $charset, $tblwidth) = @_; + + my $charmap = &make_charmap($charset, $direction); + my $fname = + $direction eq "to_unicode" + ? lc("${csname}_to_utf8_radix.map") + : lc("utf8_to_${csname}_radix.map"); + + my $tblname = lc("${csname}_${direction}_tree"); + my $name_prefix = lc("${csname}_${direction}_"); + + if ($direction eq "to_unicode") + { + print "- Writing ${csname}=>UTF8 conversion radix index: $fname\n"; + } + else + { + print "- Writing UTF8=>${csname} conversion radix index: $fname\n"; + } + + open(my $out, '>', $fname) || die("cannot open $fname"); + + print $out "/* src/backend/utils/mb/Unicode/$fname */\n" + .
"/* This file is generated by $this_script */\n\n"; + + print_radix_table($out, $tblname, $charmap); + + close($out); +} + + +################################################################### +# print_radix_trees - write the radix tree files for both direction +# +# print_radix_trees($this_script, $csname, \%charset) +# +# this_script - the name of the *caller script* of this feature +# csname - character set name other than ucs +# charset - ref to character set array +sub print_radix_trees +{ + my ($this_script, $csname, $charset) = @_; + + &print_radix_map($this_script, $csname, "from_unicode", $charset, 78); + &print_radix_map($this_script, $csname, "to_unicode", $charset, 78); +} + +sub dump_charset +{ + my ($list, $filt) = @_; + + foreach my $i (@$list) + { + next if (defined $filt && !&$filt($i)); + if (!defined $i->{ucs}) { $i->{ucs} = &utf2ucs($i->{utf8}); } + printf "ucs=%x, code=%x, direction=%s %s:%d %s\n", + $i->{ucs}, $i->{code}, $i->{direction}, + $i->{f}, $i->{l}, $i->{comment}; + } +} + 1; diff --git a/src/backend/utils/mb/Unicode/euc-jis-2004-std.txt b/src/backend/utils/mb/Unicode/euc-jis-2004-std.txt deleted file mode 100644 index 8657e7f..0000000 diff --git a/src/backend/utils/mb/Unicode/gb-18030-2000.xml b/src/backend/utils/mb/Unicode/gb-18030-2000.xml deleted file mode 100644 index fbbc9e3..0000000 diff --git a/src/backend/utils/mb/Unicode/make_mapchecker.pl b/src/backend/utils/mb/Unicode/make_mapchecker.pl new file mode 100755 index 0000000..b912d83 --- /dev/null +++ b/src/backend/utils/mb/Unicode/make_mapchecker.pl @@ -0,0 +1,78 @@ +#! 
/usr/bin/perl +# +# make_mapchecker.pl - Generates the map_checker.h file included by map_checker.c +# + +use strict; + +# collect all radix mapfiles +opendir(my $dh, ".") || die "failed to open directory: ."; +my @radixmaps = grep { /_radix\.map$/ } readdir($dh); +closedir($dh); + +my %plainmaps; + +# check that every radix map has a corresponding plain map +foreach my $rmap (@radixmaps) +{ + my $pmap = $rmap; + $pmap =~ s/_radix//; + if (!-e $pmap) + { + die("radix map \"$rmap\" has no corresponding plain map\n"); + } + $plainmaps{$rmap} = $pmap; +} + +# generate sanity checker source +my $out; +open($out, '>', "map_checker.h") + || die "cannot open file to write: map_checker.h"; + +# add #include lines for all radix maps and corresponding plain maps +foreach my $i (sort @radixmaps) +{ + print $out "#include \"$i\"\n"; + print $out "#include \"$plainmaps{$i}\"\n"; +} + +print $out <<'EOF'; + +struct mappair +{ + const char *name; + int len; + const pg_local_to_utf *lu; + const pg_utf_to_local *ul; + const pg_mb_radix_tree *rt; +} mappairs[] = { +EOF + +# generate variable names for the array of mappair +my @mapnames = map { my $m = $_; $m =~ s/\.map//; $m } values %plainmaps; + +# write the content of mappairs array.
+foreach my $m (@mapnames) +{ + if ($m =~ /^utf8_to_(.*)$/) + { + my $e = uc($1); + print $out + " {\"$m\", lengthof(ULmap$e), NULL, ULmap$e, &$1_from_unicode_tree}"; + } + elsif ($m =~ /^(.*)_to_utf8$/) + { + my $e = uc($1); + print $out + " {\"$m\", lengthof(LUmap$e), LUmap$e, NULL, &$1_to_unicode_tree}"; + } + else + { + die "Unrecognizable map name: $m"; + } + print $out ",\n"; +} + +print $out " {NULL, 0, NULL, NULL, NULL}\n};\n"; + +close($out); diff --git a/src/backend/utils/mb/Unicode/map_checker.c b/src/backend/utils/mb/Unicode/map_checker.c new file mode 100644 index 0000000..dec0716 --- /dev/null +++ b/src/backend/utils/mb/Unicode/map_checker.c @@ -0,0 +1,94 @@ +/*------------------------------------------------------------------------- + * + * Radix map checker + * + * Copyright (c) 2017, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/mb/Unicode/map_checker.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "mb/pg_wchar.h" + +#include "map_checker.h" + +#include "../char_converter.c" + +/* + * The old-style plain map files were error-resistant thanks to their + * straightforward generation from authority files. In contrast, the + * radix tree maps are generated by a rather complex calculation and have a + * complex, hard-to-confirm format. + * + * This program runs a sanity check of the radix tree maps by confirming that + * every character in the plain map files is converted to the same code by the + * corresponding radix tree map. + * + * All map files are included by map_checker.h, which is generated by the + * script make_mapchecker.pl and exposes them as the variable mappairs.
+ * + */ +int main(void) +{ + struct mappair *mp; + + for (mp = mappairs ; mp->name ; mp++) + { + int i; + + printf("Checking \"%s_radix.map\" against \"%s.map\"(%d chars)..", mp->name, mp->name, mp->len); + for (i = 0 ; i < mp->len ; i++) + { + uint32 s, c, d; + + unsigned char b1; + unsigned char b2; + unsigned char b3; + unsigned char b4; + int l; + + if (mp->ul) + { + s = mp->ul[i].utf; + d = mp->ul[i].code; + } + else + { + s = mp->lu[i].code; + d = mp->lu[i].utf; + } + if (s < 0x80) + { + fprintf(stderr, "\nASCII character ? (%x)", s); + exit(1); + } + + b1 = s >> 24; + b2 = s >> 16; + b3 = s >> 8; + b4 = s; + if (b1 != 0) + l = 4; + else if (b2 != 0) + l = 3; + else if (b3 != 0) + l = 2; + else + l = 1; + + c = pg_mb_radix_conv(mp->rt, l, b1, b2, b3, b4); + + if (c != d) + { + fprintf(stderr, "\nConversion failure in \"%s\": %x => %x, expected %x\n", + mp->name, s, c, d); + exit(1); + } + } + printf("Ok.\n"); + } + printf("All radix trees are perfect!\n"); +} diff --git a/src/backend/utils/mb/Unicode/sjis-0213-2004-std.txt b/src/backend/utils/mb/Unicode/sjis-0213-2004-std.txt deleted file mode 100644 index 4b12bce..0000000 diff --git a/src/backend/utils/mb/char_converter.c b/src/backend/utils/mb/char_converter.c new file mode 100644 index 0000000..3795b7d --- /dev/null +++ b/src/backend/utils/mb/char_converter.c @@ -0,0 +1,116 @@ +/*------------------------------------------------------------------------- + * + * Character converter function using radix tree + * + * Copyright (c) 2017, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/mb/char_converter.c + * + *------------------------------------------------------------------------- + */ + +static inline uint32 +pg_mb_radix_conv(const pg_mb_radix_tree *rt, + int l, + unsigned char b1, + unsigned char b2, + unsigned char b3, + unsigned char b4) +{ + if (l == 4) + { + /* 4-byte code */ + + /* check code validity */ + if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper || + b2 < 
rt->b4_2_lower || b2 > rt->b4_2_upper || + b3 < rt->b4_3_lower || b3 > rt->b4_3_upper || + b4 < rt->b4_4_lower || b4 > rt->b4_4_upper) + return 0; + + if (rt->chars32) + { + uint32 idx = rt->b4root; + + idx = rt->chars32[b1 + idx - rt->b4_1_lower]; + idx = rt->chars32[b2 + idx - rt->b4_2_lower]; + idx = rt->chars32[b3 + idx - rt->b4_3_lower]; + return rt->chars32[b4 + idx - rt->b4_4_lower]; + } + else + { + uint16 idx = rt->b4root; + + idx = rt->chars16[b1 + idx - rt->b4_1_lower]; + idx = rt->chars16[b2 + idx - rt->b4_2_lower]; + idx = rt->chars16[b3 + idx - rt->b4_3_lower]; + return rt->chars16[b4 + idx - rt->b4_4_lower]; + } + } + else if (l == 3) + { + /* 3-byte code */ + + /* check code validity */ + if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper || + b3 < rt->b3_2_lower || b3 > rt->b3_2_upper || + b4 < rt->b3_3_lower || b4 > rt->b3_3_upper) + return 0; + + if (rt->chars32) + { + uint32 idx = rt->b3root; + + idx = rt->chars32[b2 + idx - rt->b3_1_lower]; + idx = rt->chars32[b3 + idx - rt->b3_2_lower]; + return rt->chars32[b4 + idx - rt->b3_3_lower]; + } + else + { + uint16 idx = rt->b3root; + + idx = rt->chars16[b2 + idx - rt->b3_1_lower]; + idx = rt->chars16[b3 + idx - rt->b3_2_lower]; + return rt->chars16[b4 + idx - rt->b3_3_lower]; + } + } + else if (l == 2) + { + /* 2-byte code */ + + /* check code validity - first byte */ + if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper || + b4 < rt->b2_2_lower || b4 > rt->b2_2_upper) + return 0; + + if (rt->chars32) + { + uint32 idx = rt->b2root; + + idx = rt->chars32[b3 + idx - rt->b2_1_lower]; + return rt->chars32[b4 + idx - rt->b2_2_lower]; + } + else + { + uint16 idx = rt->b2root; + + idx = rt->chars16[b3 + idx - rt->b2_1_lower]; + return rt->chars16[b4 + idx - rt->b2_2_lower]; + } + } + else if (l == 1) + { + /* 1-byte code */ + + /* check code validity - first byte */ + if (b4 < rt->b1_lower || b4 > rt->b1_upper) + return 0; + + if (rt->chars32) + return rt->chars32[b4 + rt->b1root - rt->b1_lower]; + else + return 
rt->chars16[b4 + rt->b1root - rt->b1_lower]; + } + return 0; /* shouldn't happen */ +} diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index 9014a57..feaf8ef 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -13,6 +13,7 @@ #include "postgres.h" #include "mb/pg_wchar.h" +#include "char_converter.c" /* * local2local: a generic single byte charset encoding @@ -284,36 +285,6 @@ mic2latin_with_table(const unsigned char *mic, /* * comparison routine for bsearch() - * this routine is intended for UTF8 -> local code - */ -static int -compare1(const void *p1, const void *p2) -{ - uint32 v1, - v2; - - v1 = *(const uint32 *) p1; - v2 = ((const pg_utf_to_local *) p2)->utf; - return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); -} - -/* - * comparison routine for bsearch() - * this routine is intended for local code -> UTF8 - */ -static int -compare2(const void *p1, const void *p2) -{ - uint32 v1, - v2; - - v1 = *(const uint32 *) p1; - v2 = ((const pg_local_to_utf *) p2)->code; - return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); -} - -/* - * comparison routine for bsearch() * this routine is intended for combined UTF8 -> local code */ static int @@ -371,7 +342,6 @@ store_coded_char(unsigned char *dest, uint32 code) * iso: pointer to the output area (must be large enough!) 
(output string will be null-terminated) * map: conversion map for single characters - * mapsize: number of entries in the conversion map * cmap: conversion map for combined characters * (optional, pass NULL if none) * cmapsize: number of entries in the conversion map for combined characters @@ -389,14 +359,13 @@ store_coded_char(unsigned char *dest, uint32 code) void UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, - const pg_utf_to_local *map, int mapsize, + const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding) { uint32 iutf; int l; - const pg_utf_to_local *p; const pg_utf_to_local_combined *cp; if (!PG_VALID_ENCODING(encoding)) @@ -406,6 +375,11 @@ UtfToLocal(const unsigned char *utf, int len, for (; len > 0; len -= l) { + unsigned char b1 = 0; + unsigned char b2 = 0; + unsigned char b3 = 0; + unsigned char b4 = 0; + /* "break" cases all represent errors */ if (*utf == '\0') break; @@ -427,27 +401,28 @@ UtfToLocal(const unsigned char *utf, int len, /* collect coded char of length l */ if (l == 2) { - iutf = *utf++ << 8; - iutf |= *utf++; + b3 = *utf++; + b4 = *utf++; } else if (l == 3) { - iutf = *utf++ << 16; - iutf |= *utf++ << 8; - iutf |= *utf++; + b2 = *utf++; + b3 = *utf++; + b4 = *utf++; } else if (l == 4) { - iutf = *utf++ << 24; - iutf |= *utf++ << 16; - iutf |= *utf++ << 8; - iutf |= *utf++; + b1 = *utf++; + b2 = *utf++; + b3 = *utf++; + b4 = *utf++; } else { elog(ERROR, "unsupported character length %d", l); iutf = 0; /* keep compiler quiet */ } + iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4); /* First, try with combined map if possible */ if (cmap && len > l) @@ -516,13 +491,14 @@ UtfToLocal(const unsigned char *utf, int len, } /* Now check ordinary map */ - p = bsearch(&iutf, map, mapsize, - sizeof(pg_utf_to_local), compare1); - - if (p) + if (map) { - iso = store_coded_char(iso, p->code); - continue; + uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, 
b4); + if (converted) + { + iso = store_coded_char(iso, converted); + continue; + } } /* if there's a conversion function, try that */ @@ -557,7 +533,6 @@ UtfToLocal(const unsigned char *utf, int len, * utf: pointer to the output area (must be large enough!) (output string will be null-terminated) * map: conversion map for single characters - * mapsize: number of entries in the conversion map * cmap: conversion map for combined characters * (optional, pass NULL if none) * cmapsize: number of entries in the conversion map for combined characters @@ -575,14 +550,13 @@ UtfToLocal(const unsigned char *utf, int len, void LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, - const pg_local_to_utf *map, int mapsize, + const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding) { uint32 iiso; int l; - const pg_local_to_utf *p; const pg_local_to_utf_combined *cp; if (!PG_VALID_ENCODING(encoding)) @@ -592,6 +566,11 @@ LocalToUtf(const unsigned char *iso, int len, for (; len > 0; len -= l) { + unsigned char b1 = 0; + unsigned char b2 = 0; + unsigned char b3 = 0; + unsigned char b4 = 0; + /* "break" cases all represent errors */ if (*iso == '\0') break; @@ -610,53 +589,55 @@ LocalToUtf(const unsigned char *iso, int len, /* collect coded char of length l */ if (l == 1) - iiso = *iso++; + b4 = *iso++; else if (l == 2) { - iiso = *iso++ << 8; - iiso |= *iso++; + b3 = *iso++; + b4 = *iso++; } else if (l == 3) { - iiso = *iso++ << 16; - iiso |= *iso++ << 8; - iiso |= *iso++; + b2 = *iso++; + b3 = *iso++; + b4 = *iso++; } else if (l == 4) { - iiso = *iso++ << 24; - iiso |= *iso++ << 16; - iiso |= *iso++ << 8; - iiso |= *iso++; + b1 = *iso++; + b2 = *iso++; + b3 = *iso++; + b4 = *iso++; } else { elog(ERROR, "unsupported character length %d", l); iiso = 0; /* keep compiler quiet */ } + iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4); - /* First check ordinary map */ - p = bsearch(&iiso, map, mapsize, - 
sizeof(pg_local_to_utf), compare2); - - if (p) - { - utf = store_coded_char(utf, p->utf); - continue; - } - - /* If there's a combined character map, try that */ - if (cmap) + if (map) { - cp = bsearch(&iiso, cmap, cmapsize, - sizeof(pg_local_to_utf_combined), compare4); + uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); - if (cp) + if (converted) { - utf = store_coded_char(utf, cp->utf1); - utf = store_coded_char(utf, cp->utf2); + utf = store_coded_char(utf, converted); continue; } + + /* If there's a combined character map, try that */ + if (cmap) + { + cp = bsearch(&iiso, cmap, cmapsize, + sizeof(pg_local_to_utf_combined), compare4); + + if (cp) + { + utf = store_coded_char(utf, cp->utf1); + utf = store_coded_char(utf, cp->utf2); + continue; + } + } } /* if there's a conversion function, try that */ diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c index 746ed35..66e36d4 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/big5_to_utf8.map" -#include "../../Unicode/utf8_to_big5.map" +#include "../../Unicode/big5_to_utf8_radix.map" +#include "../../Unicode/utf8_to_big5_radix.map" PG_MODULE_MAGIC; @@ -42,7 +42,7 @@ big5_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_UTF8); LocalToUtf(src, len, dest, - LUmapBIG5, lengthof(LUmapBIG5), + &big5_to_unicode_tree, NULL, 0, NULL, PG_BIG5); @@ -60,7 +60,7 @@ utf8_to_big5(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_BIG5); UtfToLocal(src, len, dest, - ULmapBIG5, lengthof(ULmapBIG5), + &big5_from_unicode_tree, NULL, 0, NULL, PG_BIG5); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c 
b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c index d568c63..1a6402a 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c @@ -14,10 +14,10 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/utf8_to_koi8r.map" -#include "../../Unicode/koi8r_to_utf8.map" -#include "../../Unicode/utf8_to_koi8u.map" -#include "../../Unicode/koi8u_to_utf8.map" +#include "../../Unicode/utf8_to_koi8r_radix.map" +#include "../../Unicode/koi8r_to_utf8_radix.map" +#include "../../Unicode/utf8_to_koi8u_radix.map" +#include "../../Unicode/koi8u_to_utf8_radix.map" PG_MODULE_MAGIC; @@ -48,7 +48,7 @@ utf8_to_koi8r(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_KOI8R); UtfToLocal(src, len, dest, - ULmapKOI8R, lengthof(ULmapKOI8R), + &koi8r_from_unicode_tree, NULL, 0, NULL, PG_KOI8R); @@ -66,7 +66,7 @@ koi8r_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8R, PG_UTF8); LocalToUtf(src, len, dest, - LUmapKOI8R, lengthof(LUmapKOI8R), + &koi8r_to_unicode_tree, NULL, 0, NULL, PG_KOI8R); @@ -84,7 +84,7 @@ utf8_to_koi8u(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_KOI8U); UtfToLocal(src, len, dest, - ULmapKOI8U, lengthof(ULmapKOI8U), + &koi8u_from_unicode_tree, NULL, 0, NULL, PG_KOI8U); @@ -102,7 +102,7 @@ koi8u_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8U, PG_UTF8); LocalToUtf(src, len, dest, - LUmapKOI8U, lengthof(LUmapKOI8U), + &koi8u_to_unicode_tree, NULL, 0, NULL, PG_KOI8U); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c index ebf5f23..ec27841 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c @@ -14,8 +14,8 @@ #include 
"postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/euc_jis_2004_to_utf8.map" -#include "../../Unicode/utf8_to_euc_jis_2004.map" +#include "../../Unicode/euc_jis_2004_to_utf8_radix.map" +#include "../../Unicode/utf8_to_euc_jis_2004_radix.map" #include "../../Unicode/euc_jis_2004_to_utf8_combined.map" #include "../../Unicode/utf8_to_euc_jis_2004_combined.map" @@ -44,7 +44,7 @@ euc_jis_2004_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JIS_2004, PG_UTF8); LocalToUtf(src, len, dest, - LUmapEUC_JIS_2004, lengthof(LUmapEUC_JIS_2004), + &euc_jis_2004_to_unicode_tree, LUmapEUC_JIS_2004_combined, lengthof(LUmapEUC_JIS_2004_combined), NULL, PG_EUC_JIS_2004); @@ -62,7 +62,7 @@ utf8_to_euc_jis_2004(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_JIS_2004); UtfToLocal(src, len, dest, - ULmapEUC_JIS_2004, lengthof(ULmapEUC_JIS_2004), + &euc_jis_2004_from_unicode_tree, ULmapEUC_JIS_2004_combined, lengthof(ULmapEUC_JIS_2004_combined), NULL, PG_EUC_JIS_2004); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c index cb0751c..a6b156d 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/euc_cn_to_utf8.map" -#include "../../Unicode/utf8_to_euc_cn.map" +#include "../../Unicode/euc_cn_to_utf8_radix.map" +#include "../../Unicode/utf8_to_euc_cn_radix.map" PG_MODULE_MAGIC; @@ -42,7 +42,7 @@ euc_cn_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_CN, PG_UTF8); LocalToUtf(src, len, dest, - LUmapEUC_CN, lengthof(LUmapEUC_CN), + &euc_cn_to_unicode_tree, NULL, 0, NULL, PG_EUC_CN); @@ -60,7 +60,7 @@ utf8_to_euc_cn(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_CN); UtfToLocal(src, 
len, dest, - ULmapEUC_CN, lengthof(ULmapEUC_CN), + &euc_cn_from_unicode_tree, NULL, 0, NULL, PG_EUC_CN); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c index 6512eee..75d190a 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/euc_jp_to_utf8.map" -#include "../../Unicode/utf8_to_euc_jp.map" +#include "../../Unicode/euc_jp_to_utf8_radix.map" +#include "../../Unicode/utf8_to_euc_jp_radix.map" PG_MODULE_MAGIC; @@ -42,7 +42,7 @@ euc_jp_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JP, PG_UTF8); LocalToUtf(src, len, dest, - LUmapEUC_JP, lengthof(LUmapEUC_JP), + &euc_jp_to_unicode_tree, NULL, 0, NULL, PG_EUC_JP); @@ -60,7 +60,7 @@ utf8_to_euc_jp(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_JP); UtfToLocal(src, len, dest, - ULmapEUC_JP, lengthof(ULmapEUC_JP), + &euc_jp_from_unicode_tree, NULL, 0, NULL, PG_EUC_JP); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c index f85720f..84302d3 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/euc_kr_to_utf8.map" -#include "../../Unicode/utf8_to_euc_kr.map" +#include "../../Unicode/euc_kr_to_utf8_radix.map" +#include "../../Unicode/utf8_to_euc_kr_radix.map" PG_MODULE_MAGIC; @@ -42,7 +42,7 @@ euc_kr_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_KR, PG_UTF8); LocalToUtf(src, len, dest, - LUmapEUC_KR, lengthof(LUmapEUC_KR), + 
&euc_kr_to_unicode_tree, NULL, 0, NULL, PG_EUC_KR); @@ -60,7 +60,7 @@ utf8_to_euc_kr(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_KR); UtfToLocal(src, len, dest, - ULmapEUC_KR, lengthof(ULmapEUC_KR), + &euc_kr_from_unicode_tree, NULL, 0, NULL, PG_EUC_KR); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c index 1ce4099..0dc3c1f 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/euc_tw_to_utf8.map" -#include "../../Unicode/utf8_to_euc_tw.map" +#include "../../Unicode/euc_tw_to_utf8_radix.map" +#include "../../Unicode/utf8_to_euc_tw_radix.map" PG_MODULE_MAGIC; @@ -42,7 +42,7 @@ euc_tw_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_UTF8); LocalToUtf(src, len, dest, - LUmapEUC_TW, lengthof(LUmapEUC_TW), + &euc_tw_to_unicode_tree, NULL, 0, NULL, PG_EUC_TW); @@ -60,7 +60,7 @@ utf8_to_euc_tw(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_TW); UtfToLocal(src, len, dest, - ULmapEUC_TW, lengthof(ULmapEUC_TW), + &euc_tw_from_unicode_tree, NULL, 0, NULL, PG_EUC_TW); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c index 22dd642..836ef72 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/gb18030_to_utf8.map" -#include "../../Unicode/utf8_to_gb18030.map" +#include "../../Unicode/gb18030_to_utf8_radix.map" +#include "../../Unicode/utf8_to_gb18030_radix.map" 
PG_MODULE_MAGIC; @@ -197,7 +197,7 @@ gb18030_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8); LocalToUtf(src, len, dest, - LUmapGB18030, lengthof(LUmapGB18030), + &gb18030_to_unicode_tree, NULL, 0, conv_18030_to_utf8, PG_GB18030); @@ -215,7 +215,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030); UtfToLocal(src, len, dest, - ULmapGB18030, lengthof(ULmapGB18030), + &gb18030_from_unicode_tree, NULL, 0, conv_utf8_to_18030, PG_GB18030); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c index 1238e3d..a3b97b9 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/gbk_to_utf8.map" -#include "../../Unicode/utf8_to_gbk.map" +#include "../../Unicode/gbk_to_utf8_radix.map" +#include "../../Unicode/utf8_to_gbk_radix.map" PG_MODULE_MAGIC; @@ -42,7 +42,7 @@ gbk_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_GBK, PG_UTF8); LocalToUtf(src, len, dest, - LUmapGBK, lengthof(LUmapGBK), + &gbk_to_unicode_tree, NULL, 0, NULL, PG_GBK); @@ -60,7 +60,7 @@ utf8_to_gbk(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GBK); UtfToLocal(src, len, dest, - ULmapGBK, lengthof(ULmapGBK), + &gbk_from_unicode_tree, NULL, 0, NULL, PG_GBK); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c index 48acd3f..ca8ada5 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c @@ -14,32 +14,32 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include 
"../../Unicode/iso8859_10_to_utf8.map" -#include "../../Unicode/iso8859_13_to_utf8.map" -#include "../../Unicode/iso8859_14_to_utf8.map" -#include "../../Unicode/iso8859_15_to_utf8.map" -#include "../../Unicode/iso8859_2_to_utf8.map" -#include "../../Unicode/iso8859_3_to_utf8.map" -#include "../../Unicode/iso8859_4_to_utf8.map" -#include "../../Unicode/iso8859_5_to_utf8.map" -#include "../../Unicode/iso8859_6_to_utf8.map" -#include "../../Unicode/iso8859_7_to_utf8.map" -#include "../../Unicode/iso8859_8_to_utf8.map" -#include "../../Unicode/iso8859_9_to_utf8.map" -#include "../../Unicode/utf8_to_iso8859_10.map" -#include "../../Unicode/utf8_to_iso8859_13.map" -#include "../../Unicode/utf8_to_iso8859_14.map" -#include "../../Unicode/utf8_to_iso8859_15.map" -#include "../../Unicode/utf8_to_iso8859_16.map" -#include "../../Unicode/utf8_to_iso8859_2.map" -#include "../../Unicode/utf8_to_iso8859_3.map" -#include "../../Unicode/utf8_to_iso8859_4.map" -#include "../../Unicode/utf8_to_iso8859_5.map" -#include "../../Unicode/utf8_to_iso8859_6.map" -#include "../../Unicode/utf8_to_iso8859_7.map" -#include "../../Unicode/utf8_to_iso8859_8.map" -#include "../../Unicode/utf8_to_iso8859_9.map" -#include "../../Unicode/iso8859_16_to_utf8.map" +#include "../../Unicode/iso8859_10_to_utf8_radix.map" +#include "../../Unicode/iso8859_13_to_utf8_radix.map" +#include "../../Unicode/iso8859_14_to_utf8_radix.map" +#include "../../Unicode/iso8859_15_to_utf8_radix.map" +#include "../../Unicode/iso8859_2_to_utf8_radix.map" +#include "../../Unicode/iso8859_3_to_utf8_radix.map" +#include "../../Unicode/iso8859_4_to_utf8_radix.map" +#include "../../Unicode/iso8859_5_to_utf8_radix.map" +#include "../../Unicode/iso8859_6_to_utf8_radix.map" +#include "../../Unicode/iso8859_7_to_utf8_radix.map" +#include "../../Unicode/iso8859_8_to_utf8_radix.map" +#include "../../Unicode/iso8859_9_to_utf8_radix.map" +#include "../../Unicode/utf8_to_iso8859_10_radix.map" +#include 
"../../Unicode/utf8_to_iso8859_13_radix.map" +#include "../../Unicode/utf8_to_iso8859_14_radix.map" +#include "../../Unicode/utf8_to_iso8859_15_radix.map" +#include "../../Unicode/utf8_to_iso8859_16_radix.map" +#include "../../Unicode/utf8_to_iso8859_2_radix.map" +#include "../../Unicode/utf8_to_iso8859_3_radix.map" +#include "../../Unicode/utf8_to_iso8859_4_radix.map" +#include "../../Unicode/utf8_to_iso8859_5_radix.map" +#include "../../Unicode/utf8_to_iso8859_6_radix.map" +#include "../../Unicode/utf8_to_iso8859_7_radix.map" +#include "../../Unicode/utf8_to_iso8859_8_radix.map" +#include "../../Unicode/utf8_to_iso8859_9_radix.map" +#include "../../Unicode/iso8859_16_to_utf8_radix.map" PG_MODULE_MAGIC; @@ -60,52 +60,37 @@ PG_FUNCTION_INFO_V1(utf8_to_iso8859); typedef struct { pg_enc encoding; - const pg_local_to_utf *map1; /* to UTF8 map name */ - const pg_utf_to_local *map2; /* from UTF8 map name */ - int size1; /* size of map1 */ - int size2; /* size of map2 */ + const pg_mb_radix_tree *map1; /* to UTF8 map name */ + const pg_mb_radix_tree *map2; /* from UTF8 map name */ } pg_conv_map; static const pg_conv_map maps[] = { - {PG_LATIN2, LUmapISO8859_2, ULmapISO8859_2, - lengthof(LUmapISO8859_2), - lengthof(ULmapISO8859_2)}, /* ISO-8859-2 Latin 2 */ - {PG_LATIN3, LUmapISO8859_3, ULmapISO8859_3, - lengthof(LUmapISO8859_3), - lengthof(ULmapISO8859_3)}, /* ISO-8859-3 Latin 3 */ - {PG_LATIN4, LUmapISO8859_4, ULmapISO8859_4, - lengthof(LUmapISO8859_4), - lengthof(ULmapISO8859_4)}, /* ISO-8859-4 Latin 4 */ - {PG_LATIN5, LUmapISO8859_9, ULmapISO8859_9, - lengthof(LUmapISO8859_9), - lengthof(ULmapISO8859_9)}, /* ISO-8859-9 Latin 5 */ - {PG_LATIN6, LUmapISO8859_10, ULmapISO8859_10, - lengthof(LUmapISO8859_10), - lengthof(ULmapISO8859_10)}, /* ISO-8859-10 Latin 6 */ - {PG_LATIN7, LUmapISO8859_13, ULmapISO8859_13, - lengthof(LUmapISO8859_13), - lengthof(ULmapISO8859_13)}, /* ISO-8859-13 Latin 7 */ - {PG_LATIN8, LUmapISO8859_14, ULmapISO8859_14, - lengthof(LUmapISO8859_14), - 
lengthof(ULmapISO8859_14)}, /* ISO-8859-14 Latin 8 */ - {PG_LATIN9, LUmapISO8859_15, ULmapISO8859_15, - lengthof(LUmapISO8859_15), - lengthof(ULmapISO8859_15)}, /* ISO-8859-15 Latin 9 */ - {PG_LATIN10, LUmapISO8859_16, ULmapISO8859_16, - lengthof(LUmapISO8859_16), - lengthof(ULmapISO8859_16)}, /* ISO-8859-16 Latin 10 */ - {PG_ISO_8859_5, LUmapISO8859_5, ULmapISO8859_5, - lengthof(LUmapISO8859_5), - lengthof(ULmapISO8859_5)}, /* ISO-8859-5 */ - {PG_ISO_8859_6, LUmapISO8859_6, ULmapISO8859_6, - lengthof(LUmapISO8859_6), - lengthof(ULmapISO8859_6)}, /* ISO-8859-6 */ - {PG_ISO_8859_7, LUmapISO8859_7, ULmapISO8859_7, - lengthof(LUmapISO8859_7), - lengthof(ULmapISO8859_7)}, /* ISO-8859-7 */ - {PG_ISO_8859_8, LUmapISO8859_8, ULmapISO8859_8, - lengthof(LUmapISO8859_8), - lengthof(ULmapISO8859_8)}, /* ISO-8859-8 */ + {PG_LATIN2, &iso8859_2_to_unicode_tree, + &iso8859_2_from_unicode_tree}, /* ISO-8859-2 Latin 2 */ + {PG_LATIN3, &iso8859_3_to_unicode_tree, + &iso8859_3_from_unicode_tree}, /* ISO-8859-3 Latin 3 */ + {PG_LATIN4, &iso8859_4_to_unicode_tree, + &iso8859_4_from_unicode_tree}, /* ISO-8859-4 Latin 4 */ + {PG_LATIN5, &iso8859_9_to_unicode_tree, + &iso8859_9_from_unicode_tree}, /* ISO-8859-9 Latin 5 */ + {PG_LATIN6, &iso8859_10_to_unicode_tree, + &iso8859_10_from_unicode_tree}, /* ISO-8859-10 Latin 6 */ + {PG_LATIN7, &iso8859_13_to_unicode_tree, + &iso8859_13_from_unicode_tree}, /* ISO-8859-13 Latin 7 */ + {PG_LATIN8, &iso8859_14_to_unicode_tree, + &iso8859_14_from_unicode_tree}, /* ISO-8859-14 Latin 8 */ + {PG_LATIN9, &iso8859_15_to_unicode_tree, + &iso8859_15_from_unicode_tree}, /* ISO-8859-15 Latin 9 */ + {PG_LATIN10, &iso8859_16_to_unicode_tree, + &iso8859_16_from_unicode_tree}, /* ISO-8859-16 Latin 10 */ + {PG_ISO_8859_5, &iso8859_5_to_unicode_tree, + &iso8859_5_from_unicode_tree}, /* ISO-8859-5 */ + {PG_ISO_8859_6, &iso8859_6_to_unicode_tree, + &iso8859_6_from_unicode_tree}, /* ISO-8859-6 */ + {PG_ISO_8859_7, &iso8859_7_to_unicode_tree, + 
&iso8859_7_from_unicode_tree}, /* ISO-8859-7 */ + {PG_ISO_8859_8, &iso8859_8_to_unicode_tree, + &iso8859_8_from_unicode_tree}, /* ISO-8859-8 */ }; Datum @@ -124,7 +109,7 @@ iso8859_to_utf8(PG_FUNCTION_ARGS) if (encoding == maps[i].encoding) { LocalToUtf(src, len, dest, - maps[i].map1, maps[i].size1, + maps[i].map1, NULL, 0, NULL, encoding); @@ -156,7 +141,7 @@ utf8_to_iso8859(PG_FUNCTION_ARGS) if (encoding == maps[i].encoding) { UtfToLocal(src, len, dest, - maps[i].map2, maps[i].size2, + maps[i].map2, NULL, 0, NULL, encoding); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c index 51690b9..7e3a3cb 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/johab_to_utf8.map" -#include "../../Unicode/utf8_to_johab.map" +#include "../../Unicode/johab_to_utf8_radix.map" +#include "../../Unicode/utf8_to_johab_radix.map" PG_MODULE_MAGIC; @@ -42,7 +42,7 @@ johab_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_JOHAB, PG_UTF8); LocalToUtf(src, len, dest, - LUmapJOHAB, lengthof(LUmapJOHAB), + &johab_to_unicode_tree, NULL, 0, NULL, PG_JOHAB); @@ -60,7 +60,7 @@ utf8_to_johab(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_JOHAB); UtfToLocal(src, len, dest, - ULmapJOHAB, lengthof(ULmapJOHAB), + &johab_from_unicode_tree, NULL, 0, NULL, PG_JOHAB); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c index 605fe40..4bd5ea5 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include 
"mb/pg_wchar.h" -#include "../../Unicode/sjis_to_utf8.map" -#include "../../Unicode/utf8_to_sjis.map" +#include "../../Unicode/sjis_to_utf8_radix.map" +#include "../../Unicode/utf8_to_sjis_radix.map" PG_MODULE_MAGIC; @@ -42,7 +42,7 @@ sjis_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_SJIS, PG_UTF8); LocalToUtf(src, len, dest, - LUmapSJIS, lengthof(LUmapSJIS), + &sjis_to_unicode_tree, NULL, 0, NULL, PG_SJIS); @@ -60,7 +60,7 @@ utf8_to_sjis(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_SJIS); UtfToLocal(src, len, dest, - ULmapSJIS, lengthof(ULmapSJIS), + &sjis_from_unicode_tree, NULL, 0, NULL, PG_SJIS); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c b/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c index 8d8f508..5a1e8c6 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/shift_jis_2004_to_utf8.map" -#include "../../Unicode/utf8_to_shift_jis_2004.map" +#include "../../Unicode/shift_jis_2004_to_utf8_radix.map" +#include "../../Unicode/utf8_to_shift_jis_2004_radix.map" #include "../../Unicode/shift_jis_2004_to_utf8_combined.map" #include "../../Unicode/utf8_to_shift_jis_2004_combined.map" @@ -44,7 +44,7 @@ shift_jis_2004_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_SHIFT_JIS_2004, PG_UTF8); LocalToUtf(src, len, dest, - LUmapSHIFT_JIS_2004, lengthof(LUmapSHIFT_JIS_2004), + &shift_jis_2004_to_unicode_tree, LUmapSHIFT_JIS_2004_combined, lengthof(LUmapSHIFT_JIS_2004_combined), NULL, PG_SHIFT_JIS_2004); @@ -62,7 +62,7 @@ utf8_to_shift_jis_2004(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_SHIFT_JIS_2004); UtfToLocal(src, len, dest, - ULmapSHIFT_JIS_2004, lengthof(ULmapSHIFT_JIS_2004), + &shift_jis_2004_from_unicode_tree, 
ULmapSHIFT_JIS_2004_combined, lengthof(ULmapSHIFT_JIS_2004_combined), NULL, PG_SHIFT_JIS_2004); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c index 97e895c..dd7a788 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c @@ -14,8 +14,8 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/uhc_to_utf8.map" -#include "../../Unicode/utf8_to_uhc.map" +#include "../../Unicode/uhc_to_utf8_radix.map" +#include "../../Unicode/utf8_to_uhc_radix.map" PG_MODULE_MAGIC; @@ -42,7 +42,7 @@ uhc_to_utf8(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UHC, PG_UTF8); LocalToUtf(src, len, dest, - LUmapUHC, lengthof(LUmapUHC), + &uhc_to_unicode_tree, NULL, 0, NULL, PG_UHC); @@ -60,7 +60,7 @@ utf8_to_uhc(PG_FUNCTION_ARGS) CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_UHC); UtfToLocal(src, len, dest, - ULmapUHC, lengthof(ULmapUHC), + &uhc_from_unicode_tree, NULL, 0, NULL, PG_UHC); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c index ab6e624..9ee72c9 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c @@ -14,28 +14,28 @@ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" -#include "../../Unicode/utf8_to_win1250.map" -#include "../../Unicode/utf8_to_win1251.map" -#include "../../Unicode/utf8_to_win1252.map" -#include "../../Unicode/utf8_to_win1253.map" -#include "../../Unicode/utf8_to_win1254.map" -#include "../../Unicode/utf8_to_win1255.map" -#include "../../Unicode/utf8_to_win1256.map" -#include "../../Unicode/utf8_to_win1257.map" -#include "../../Unicode/utf8_to_win1258.map" -#include "../../Unicode/utf8_to_win866.map" -#include 
"../../Unicode/utf8_to_win874.map" -#include "../../Unicode/win1250_to_utf8.map" -#include "../../Unicode/win1251_to_utf8.map" -#include "../../Unicode/win1252_to_utf8.map" -#include "../../Unicode/win1253_to_utf8.map" -#include "../../Unicode/win1254_to_utf8.map" -#include "../../Unicode/win1255_to_utf8.map" -#include "../../Unicode/win1256_to_utf8.map" -#include "../../Unicode/win1257_to_utf8.map" -#include "../../Unicode/win866_to_utf8.map" -#include "../../Unicode/win874_to_utf8.map" -#include "../../Unicode/win1258_to_utf8.map" +#include "../../Unicode/utf8_to_win1250_radix.map" +#include "../../Unicode/utf8_to_win1251_radix.map" +#include "../../Unicode/utf8_to_win1252_radix.map" +#include "../../Unicode/utf8_to_win1253_radix.map" +#include "../../Unicode/utf8_to_win1254_radix.map" +#include "../../Unicode/utf8_to_win1255_radix.map" +#include "../../Unicode/utf8_to_win1256_radix.map" +#include "../../Unicode/utf8_to_win1257_radix.map" +#include "../../Unicode/utf8_to_win1258_radix.map" +#include "../../Unicode/utf8_to_win866_radix.map" +#include "../../Unicode/utf8_to_win874_radix.map" +#include "../../Unicode/win1250_to_utf8_radix.map" +#include "../../Unicode/win1251_to_utf8_radix.map" +#include "../../Unicode/win1252_to_utf8_radix.map" +#include "../../Unicode/win1253_to_utf8_radix.map" +#include "../../Unicode/win1254_to_utf8_radix.map" +#include "../../Unicode/win1255_to_utf8_radix.map" +#include "../../Unicode/win1256_to_utf8_radix.map" +#include "../../Unicode/win1257_to_utf8_radix.map" +#include "../../Unicode/win866_to_utf8_radix.map" +#include "../../Unicode/win874_to_utf8_radix.map" +#include "../../Unicode/win1258_to_utf8_radix.map" PG_MODULE_MAGIC; @@ -56,46 +56,22 @@ PG_FUNCTION_INFO_V1(utf8_to_win); typedef struct { pg_enc encoding; - const pg_local_to_utf *map1; /* to UTF8 map name */ - const pg_utf_to_local *map2; /* from UTF8 map name */ - int size1; /* size of map1 */ - int size2; /* size of map2 */ + const pg_mb_radix_tree *map1; /* to 
UTF8 map name */ + const pg_mb_radix_tree *map2; /* from UTF8 map name */ } pg_conv_map; static const pg_conv_map maps[] = { - {PG_WIN866, LUmapWIN866, ULmapWIN866, - lengthof(LUmapWIN866), - lengthof(ULmapWIN866)}, - {PG_WIN874, LUmapWIN874, ULmapWIN874, - lengthof(LUmapWIN874), - lengthof(ULmapWIN874)}, - {PG_WIN1250, LUmapWIN1250, ULmapWIN1250, - lengthof(LUmapWIN1250), - lengthof(ULmapWIN1250)}, - {PG_WIN1251, LUmapWIN1251, ULmapWIN1251, - lengthof(LUmapWIN1251), - lengthof(ULmapWIN1251)}, - {PG_WIN1252, LUmapWIN1252, ULmapWIN1252, - lengthof(LUmapWIN1252), - lengthof(ULmapWIN1252)}, - {PG_WIN1253, LUmapWIN1253, ULmapWIN1253, - lengthof(LUmapWIN1253), - lengthof(ULmapWIN1253)}, - {PG_WIN1254, LUmapWIN1254, ULmapWIN1254, - lengthof(LUmapWIN1254), - lengthof(ULmapWIN1254)}, - {PG_WIN1255, LUmapWIN1255, ULmapWIN1255, - lengthof(LUmapWIN1255), - lengthof(ULmapWIN1255)}, - {PG_WIN1256, LUmapWIN1256, ULmapWIN1256, - lengthof(LUmapWIN1256), - lengthof(ULmapWIN1256)}, - {PG_WIN1257, LUmapWIN1257, ULmapWIN1257, - lengthof(LUmapWIN1257), - lengthof(ULmapWIN1257)}, - {PG_WIN1258, LUmapWIN1258, ULmapWIN1258, - lengthof(LUmapWIN1258), - lengthof(ULmapWIN1258)}, + {PG_WIN866, &win866_to_unicode_tree, &win866_from_unicode_tree}, + {PG_WIN874, &win874_to_unicode_tree, &win874_from_unicode_tree}, + {PG_WIN1250, &win1250_to_unicode_tree, &win1250_from_unicode_tree}, + {PG_WIN1251, &win1251_to_unicode_tree, &win1251_from_unicode_tree}, + {PG_WIN1252, &win1252_to_unicode_tree, &win1252_from_unicode_tree}, + {PG_WIN1253, &win1253_to_unicode_tree, &win1253_from_unicode_tree}, + {PG_WIN1254, &win1254_to_unicode_tree, &win1254_from_unicode_tree}, + {PG_WIN1255, &win1255_to_unicode_tree, &win1255_from_unicode_tree}, + {PG_WIN1256, &win1256_to_unicode_tree, &win1256_from_unicode_tree}, + {PG_WIN1257, &win1257_to_unicode_tree, &win1257_from_unicode_tree}, + {PG_WIN1258, &win1258_to_unicode_tree, &win1258_from_unicode_tree}, }; Datum @@ -114,7 +90,7 @@ win_to_utf8(PG_FUNCTION_ARGS) if 
(encoding == maps[i].encoding) { LocalToUtf(src, len, dest, - maps[i].map1, maps[i].size1, + maps[i].map1, NULL, 0, NULL, encoding); @@ -146,7 +122,7 @@ utf8_to_win(PG_FUNCTION_ARGS) if (encoding == maps[i].encoding) { UtfToLocal(src, len, dest, - maps[i].map2, maps[i].size2, + maps[i].map2, NULL, 0, NULL, encoding); diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index ceb5695..5ab93cb 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -383,6 +383,58 @@ typedef struct uint32 code; /* local code */ } pg_utf_to_local; +typedef struct pg_mb_radix_index +{ + uint8 lower; + uint8 upper; /* index range of b2idx */ +} pg_mb_radix_index; + +/* + * Radix tree structs for faster conversion + */ +typedef struct +{ + /* + * Array containing all the values. Only one of chars16 or chars32 is + * used, depending on how wide the values we need to represent are. + */ + const uint16 *chars16; /* 16 bit */ + const uint32 *chars32; /* 32 bit character table */ + + /* Radix tree for 1-byte inputs */ + uint32 b1root; /* offset of table in the chars[16|32] array */ + uint8 b1_lower; /* min allowed value for a single byte input */ + uint8 b1_upper; /* max allowed value for a single byte input */ + + /* Radix tree for 2-byte inputs */ + uint32 b2root; /* offset of 1st byte's table */ + uint8 b2_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b2_1_upper; + uint8 b2_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b2_2_upper; + + /* Radix tree for 3-byte inputs */ + uint32 b3root; /* offset of 1st byte's table */ + uint8 b3_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b3_1_upper; + uint8 b3_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b3_2_upper; + uint8 b3_3_lower; /* min/max allowed value for 3rd input byte */ + uint8 b3_3_upper; + + /* Radix tree for 4-byte inputs */ + uint32 b4root; /* offset of 1st byte's table */ + uint8 b4_1_lower; /* min/max allowed value for 1st input byte 
*/ + uint8 b4_1_upper; + uint8 b4_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b4_2_upper; + uint8 b4_3_lower; /* min/max allowed value for 3rd input byte */ + uint8 b4_3_upper; + uint8 b4_4_lower; /* min/max allowed value for 4th input byte */ + uint8 b4_4_upper; + +} pg_mb_radix_tree; + /* * local code to UTF-8 conversion map */ @@ -510,13 +562,13 @@ extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); extern void UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, - const pg_utf_to_local *map, int mapsize, + const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding); extern void LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, - const pg_local_to_utf *map, int mapsize, + const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding); -- 2.9.2
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers