On Thu, 10 Oct 2024 16:00:41 +0900 (JST) Tatsuo Ishii <is...@postgresql.org> wrote:
> > Bruce Momjian <br...@momjian.us> writes: > >> Can we use Unicode in the SGML files? > > > > I believe we've been doing it for contributors' names that require > > non-ASCII letters, but not in any other places. > > We have non-ASCII letters in charset.sgml too, to show some examples > of collation. We can check for non-ASCII letters in SGML/XML files by preparing an "allowlist" that contains the lines which are allowed to have non-ASCII characters, although this list will need to be maintained whenever lines in it are modified. I've attached a patch that adds a simple Perl script to do this. While testing this script, I found that "stylesheet-man.xsl" also has non-ASCII characters. I don't know whether these characters are really necessary, though, since I don't understand this file well. Regards, Yugo Nagata -- Yugo Nagata <nag...@sraoss.co.jp>
>From c5a16f1f7c515294cb600554fe1bbe045d25ec26 Mon Sep 17 00:00:00 2001 From: Yugo Nagata <nag...@sraoss.co.jp> Date: Thu, 10 Oct 2024 23:35:19 +0900 Subject: [PATCH] Doc: Add check to detect non-ASCII characters --- doc/src/sgml/Makefile | 11 ++++---- doc/src/sgml/check_non_ascii.pl | 47 +++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 doc/src/sgml/check_non_ascii.pl diff --git a/doc/src/sgml/Makefile b/doc/src/sgml/Makefile index 65ed32cd0a..90cbeed542 100644 --- a/doc/src/sgml/Makefile +++ b/doc/src/sgml/Makefile @@ -194,7 +194,7 @@ MAKEINFO = makeinfo ## # Quick syntax check without style processing -check: postgres.sgml $(ALLSGML) check-tabs check-nbsp +check: postgres.sgml $(ALLSGML) check-tabs check-non-ascii $(XMLLINT) $(XMLINCLUDE) --noout --valid $< @@ -260,12 +260,11 @@ check-tabs: @( ! grep ' ' $(wildcard $(srcdir)/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl) ) || \ (echo "Tabs appear in SGML/XML files" 1>&2; exit 1) -# Non-breaking spaces are harmless, but it is best to avoid them in SGML files. +# Non-ASCII characters are harmless, but it is best to avoid them in SGML files. # Use perl command because non-GNU grep or sed could not have hex escape sequence. 
-check-nbsp: - @ ( $(PERL) -ne '/\xC2\xA0/ and print("$$ARGV:$$_"),$$n++; END {exit($$n>0)}' \ - $(wildcard $(srcdir)/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl) ) || \ - (echo "Non-breaking spaces appear in SGML/XML files" 1>&2; exit 1) +check-non-ascii: + @ ( $(PERL) $(srcdir)/check_non_ascii.pl $(wildcard $(srcdir)/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl) ) || \ + (echo "Non-ASCII characters appear in SGML/XML files" 1>&2; exit 1) ## ## Clean diff --git a/doc/src/sgml/check_non_ascii.pl b/doc/src/sgml/check_non_ascii.pl new file mode 100644 index 0000000000..1d7ae405b5 --- /dev/null +++ b/doc/src/sgml/check_non_ascii.pl @@ -0,0 +1,47 @@ +#!/usr/bin/perl +# +# Check if non-ASCII characters appear in SGML/XML files +# Copyright (c) 2000-2024, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +# list of lines where non-ascii characters are allowed +my %allowlist = ( +'./charset.sgml' => [ +"SELECT 'à ' = 'A' COLLATE ignore_accent_case; -- true", +" <entry><literal>'n' = 'ñ'</literal></entry>", +" performed. For example, <literal>'á'</literal> may be composed of the", +" locale <literal>und-u-kb</literal> sorts <literal>'à e'</literal>", +" before <literal>'aé'</literal>." +], +'./stylesheet-man.xsl' => [ +'<l:template name="sect.*" text="Section %n, â%tâ, in the documentation"/>' +] +); + +# begin of the acknowledgements for contributors in the release-note +my $release_ack='<sect2 id="release-.*-acknowledgements">'; + +my $n = 0; +foreach my $file (@ARGV) +{ + open my $fh, '<', $file or die; + while (my $line = <$fh>) + { + # skip lines in allowlist + next if exists($allowlist{$file}) and (grep {$line =~ $_} @{$allowlist{$file}}); + + # skip contributor names in the acknowledgements + last if ($line =~ /$release_ack/); + + # check non-ascii characters + if ($line =~ /[^\x00-\x7f]/) + { + print "$file:$line"; + $n++; + } + } + close $fh; +} +exit($n>0); -- 2.34.1