Hi,

From: Josip Rodin <[EMAIL PROTECTED]>
Subject: Re: lists.debian.org de-localization
Date: Sun, 12 Jan 2003 04:14:45 +0100

> This, on the other hand, is a hassle to handle (backporting or installation
> into subdirs). master.d.o is scheduled to be upgraded to woody after samosa.
> That's all I know. <shrug>

This is good news.  Then I will work on support for various encodings later.

Anyway, I don't expect the new master.d.o to have the development version
of MHonArc (with the encoding-assuming feature for raw 8bit headers), even
if MHonArc is installed from a non-Debian-package version.  Thus I think we
will need some method of our own to handle raw 8bit headers.

Here is a "filter" to convert 8bit characters (assumed to be KOI8-R) to
"&#xxxx;" decimal numeric character references, which I wrote by imitating
iso8859.pl, CharEnt.pm, and UTF8.pm.  This filter is used for raw 7bit/8bit
strings.  Since the 7bit part of KOI8-R is identical to ASCII, it does no
harm to legal ASCII headers.  The filter is to be installed as
org/lists.debian.org/mhonarc/share/mhonarc/MHonArc/DEBIAN.pm and doesn't
depend on the version of MHonArc or Debian.
##  DEBIAN.pm by Tomohiro KUBOTA <[EMAIL PROTECTED]>
##
##  CHARSETCONVERTER module that assumes the input string is KOI8-R
##  and converts it into "&#xxx;" references, where xxx is the decimal
##  Unicode code point.

# The package name has to agree with the fully qualified function name
# under which the converter is registered in the CharsetConverters
# resource (MHonArc::DEBIAN::koi8r2sgml, loaded from MHonArc/DEBIAN.pm).
package MHonArc::DEBIAN;

# 7-bit characters that are replaced by named entities.  Keys are
# numeric byte values; any 7-bit byte not listed here is passed through
# unchanged by koi8r2sgml().
%US_ASCII_To_Ent = (
  #--------------------------------------------------------------------------
  # Hex Code    Entity Ref      # ISO external entity and description
  #--------------------------------------------------------------------------
    0x22,       "&quot;",       # ISOnum : Quotation mark
    0x26,       "&amp;",        # ISOnum : Ampersand
    0x3C,       "&lt;",         # ISOnum : Less-than sign
    0x3E,       "&gt;",         # ISOnum : Greater-than sign
);

# Maps each KOI8-R byte value in the range 0x80-0xFF to a decimal numeric
# character reference ("&#NNNN;") for the corresponding Unicode code point.
# Keys are numeric byte values.  Every entry must stay on a single line:
# a wrapped trailing comment leaves a bareword inside the list and breaks
# compilation of the module.
%KOI8_R_To_Ent = (
  #--------------------------------------------------------------------------
  # Hex Code    Entity Ref      # Unicode character name
  #--------------------------------------------------------------------------
    0x80,       "&#9472;",      # BOX DRAWINGS LIGHT HORIZONTAL
    0x81,       "&#9474;",      # BOX DRAWINGS LIGHT VERTICAL
    0x82,       "&#9484;",      # BOX DRAWINGS LIGHT DOWN AND RIGHT
    0x83,       "&#9488;",      # BOX DRAWINGS LIGHT DOWN AND LEFT
    0x84,       "&#9492;",      # BOX DRAWINGS LIGHT UP AND RIGHT
    0x85,       "&#9496;",      # BOX DRAWINGS LIGHT UP AND LEFT
    0x86,       "&#9500;",      # BOX DRAWINGS LIGHT VERTICAL AND RIGHT
    0x87,       "&#9508;",      # BOX DRAWINGS LIGHT VERTICAL AND LEFT
    0x88,       "&#9516;",      # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
    0x89,       "&#9524;",      # BOX DRAWINGS LIGHT UP AND HORIZONTAL
    0x8a,       "&#9532;",      # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
    0x8b,       "&#9600;",      # UPPER HALF BLOCK
    0x8c,       "&#9604;",      # LOWER HALF BLOCK
    0x8d,       "&#9608;",      # FULL BLOCK
    0x8e,       "&#9612;",      # LEFT HALF BLOCK
    0x8f,       "&#9616;",      # RIGHT HALF BLOCK
    0x90,       "&#9617;",      # LIGHT SHADE
    0x91,       "&#9618;",      # MEDIUM SHADE
    0x92,       "&#9619;",      # DARK SHADE
    0x93,       "&#8992;",      # TOP HALF INTEGRAL
    0x94,       "&#9632;",      # BLACK SQUARE
    0x95,       "&#8729;",      # BULLET OPERATOR
    0x96,       "&#8730;",      # SQUARE ROOT
    0x97,       "&#8776;",      # ALMOST EQUAL TO
    0x98,       "&#8804;",      # LESS-THAN OR EQUAL TO
    0x99,       "&#8805;",      # GREATER-THAN OR EQUAL TO
    0x9a,       "&#160;",       # NO-BREAK SPACE
    0x9b,       "&#8993;",      # BOTTOM HALF INTEGRAL
    0x9c,       "&#176;",       # DEGREE SIGN
    0x9d,       "&#178;",       # SUPERSCRIPT TWO
    0x9e,       "&#183;",       # MIDDLE DOT
    0x9f,       "&#247;",       # DIVISION SIGN
    0xa0,       "&#9552;",      # BOX DRAWINGS DOUBLE HORIZONTAL
    0xa1,       "&#9553;",      # BOX DRAWINGS DOUBLE VERTICAL
    0xa2,       "&#9554;",      # BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE
    0xa3,       "&#1105;",      # CYRILLIC SMALL LETTER IO
    0xa4,       "&#9555;",      # BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE
    0xa5,       "&#9556;",      # BOX DRAWINGS DOUBLE DOWN AND RIGHT
    0xa6,       "&#9557;",      # BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE
    0xa7,       "&#9558;",      # BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE
    0xa8,       "&#9559;",      # BOX DRAWINGS DOUBLE DOWN AND LEFT
    0xa9,       "&#9560;",      # BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE
    0xaa,       "&#9561;",      # BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE
    0xab,       "&#9562;",      # BOX DRAWINGS DOUBLE UP AND RIGHT
    0xac,       "&#9563;",      # BOX DRAWINGS UP SINGLE AND LEFT DOUBLE
    0xad,       "&#9564;",      # BOX DRAWINGS UP DOUBLE AND LEFT SINGLE
    0xae,       "&#9565;",      # BOX DRAWINGS DOUBLE UP AND LEFT
    0xaf,       "&#9566;",      # BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE
    0xb0,       "&#9567;",      # BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE
    0xb1,       "&#9568;",      # BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
    0xb2,       "&#9569;",      # BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE
    0xb3,       "&#1025;",      # CYRILLIC CAPITAL LETTER IO
    0xb4,       "&#9570;",      # BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE
    0xb5,       "&#9571;",      # BOX DRAWINGS DOUBLE VERTICAL AND LEFT
    0xb6,       "&#9572;",      # BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE
    0xb7,       "&#9573;",      # BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE
    0xb8,       "&#9574;",      # BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL
    0xb9,       "&#9575;",      # BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE
    0xba,       "&#9576;",      # BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE
    0xbb,       "&#9577;",      # BOX DRAWINGS DOUBLE UP AND HORIZONTAL
    0xbc,       "&#9578;",      # BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE
    0xbd,       "&#9579;",      # BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE
    0xbe,       "&#9580;",      # BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL
    0xbf,       "&#169;",       # COPYRIGHT SIGN
    0xc0,       "&#1102;",      # CYRILLIC SMALL LETTER YU
    0xc1,       "&#1072;",      # CYRILLIC SMALL LETTER A
    0xc2,       "&#1073;",      # CYRILLIC SMALL LETTER BE
    0xc3,       "&#1094;",      # CYRILLIC SMALL LETTER TSE
    0xc4,       "&#1076;",      # CYRILLIC SMALL LETTER DE
    0xc5,       "&#1077;",      # CYRILLIC SMALL LETTER IE
    0xc6,       "&#1092;",      # CYRILLIC SMALL LETTER EF
    0xc7,       "&#1075;",      # CYRILLIC SMALL LETTER GHE
    0xc8,       "&#1093;",      # CYRILLIC SMALL LETTER HA
    0xc9,       "&#1080;",      # CYRILLIC SMALL LETTER I
    0xca,       "&#1081;",      # CYRILLIC SMALL LETTER SHORT I
    0xcb,       "&#1082;",      # CYRILLIC SMALL LETTER KA
    0xcc,       "&#1083;",      # CYRILLIC SMALL LETTER EL
    0xcd,       "&#1084;",      # CYRILLIC SMALL LETTER EM
    0xce,       "&#1085;",      # CYRILLIC SMALL LETTER EN
    0xcf,       "&#1086;",      # CYRILLIC SMALL LETTER O
    0xd0,       "&#1087;",      # CYRILLIC SMALL LETTER PE
    0xd1,       "&#1103;",      # CYRILLIC SMALL LETTER YA
    0xd2,       "&#1088;",      # CYRILLIC SMALL LETTER ER
    0xd3,       "&#1089;",      # CYRILLIC SMALL LETTER ES
    0xd4,       "&#1090;",      # CYRILLIC SMALL LETTER TE
    0xd5,       "&#1091;",      # CYRILLIC SMALL LETTER U
    0xd6,       "&#1078;",      # CYRILLIC SMALL LETTER ZHE
    0xd7,       "&#1074;",      # CYRILLIC SMALL LETTER VE
    0xd8,       "&#1100;",      # CYRILLIC SMALL LETTER SOFT SIGN
    0xd9,       "&#1099;",      # CYRILLIC SMALL LETTER YERU
    0xda,       "&#1079;",      # CYRILLIC SMALL LETTER ZE
    0xdb,       "&#1096;",      # CYRILLIC SMALL LETTER SHA
    0xdc,       "&#1101;",      # CYRILLIC SMALL LETTER E
    0xdd,       "&#1097;",      # CYRILLIC SMALL LETTER SHCHA
    0xde,       "&#1095;",      # CYRILLIC SMALL LETTER CHE
    0xdf,       "&#1098;",      # CYRILLIC SMALL LETTER HARD SIGN
    0xe0,       "&#1070;",      # CYRILLIC CAPITAL LETTER YU
    0xe1,       "&#1040;",      # CYRILLIC CAPITAL LETTER A
    0xe2,       "&#1041;",      # CYRILLIC CAPITAL LETTER BE
    0xe3,       "&#1062;",      # CYRILLIC CAPITAL LETTER TSE
    0xe4,       "&#1044;",      # CYRILLIC CAPITAL LETTER DE
    0xe5,       "&#1045;",      # CYRILLIC CAPITAL LETTER IE
    0xe6,       "&#1060;",      # CYRILLIC CAPITAL LETTER EF
    0xe7,       "&#1043;",      # CYRILLIC CAPITAL LETTER GHE
    0xe8,       "&#1061;",      # CYRILLIC CAPITAL LETTER HA
    0xe9,       "&#1048;",      # CYRILLIC CAPITAL LETTER I
    0xea,       "&#1049;",      # CYRILLIC CAPITAL LETTER SHORT I
    0xeb,       "&#1050;",      # CYRILLIC CAPITAL LETTER KA
    0xec,       "&#1051;",      # CYRILLIC CAPITAL LETTER EL
    0xed,       "&#1052;",      # CYRILLIC CAPITAL LETTER EM
    0xee,       "&#1053;",      # CYRILLIC CAPITAL LETTER EN
    0xef,       "&#1054;",      # CYRILLIC CAPITAL LETTER O
    0xf0,       "&#1055;",      # CYRILLIC CAPITAL LETTER PE
    0xf1,       "&#1071;",      # CYRILLIC CAPITAL LETTER YA
    0xf2,       "&#1056;",      # CYRILLIC CAPITAL LETTER ER
    0xf3,       "&#1057;",      # CYRILLIC CAPITAL LETTER ES
    0xf4,       "&#1058;",      # CYRILLIC CAPITAL LETTER TE
    0xf5,       "&#1059;",      # CYRILLIC CAPITAL LETTER U
    0xf6,       "&#1046;",      # CYRILLIC CAPITAL LETTER ZHE
    0xf7,       "&#1042;",      # CYRILLIC CAPITAL LETTER VE
    0xf8,       "&#1068;",      # CYRILLIC CAPITAL LETTER SOFT SIGN
    0xf9,       "&#1067;",      # CYRILLIC CAPITAL LETTER YERU
    0xfa,       "&#1047;",      # CYRILLIC CAPITAL LETTER ZE
    0xfb,       "&#1064;",      # CYRILLIC CAPITAL LETTER SHA
    0xfc,       "&#1069;",      # CYRILLIC CAPITAL LETTER E
    0xfd,       "&#1065;",      # CYRILLIC CAPITAL LETTER SHCHA
    0xfe,       "&#1063;",      # CYRILLIC CAPITAL LETTER CHE
    0xff,       "&#1066;",      # CYRILLIC CAPITAL LETTER HARD SIGN
);

##  koi8r2sgml: charset converter entry point.
##
##  Takes the string to convert as its first argument and returns a new
##  string built byte by byte:
##    - a byte below 128 is replaced by its %US_ASCII_To_Ent entry if it
##      has one, otherwise copied through unchanged;
##    - a byte of 128 or above is replaced by its %KOI8_R_To_Ent entry
##      if it has one, otherwise copied through unchanged.
##  Any further arguments are ignored.
sub koi8r2sgml {
    my ($input) = @_;
    my $output = "";

    # Walk the numeric byte values rather than indexing with substr().
    foreach my $code (unpack("C*", $input)) {
        # Pick the lookup table by range: 7-bit bytes vs. 8-bit bytes.
        my $table = ($code < 128) ? \%US_ASCII_To_Ent : \%KOI8_R_To_Ent;
        $output .= ($table->{$code} || pack("C", $code));
    }

    return $output;
}

1;
--- debian.rc   2003-01-12 12:33:02.000000000 +0900
+++ debian.rc.new       2003-01-12 12:35:43.000000000 +0900
@@ -3,7 +3,7 @@
 
 <!-- Common Resources -------------------------------------------------------->
 <CharsetConverters>
-plain;          mhonarc::htmlize;
+plain;          MHonArc::DEBIAN::koi8r2sgml;  MHonArc/DEBIAN.pm
 us-ascii;       mhonarc::htmlize;
 iso-8859-1;     iso_8859::str2sgml;     iso8859.pl
 iso-8859-2;     iso_8859::str2sgml;     iso8859.pl

Reply via email to