Unfortunately, my first patch did not handle the first two octets correctly when they are not a BOM: it still consumed them, so the first character of the decoded text was lost. In that case the read pointer must be reset to the beginning of the input.
root@d5305a0f945d:~# cat check-unicode.pl
use Encode qw/encode decode/;
my $str = 'ABCD';
printf "%s vs %s\n", $str, decode('utf-16be', encode('utf-16be', $str));
printf "%s vs %s\n", $str, decode('utf-16', encode('utf-16', $str));
printf "%s vs %s\n", $str, decode('utf-16', encode('utf-16be', $str));
root@d5305a0f945d:~# perl check-unicode.pl # debian version
ABCD vs ABCD
ABCD vs ABCD
UTF-16:Unrecognised BOM 41 at /usr/lib/x86_64-linux-gnu/perl/5.20/Encode.pm line 175.
root@d5305a0f945d:~# perl check-unicode.pl # first version of patch
ABCD vs ABCD
ABCD vs ABCD
ABCD vs BCD
root@d5305a0f945d:~# perl check-unicode.pl # second version of patch
ABCD vs ABCD
ABCD vs ABCD
ABCD vs ABCD

diff --git a/Unicode/Unicode.xs b/Unicode/Unicode.xs
index 5f3bceb..e309307 100644
--- a/Unicode/Unicode.xs
+++ b/Unicode/Unicode.xs
@@ -166,9 +166,19 @@ CODE:
                     endian = 'V';
                 }
                 else {
-                    croak("%"SVf":Unrecognised BOM %"UVxf,
-                          *hv_fetch((HV *)SvRV(obj),"Name",4,0),
-                          bom);
+                    /* No BOM found, use big-endian fallback as specified in
+                     * RFC2781 and the Unicode Standard version 8.0:
+                     *
+                     * The UTF-16 encoding scheme may or may not begin with
+                     * a BOM. However, when there is no BOM, and in the
+                     * absence of a higher-level protocol, the byte order
+                     * of the UTF-16 encoding scheme is big-endian.
+                     *
+                     * If the first two octets of the text is not 0xFE
+                     * followed by 0xFF, and is not 0xFF followed by 0xFE,
+                     * then the text SHOULD be interpreted as big-endian.
+                     */
+                    s -= size;
                 }
             }
 #if 1