On 2015-09-24 20:37, Nicolas George wrote: > Le tridi 3 vendémiaire, an CCXXIV, James Darnley a écrit : >> I don't know what to say here. I know the encodings needed for iconv >> because I arrived at them by brute force. I wrote a short Lua script to >> iterate over a list of encodings supported by my iconv and arrived at >> this answer. The command line tool called iconv is too clever for this >> because it returns an error when it can't convert. As for ending in >> GBK, it is what the script told me. > > Could you share the script and enough input to run it and reproduce the > results?
I can. You should find it attached to this email. I cleaned it up and put two test cases of data into the file. You will need Lua and the Lua-iconv module. If your package manager doesn't have that, see here: https://ittner.github.io/lua-iconv/ To run it: lua <filename> >> This feature would not work if there were a misinterpretation in the >> middle. As you say, that would need A->B and C->D where B != C. Perhaps >> this is why my solution isn't perfect, because there should be an >> assumption in the middle. >> >> I could rework my code to allow for assumptions in the middle. My case >> would then use "CP1252,UTF-8,UTF-8,GBK" as an argument. > > I must say, I do not like your approach very much because it manipulates > text encoding in the middle of the program. All strings inside the program > should be in UTF-8. > > I can propose this: add an option "metadata_text_encoding" to > AVFormatContext. If it is set on a demuxer, the demuxing framework uses it > to convert from it to UTF-8; and similarly, if it is set on a muxer, the > muxing framework uses it to convert from UTF-8 to it. > > Then we can have a special syntax for it to specify bogus conversions. > Possibly: -metadata_text_encoding "[CP1252>UTF-8]GBK" to specify that the > text must first be converted from CP1252 to UTF-8 then considered to be GBK > (and converted to UTF-8). (Well, I consider the feature evil, so I will > probably not volunteer to implement it, but I will not oppose it as long as it > cannot be triggered too easily by an unsuspecting user.) > > What do you think of it? As for more special syntax, I'm not a fan of it. Handling this in the demuxer, somewhere, might be a better idea.
-- Brute-force search for the chain of character-set conversions that turns a
-- mojibake byte string back into text containing a known-correct fragment.
-- Requires Lua and the lua-iconv module.

local iconv = require('iconv')

--- Canonicalise every encoding name and return the unique names, sorted.
-- @param list sequence of encoding names (aliases allowed)
-- @return sorted sequence of distinct canonical names
local function canonicalize_list(list)
    local seen = {}
    for _, name in ipairs(list) do
        seen[iconv.canonicalize(name)] = true
    end
    local ret = {}
    for name in pairs(seen) do
        ret[#ret + 1] = name
    end
    table.sort(ret)
    return ret
end

--- Decode a string of hexadecimal digit pairs into raw bytes.
-- Anything that is not part of a pair of hex digits (spaces, newlines) is
-- ignored.
-- @param str text containing hex digit pairs
-- @return the decoded byte string
local function hex_string_to_bytes(str)
    local bytes = {}
    for pair in string.gmatch(str, '%x%x') do
        bytes[#bytes + 1] = string.char(tonumber(pair, 16))
    end
    -- Join once instead of growing a string with `..` on every pair.
    return table.concat(bytes)
end

--- Convert `input` from encoding `from` to encoding `to`.
-- A conversion error is NOT treated as fatal: whatever output the converter
-- produced is still returned, because the search works on damaged text.
-- @return the (possibly partial) converted string, or nil when the converter
--         could not be created or produced no string at all
local function convert(from, to, input)
    if input == nil then
        return nil
    end
    -- iconv.new yields nil when this iconv build lacks the conversion.
    local cd = iconv.new(from, to)
    if not cd then
        return nil
    end
    -- Parentheses drop the trailing error code and keep only the string.
    return (cd:iconv(input))
end

--- Literal (non-pattern) substring test that tolerates a nil haystack.
local function contains(haystack, needle)
    return haystack ~= nil and string.find(haystack, needle, 1, true) ~= nil
end

--- Try every chain a -> b -> c and print the ones whose result contains
-- `correct`.
-- Cubic in the number of encodings; the original author reported ~15sec for
-- 143 encodings.
local function run(encoding_list, mojibake, correct)
    for _, a in ipairs(encoding_list) do
        for _, b in ipairs(encoding_list) do
            -- The first stage does not depend on c, so do it once per (a, b).
            local intermediate = convert(a, b, mojibake)
            if intermediate then
                for _, c in ipairs(encoding_list) do
                    local str = convert(b, c, intermediate)
                    if contains(str, correct) then
                        io.stdout:write(string.format('%s,%s,%s = %s\n',
                            a, b, c, str))
                    end
                end
            end
        end
    end
end

--- Try every chain a -> UTF-8 -> b and print the ones whose result contains
-- `correct`.
-- Quadratic in the number of encodings; the original author reported ~0.1sec
-- for 143 encodings.
local function run_assume_middle_utf8(encoding_list, mojibake, correct)
    for _, a in ipairs(encoding_list) do
        -- The first stage does not depend on b, so do it once per a.
        local intermediate = convert(a, 'UTF-8', mojibake)
        if intermediate then
            for _, b in ipairs(encoding_list) do
                local str = convert('UTF-8', b, intermediate)
                if contains(str, correct) then
                    io.stdout:write(string.format('%s,UTF-8,%s = %s\n',
                        a, b, str))
                end
            end
        end
    end
end

--- Try every pair of independent conversions a -> b then c -> d, i.e. with a
-- reinterpretation of the bytes in the middle, and print the ones whose
-- result contains `correct`.
-- Quartic in the number of encodings; the original author reported many
-- minutes for 143 encodings.
local function run_assume_middle_random(encoding_list, mojibake, correct)
    for _, a in ipairs(encoding_list) do
        for _, b in ipairs(encoding_list) do
            -- The first stage does not depend on c or d.
            local intermediate = convert(a, b, mojibake)
            if intermediate then
                for _, c in ipairs(encoding_list) do
                    for _, d in ipairs(encoding_list) do
                        local str = convert(c, d, intermediate)
                        if contains(str, correct) then
                            io.stdout:write(string.format(
                                '%s,%s_%s,%s = %s\n', a, b, c, d, str))
                        end
                    end
                end
            end
        end
    end
end

-- Main program

local encoding_list
-- Stock lua-iconv may lack list/canonicalize; fall back to a built-in list
-- only in that case (the condition used to be forced true unconditionally).
if not iconv.list or not iconv.canonicalize then
    io.stdout:write(
        'The iconv module does not support the list or canonicalize functions so ' ..
        'this tool will use an internal list of character encodings.\n')
    encoding_list = {
        "ARMSCII-8", "ATARIST", "BIG5", "BIG5-2003", "BIG5-HKSCS",
        "BIG5-HKSCS:1999", "BIG5-HKSCS:2001", "BIG5-HKSCS:2004", "C99",
        "CP1046", "CP1124", "CP1125", "CP1129", "CP1131", "CP1133", "CP1161",
        "CP1162", "CP1163", "CP1250", "CP1251", "CP1252", "CP1253", "CP1254",
        "CP1255", "CP1256", "CP1257", "CP1258", "CP437", "CP737", "CP775",
        "CP850", "CP852", "CP853", "CP855", "CP856", "CP857", "CP858",
        "CP860", "CP861", "CP862", "CP863", "CP864", "CP865", "CP866",
        "CP869", "CP874", "CP922", "CP932", "CP936", "CP943", "CP949",
        "CP950", "DEC-HANYU", "DEC-KANJI", "EUC-CN", "EUC-JISX0213", "EUC-JP",
        "EUC-KR", "EUC-TW", "GB18030", "GBK", "GB_1988-80", "GB_2312-80",
        "GEORGIAN-ACADEMY", "GEORGIAN-PS", "HP-ROMAN8", "HZ", "ISO-2022-CN",
        "ISO-2022-CN-EXT", "ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2",
        "ISO-2022-JP-3", "ISO-2022-KR", "ISO-8859-1", "ISO-8859-10",
        "ISO-8859-11", "ISO-8859-13", "ISO-8859-14", "ISO-8859-15",
        "ISO-8859-16", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4", "ISO-8859-5",
        "ISO-8859-6", "ISO-8859-7", "ISO-8859-8", "ISO-8859-9", "ISO-IR-165",
        "JAVA", "JIS_C6220-1969-RO", "JIS_X0201", "JIS_X0208", "JIS_X0212",
        "JOHAB", "KOI8-R", "KOI8-RU", "KOI8-T", "KOI8-U", "KSC_5601",
        "MACARABIC", "MACCENTRALEUROPE", "MACCROATIAN", "MACCYRILLIC",
        "MACGREEK", "MACHEBREW", "MACICELAND", "MACROMAN", "MACROMANIA",
        "MACTHAI", "MACTURKISH", "MACUKRAINE", "MULELAO-1", "NEXTSTEP",
        "PT154", "RISCOS-LATIN1", "RK1048", "SHIFT_JIS", "SHIFT_JISX0213",
        "TCVN", "TDS565", "TIS-620", "UCS-2", "UCS-2-INTERNAL",
        "UCS-2-SWAPPED", "UCS-2BE", "UCS-2LE", "UCS-4", "UCS-4-INTERNAL",
        "UCS-4-SWAPPED", "UCS-4BE", "UCS-4LE", "US-ASCII", "UTF-16",
        "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE", "UTF-7",
        "UTF-8", "VISCII"
    }
else
    encoding_list = canonicalize_list(iconv.list())
end

-- Each entry: { hex dump of the mojibake bytes, text the fix must contain }.
local mojibake_correct_pairs = {
    bcc_14 = { [[
C38AC2AFC2B4C2A8C393C2A2C380C389
20617320C2B8C2A1C396C3B1C38AC2AE
C38BC384C380C3893FC38EC3B7C2B4C3
A5C2A4C381C2A4C38AC2A4C39F206173
20C2BBC2A2C28FC398C387C3A5C392C3
B43FC39F68C2BDC3BCC390C2A2C392C2
BB20617320C390C2A1C2B4C2BBC38FC3
89C38CC2ABC380C389]], '一' },
    bcc_15 = { [[
C395C39BC3B3C392C2B8C2BBC383C380
C397C39320617320C390C3A0C384C2BE
C2A5C3ABC2A5C2ADC2A5C2A23FC38BC3
89C592C3B9C393C389C3994620617320
C2BEC2AEC389C38FC2BFE28094C5A0C2
AA]], '姫' },
}

-- pairs() visits keys in unspecified order; sort the case names so the
-- output order is the same on every run.
local case_names = {}
for name in pairs(mojibake_correct_pairs) do
    case_names[#case_names + 1] = name
end
table.sort(case_names)

for _, name in ipairs(case_names) do
    local tbl = mojibake_correct_pairs[name]
    local mojibake = hex_string_to_bytes(tbl[1])
    local correct = tbl[2]
    io.stdout:write(string.format(
        '\nTrying to find the codepages that will transform:\n\t%s\n' ..
        'So that it contains:\n\t%s\n', mojibake, correct))
    run_assume_middle_utf8(encoding_list, mojibake, correct)
end
signature.asc
Description: OpenPGP digital signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel