msodumper/globals.py | 86 +++++++++++++++++++++++++++++++++++++++---------- msodumper/node.py | 9 +++-- msodumper/ole.py | 2 + msodumper/xlsrecord.py | 40 ++++++++++++++-------- msodumper/xlsstream.py | 49 ++++++++++++++++++++++----- 5 files changed, 144 insertions(+), 42 deletions(-)
New commits: commit 595542f5865f13aee61b161f468c4ee8b25cf6ad Author: Jean-Francois Dockes <j...@dockes.org> Date: Thu Jan 9 08:20:21 2014 +0100 Accept shorter BOF record diff --git a/msodumper/xlsrecord.py b/msodumper/xlsrecord.py index 0c55b60..ac070f4 100644 --- a/msodumper/xlsrecord.py +++ b/msodumper/xlsrecord.py @@ -627,16 +627,27 @@ class BOF(BaseRecordHandler): self.buildYear = self.readUnsignedInt(2) # file history flags - self.flags = self.readUnsignedInt(4) - self.win = (self.flags & 0x00000001) - self.risc = (self.flags & 0x00000002) - self.beta = (self.flags & 0x00000004) - self.winAny = (self.flags & 0x00000008) - self.macAny = (self.flags & 0x00000010) - self.betaAny = (self.flags & 0x00000020) - self.riscAny = (self.flags & 0x00000100) - self.lowestExcelVer = self.readSignedInt(4) - + try: + self.flags = self.readUnsignedInt(4) + self.win = (self.flags & 0x00000001) + self.risc = (self.flags & 0x00000002) + self.beta = (self.flags & 0x00000004) + self.winAny = (self.flags & 0x00000008) + self.macAny = (self.flags & 0x00000010) + self.betaAny = (self.flags & 0x00000020) + self.riscAny = (self.flags & 0x00000100) + self.lowestExcelVer = self.readSignedInt(4) + except: + self.flags = 0 + self.win = 0 + self.risc = 0 + self.beta = 0 + self.winAny = 0 + self.macAny = 0 + self.betaAny = 0 + self.riscAny = 0 + self.lowestExcelVer = 0 + def parseBytes (self): self.__parseBytes() # BIFF version commit 2d4f7cf511275a7a6103bce1d75963354e7e14fa Author: Jean-Francois Dockes <j...@dockes.org> Date: Wed Jan 8 17:20:29 2014 +0100 Process CONTINUE records by appending them to their base. Specially process SST CONTINUE records by handling the string compression byte (grbit) at the beginning of each CONTINUE record. diff --git a/msodumper/globals.py b/msodumper/globals.py index ca817ec..276a317 100644 --- a/msodumper/globals.py +++ b/msodumper/globals.py @@ -164,25 +164,55 @@ def encodeName (name, lowOnly = False, lowLimit = 0x20): return newname +# Uncompress "compressed" UTF-16. This compression strips high bytes +# from a string when they are all 0. Just restore them. +def uncompCompUnicode(bytes): + out = "" + for b in bytes: + out += b + out += '\0' + return out class UnicodeRichExtText(object): def __init__ (self): - self.baseText = '' + self.baseText = unicode() self.phoneticBytes = [] +# Linear search for index of first element in sorted list strictly +# bigger than a given value. Might be converted to binary search, but our +# lists (CONTINUE record offsets) are small. If the returned index is +# the list size (last valid index+1), the input value is beyond the +# max list value +def find_first_bigger(ilist, value): + i = 0 + while i < len(ilist) and value >= ilist[i]: + i +=1 + return i -def getUnicodeRichExtText (bytes): +def getUnicodeRichExtText (bytes, offset = 0, rofflist = []): + if len(rofflist) == 0: + rofflist = [len(bytes)] ret = UnicodeRichExtText() # Avoid myriad of messages when in "catching" mode if params.catchExceptions and (bytes is None or len(bytes) == 0): return ret, 0 + + if len(rofflist) == 0 or rofflist[len(rofflist)-1] != len(bytes): + error("bad input to getUnicodeRichExtText: empty offset list or last offset != size. size %d list %s" % (len(bytes), str(rofflist))) + raise ByteStreamError() + strm = ByteStream(bytes) + strm.setCurrentPos(offset) + try: textLen = strm.readUnsignedInt(2) flags = strm.readUnsignedInt(1) # 0 0 0 0 0 0 0 0 # |-------|D|C|B|A| - isDoubleByte = (flags & 0x01) > 0 # A + if (flags & 0x01) > 0: # A + bytesPerChar = 2 + else: + bytesPerChar = 1 ignored = (flags & 0x02) > 0 # B hasPhonetic = (flags & 0x04) > 0 # C isRichStr = (flags & 0x08) > 0 # D @@ -195,18 +225,42 @@ def getUnicodeRichExtText (bytes): if hasPhonetic: phoneticBytes = strm.readUnsignedInt(4) - if isDoubleByte: - # double-byte string (UTF-16) - ret.baseText = \ - unicode(strm.readBytes(2*textLen), 'UTF-16LE', errors='replace') - else: - # "Compressed Unicode" string. UTF-16 without the zero - # octets. These have to be latin1 - if params.utf8: - ret.baseText = strm.readBytes(textLen).decode('cp1252') - else: - # If utf8 is not set, we'll print hex bytes, keep data as is - ret.baseText = strm.readBytes(textLen) + # Reading the string proper. This is made a bit more + # complicated by the fact that the format can switch from + # compressed (latin data with high zeros stripped) to normal + # (UTF-16LE) whenever a string encounters a CONTINUE record + # boundary. The new format is indicated by a single byte at + # the start of the CONTINUE record payload. + while textLen > 0: + #print("Reading Unicode with bytesPerChar %d" % bytesPerChar) + bytesToRead = textLen * bytesPerChar + + # Truncate to next record boundary + ibound = find_first_bigger(rofflist, strm.getCurrentPos()) + if ibound == len(rofflist): + # Just try to read and let the stream raise an exception + strm.readBytes(bytesToRead) + return + + bytesToRead = min(bytesToRead, \ + rofflist[ibound]- strm.getCurrentPos()) + newdata = strm.readBytes(bytesToRead) + if bytesPerChar == 1: + newdata = uncompCompUnicode(newdata) + + ret.baseText += unicode(newdata, 'UTF-16LE', errors='replace') + + textLen -= bytesToRead // bytesPerChar + + # If there is still data to read, we hit a record boundary. Read + # the grbit byte for detecting possible compression switch + if textLen > 0: + grbit = strm.readUnsignedInt(1) + if (grbit & 1) != 0: + bytesPerChar = 2 + else: + bytesPerChar = 1 + if isRichStr: for i in xrange(0, numElem): posChar = strm.readUnsignedInt(2) @@ -219,7 +273,7 @@ def getUnicodeRichExtText (bytes): raise error("getUnicodeRichExtText: %s\n" % e) return ret, len(bytes) - return ret, strm.getCurrentPos() + return ret, strm.getCurrentPos() - offset def getRichText (bytes, textLen=None): diff --git a/msodumper/node.py b/msodumper/node.py index abf67ca..bab92bd 100644 --- a/msodumper/node.py +++ b/msodumper/node.py @@ -128,15 +128,18 @@ def encodeString (sin, utf8 = False): sout1 = sin.encode('UTF-8') else: sout1 = sin - # Escape special characters as entities + # Escape special characters as entities. Can't keep zero bytes either + # (bad XML). They can only arrive here if there is a bug somewhere. for c in sout1: - if c in encodeTable: + if ord(c) == 0: + sout += '(nullbyte)' + elif c in encodeTable: sout += '&' + encodeTable[c] + ';' else: sout += c else: for c in sin: - if ord(c) >= 128: + if ord(c) >= 128 or ord(c) == 0: # encode non-ascii ranges. sout += "\\x%2.2x"%ord(c) elif encodeTable.has_key(c): diff --git a/msodumper/xlsrecord.py b/msodumper/xlsrecord.py index 5fa3fc6..0c55b60 100644 --- a/msodumper/xlsrecord.py +++ b/msodumper/xlsrecord.py @@ -249,11 +249,12 @@ class DXFN12NoCB(object): class BaseRecordHandler(globals.ByteStream): - def __init__ (self, header, size, bytes, strmData): + def __init__ (self, header, size, bytes, strmData, roflist = []): globals.ByteStream.__init__(self, bytes) self.header = header self.lines = [] self.strmData = strmData + self.roflist = roflist def parseBytes (self): """Parse the original bytes and generate human readable output. @@ -1599,7 +1600,7 @@ class SST(BaseRecordHandler): self.strCount = self.readSignedInt(4) # total number of unique strings. self.sharedStrings = [] for i in xrange(0, self.strCount): - extText, bytesRead = globals.getUnicodeRichExtText(self.bytes[self.getCurrentPos():]) + extText, bytesRead = globals.getUnicodeRichExtText(self.bytes, self.getCurrentPos(), self.roflist) self.readBytes(bytesRead) # advance current position. self.sharedStrings.append(extText) @@ -1922,7 +1923,7 @@ class SupBook(BaseRecordHandler): self.moveBack(2) pos = self.getCurrentPos() while pos < self.size: - ret, bytesLen = globals.getUnicodeRichExtText(self.bytes[pos:]) + ret, bytesLen = globals.getUnicodeRichExtText(self.bytes, pos) name = ret.baseText self.moveForward(bytesLen) self.names.append(name) @@ -2167,7 +2168,7 @@ class Crn(BaseRecordHandler): elif typeId == 0x02: # string pos = self.getCurrentPos() - ret, length = globals.getUnicodeRichExtText(self.bytes[pos:]) + ret, length = globals.getUnicodeRichExtText(self.bytes, pos) text = ret.baseText text = globals.encodeName(text) self.moveForward(length) diff --git a/msodumper/xlsstream.py b/msodumper/xlsstream.py index debce12..945c5a8 100644 --- a/msodumper/xlsstream.py +++ b/msodumper/xlsstream.py @@ -449,12 +449,43 @@ class XLDirStream(object): bytes = self.readByteArray(size) return pos, header, size, bytes - def __getRecordHandler (self, header, size, bytes): + def __readRecAndContBytes(self): + '''Read record itself and possible CONTINUE blocks.''' + + pos, header, size, bytes = self.__readRecordBytes() + + # Records boundaries/offset list (only useful if there are + # CONTINUE records) + roflist = [size] + + # Read possible CONTINUE records, and concatenate the data + while self.peekNext() == 0x3c: + cpos, cheader, csize, cbytes = self.__readRecordBytes() + bytes += cbytes + size += csize + roflist.append(size) + + return pos, header, size, bytes, roflist + + def peekNext (self): + '''Check type of next record without changing stream state''' + + if self.size - self.pos < 4: + raise EndOfStream + + pos = self.pos + header = self.readRaw(2) + if header == 0x0000: + raise EndOfStream + self.pos = pos + return header + + def __getRecordHandler (self, header, size, bytes, roflist): # record handler that parses the raw bytes and displays more # meaningful information. handler = None if recData.has_key(header) and len(recData[header]) >= 3: - handler = recData[header][2](header, size, bytes, self.strmData) + handler = recData[header][2](header, size, bytes, self.strmData, roflist) if handler != None and self.strmData.encrypted: # record handler exists. Parse the record and display more info @@ -470,8 +501,8 @@ class XLDirStream(object): self.strmData.encrypted = True def fillModel (self, model): - pos, header, size, bytes = self.__readRecordBytes() - handler = self.__getRecordHandler(header, size, bytes) + pos, header, size, bytes, roflist = self.__readRecAndContBytes() + handler = self.__getRecordHandler(header, size, bytes, roflist) if handler != None: try: handler.fillModel(model) @@ -483,11 +514,11 @@ class XLDirStream(object): def getNextRecordHandler (self): - pos, header, size, bytes = self.__readRecordBytes() - return self.__getRecordHandler(header, size, bytes) + pos, header, size, bytes, roflist = self.__readRecAndContBytes() + return self.__getRecordHandler(header, size, bytes, roflist) def readRecord (self): - pos, header, size, bytes = self.__readRecordBytes() + pos, header, size, bytes, roflist = self.__readRecAndContBytes() # record handler that parses the raw bytes and displays more # meaningful information. @@ -500,12 +531,12 @@ class XLDirStream(object): print("%4.4Xh: %s - %s (%4.4Xh)"% (header, recData[header][0], recData[header][1], header)) if len(recData[header]) >= 3: - handler = recData[header][2](header, size, bytes, self.strmData) + handler = recData[header][2](header, size, bytes, self.strmData, roflist) elif self.type == DirType.RevisionLog and recDataRev.has_key(header): print("%4.4Xh: %s - %s (%4.4Xh)"% (header, recDataRev[header][0], recDataRev[header][1], header)) if len(recDataRev[header]) >= 3: - handler = recDataRev[header][2](header, size, bytes, self.strmData) + handler = recDataRev[header][2](header, size, bytes, self.strmData, roflist) else: print("%4.4Xh: [unknown record name] (%4.4Xh)"%(header, header)) commit 23d56056390cb8146ce36deff3c608e37453130a Author: Jean-Francois Dockes <j...@dockes.org> Date: Wed Jan 8 15:49:39 2014 +0100 Prevent possible infinite loop on bad input diff --git a/msodumper/ole.py b/msodumper/ole.py index dc284fb..16f3cb9 100644 --- a/msodumper/ole.py +++ b/msodumper/ole.py @@ -203,6 +203,8 @@ class Header(object): # additional sectors are used to store more SAT sector IDs. secID = self.__secIDFirstMSAT size = self.getSectorSize() + if size < 4: + raise Exception("ole.Header::parse: got %d as sector size!" % size) inLoop = True while inLoop: pos = 512 + secID*size _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits