Package: catdoc
Version: 1:0.95-4
Tags: patch

This issue happens when running 'xls2csv' on XLS files containing international 
characters. The issue can be reproduced with the attached 'unicode_MS.xls' file.

Details: On further investigation I noticed that contents of the xls2csv 
process's heap memory were sometimes dumped to the output.
Note: Applying the attached patch to the source code of version 0.95-4 seems to 
fix the issue.

Attachment: unicode_MS.xls
Description: MS-Excel spreadsheet

diff --git a/src/xls.h b/src/xls.h
index c389e4a..48d4794 100644
--- a/src/xls.h
+++ b/src/xls.h
@@ -37,7 +37,9 @@ char *gettypename(long rectype);
 void parse_sst(unsigned char *sstbuf,int bufsize);
 void process_item (uint16_t rectype, uint16_t reclen, unsigned char *rec);
 unsigned char **allocate(int row,int col);
-unsigned char *copy_unicode_string(unsigned char **src);
+unsigned char *copy_unicode_string(unsigned char **src, int fromSst,
+									const unsigned char * const sourceStart,
+									const unsigned char * const sourceEnd);
 char convert8to8(char *src,int count);
 char *convert16to8(char *src,int count);
 void do_table(FILE *input,char *filename);
diff --git a/src/xlsparse.c b/src/xlsparse.c
index 53fc21c..cd8aa0e 100644
--- a/src/xlsparse.c
+++ b/src/xlsparse.c
@@ -128,6 +128,9 @@ unsigned char **sst=NULL;/* Shared string table parsed into array of strings in
 int sstsize = 0; /*Number of strings in SST*/
 unsigned char *sstBuffer=NULL; /*Unparsed sst to accumulate all its parts*/
 int sstBytes = 0; /*Size of SST Data, already accumulated in the buffer */
+int *sstPartBoundaries = NULL; /* offsets in the sstBuffer where the SST record
+								  was interrupted by continuation records, the
+								  array is terminated by a 0 */
 int codepage=1251; /*default*/
 int prev_rectype=0;
 /* holds a pointer to formula value, becouse value itself would be in
@@ -135,6 +138,34 @@ int prev_rectype=0;
  */
 unsigned char **saved_reference = NULL;
 
+/* Append boundary to the 0-terminated sstPartBoundaries list, growing it with realloc; return 0 on error and 1 on success */
+static int saveSstBoundary(int boundary) {
+	int *p;
+	size_t count; /* number of ints the list must hold after the append */
+
+	if (sstPartBoundaries) {
+		p = sstPartBoundaries;
+		while (*p) { /* walk to the 0 terminator */
+			p++;
+		}
+		count = (size_t) (p - sstPartBoundaries) + 2; /* existing entries + new entry + terminator */
+	} else {
+		count = 2; /* new entry + terminator */
+	}
+
+	p = (int*) realloc(sstPartBoundaries,
+				count * sizeof(sstPartBoundaries[0]));
+	if (!p) {
+		// allocation failure, leave boundary list as it is
+		return 0;
+	}
+
+	sstPartBoundaries = p;
+	sstPartBoundaries[count - 2] = boundary; /* NOTE: a boundary of 0 would be read as the terminator by the scan above */
+	sstPartBoundaries[count - 1] = 0; /* re-terminate the list */
+	return 1;
+}
+
 void process_item (uint16_t rectype, uint16_t reclen, unsigned char *rec) {
 	if (rectype != CONTINUE && prev_rectype == SST) {
 		/* we have accumulated  unparsed SST, and now encountered
@@ -193,7 +224,10 @@ void process_item (uint16_t rectype, uint16_t reclen, unsigned char *rec) {
 			free(sstBuffer);
 		if (sst != NULL)
 			free(sst);
-		
+		if (sstPartBoundaries != NULL) {
+			free(sstPartBoundaries);
+			sstPartBoundaries = NULL;
+		}
 		sstBuffer=(unsigned char*)malloc(reclen);
 		sstBytes = reclen;
 		if (sstBuffer == NULL ) {
@@ -211,7 +245,11 @@ void process_item (uint16_t rectype, uint16_t reclen, unsigned char *rec) {
 		if (sstBuffer == NULL ) {
 			perror("SSTptr realloc error! ");
 			exit(1);
-		}	  
+		}
+		/* add current SST buffer offset as a boundary in the SST boundaries
+		   list so that we know to use the extra byte that is inserted at each
+		   boundary between SST parts */
+		saveSstBoundary(sstBytes);
 		memcpy(sstBuffer+sstBytes,rec,reclen);
 		sstBytes+=reclen;
 		return;
@@ -226,7 +264,7 @@ void process_item (uint16_t rectype, uint16_t reclen, unsigned char *rec) {
 		col = getshort(rec,2);
 		/* 		fprintf(stderr,"LABEL!\n"); */
 		pcell=allocate(row,col);
-		*pcell=copy_unicode_string(&src);
+		*pcell=copy_unicode_string(&src, 0, rec, rec + reclen);
 		break;
 	}     
 	case BLANK: { int row,col;unsigned char **pcell;
@@ -362,7 +400,7 @@ void process_item (uint16_t rectype, uint16_t reclen, unsigned char *rec) {
 			fprintf(stderr,"String record without preceeding string formula\n");
 			break;
 		}
-		*saved_reference=copy_unicode_string(&src);
+		*saved_reference=copy_unicode_string(&src, 0, rec, rec + reclen);
 		break;
 	}	    
 	case BOF2:
@@ -429,7 +467,9 @@ void process_item (uint16_t rectype, uint16_t reclen, unsigned char *rec) {
 /*
  * Extracts string from sst and returns mallocked copy of it
  */
-unsigned char *copy_unicode_string (unsigned char **src) {
+unsigned char *copy_unicode_string (unsigned char **src, int fromSst,
+									const unsigned char * const sourceStart,
+									const unsigned char * const sourceEnd) {
 	int count=0;
 	int flags = 0;
 	int start_offset=0;
@@ -501,15 +541,29 @@ unsigned char *copy_unicode_string (unsigned char **src) {
 	*dest=0;l=0;
 	for (s=*src,d=dest,i=0;i<count;i++,s+=charsize) {
 		/* 		fprintf(stderr,"l=%d len=%d count=%d charsize=%d\n",l,len,count,charsize); */
-		if ( (charsize == 1 && (*s == 1 || *s == 0)) ||
-				 (charsize == 2 && (*s == 1 || *s == 0) && *(s+1) != 4)) {
-			/* 			fprintf(stderr,"extchar (unicode)=%02x %02x\n",*s, *(s+1)); */
-			charsize=(*s &0x01) ? 2 : 1;
-			if (charsize == 2)
-				s-=1;
-			count++;
-			continue;
+		if (fromSst && (*s == 1 || *s == 0)) {
+			int sstOffset = (int) (s - sourceStart);
+			int flagFound = 0;
+			int *k;
+			for (k = sstPartBoundaries; k && *k; k++) {
+				if (*k == sstOffset) {
+					/* we passed a CONTINUE record boundary, check the charsize
+					   (a.k.a. compression) flag */
+					charsize = (*s == 0 ? 1 : 2);
+					flagFound = 1;
+					break;
+				}
+			}
+
+			if (flagFound) {
+				if (charsize == 2) {
+					s -= 1; /* will be incremented by 2 */
+				}
+				count++;
+				continue;
+			}
 		}
+
 		if ( charsize == 2 ){
 			u=(unsigned short)getshort(s,0);
 			c=(unsigned char *)convert_char(u);
@@ -783,7 +837,7 @@ void parse_sst(unsigned char *sstbuf,int bufsize) {
 	for (i=0,parsedString=sst,curString=sstbuf+8;
 			 i<sstsize && curString<barrier; i++,parsedString++) {
 		/* 		fprintf(stderr,"copying %d string\n",i); */
-		*parsedString = copy_unicode_string(&curString);
+		*parsedString = copy_unicode_string(&curString, 1, sstbuf, barrier);
 	}       
 	/* 	fprintf(stderr,"end sst i=%d sstsize=%d\n",i,sstsize); */
 

Reply via email to