Changeset: f349cdd547dc for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f349cdd547dc
Modified Files:
        common/stream/stream.c
Branch: Jan2014
Log Message:

When opening a file for a stream, check for a UTF-8 BOM.
When we find a BOM, we skip it and mark the stream as being
UTF-8.  For such a stream we can skip (and do skip) conversion with
iconv, so that on Windows, for example, you can run mclient with some
other encoding set but still read UTF-8 encoded files, as long as they
start with the BOM.
This fixes bug 3436.


diffs (75 lines):

diff --git a/common/stream/stream.c b/common/stream/stream.c
--- a/common/stream/stream.c
+++ b/common/stream/stream.c
@@ -115,6 +115,9 @@
 #define pclose _pclose
 #endif
 
+#define UTF8BOM                "\xEF\xBB\xBF" /* UTF-8 encoding of Unicode BOM */
+#define UTF8BOMLENGTH  3              /* length of above */
+
 #define short_int_SWAP(s) ((short)(((0x00ff&(s))<<8) | ((0xff00&(s))>>8)))
 
 #define normal_int_SWAP(i) (((0x000000ff&(i))<<24) | ((0x0000ff00&(i))<<8) | \
@@ -127,7 +130,8 @@
 
 struct stream {
        short byteorder;
-       short access;           /* read/write */
+       char access;            /* read/write */
+       char isutf8;            /* known to be UTF-8 due to BOM */
        short type;             /* ascii/binary */
        char *name;
        unsigned int timeout;
@@ -463,6 +467,7 @@ create_stream(const char *name)
                return NULL;
        s->byteorder = 1234;
        s->access = ST_READ;
+       s->isutf8 = 0;          /* not known for sure */
        s->type = ST_ASCII;
        s->name = strdup(name);
        s->stream_data.p = NULL;
@@ -625,6 +630,8 @@ open_stream(const char *filename, const 
 {
        stream *s;
        FILE *fp;
+       lng pos;
+       char buf[4];
 
        if ((s = create_stream(filename)) == NULL)
                return NULL;
@@ -639,6 +646,17 @@ open_stream(const char *filename, const 
        s->fgetpos = file_fgetpos;
        s->fsetpos = file_fsetpos;
        s->stream_data.p = (void *) fp;
+       /* if file is opened for reading, and it starts with the UTF-8
+        * encoding of the Unicode Byte Order Mark, skip the mark, and
+        * mark the stream as being a UTF-8 stream */
+       if (flags[0] == 'r' &&
+           file_fgetpos(s, &pos) == 0) {
+               if (file_read(s, buf, 1, UTF8BOMLENGTH) == 3 &&
+                   strncmp(buf, UTF8BOM, UTF8BOMLENGTH) == 0)
+                       s->isutf8 = 1;
+               else
+                       file_fsetpos(s, pos);
+       }
        return s;
 }
 
@@ -2157,6 +2175,8 @@ ic_open(iconv_t cd, stream *ss, const ch
        stream *s;
        struct icstream *ic;
 
+       if (ss->isutf8)
+               return ss;
        if ((s = create_stream(name)) == NULL)
                return NULL;
        s->read = ic_read;
@@ -2191,6 +2211,7 @@ iconv_rstream(stream *ss, const char *ch
                return NULL;
        s = ic_open(cd, ss, name);
        s->access = ST_READ;
+       s->isutf8 = 1;
        return s;
 }
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to