Hi,
I wanted to help the Debian project and found the gzip bug report

  https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=575884

Despite not being a C programmer, I decided to give it a try.
Since this is the first time I have written a patch for an open source
project, I thought I should first ask upstream whether it makes sense.

In the code of gzip.c there is the comment:

 * If the lseek fails, we could use read() to get to the end, but
 * --list is used to get quick results.
 * Use "gunzip < foo.gz | wc -c" to get the uncompressed size if
 * you are not concerned about speed.

Assuming that comment is correct, the patch does just that: when the
lseek fails, it uses read() to get to the end.
After applying the patch and running

  $ time cat rnd0.bin.gz | gzip -l -

on a gzipped 3GB file created from /dev/urandom, the result is:

   compressed        uncompressed  ratio uncompressed_name
   3000485948          3000000000  -0.0% stdout

   real   0m0.740s
   user   0m0.013s
   sys    0m1.134s

To me it seems quite fast, but maybe gzip is used with much bigger
files and one second is too slow.

The patch is attached to this e-mail.
I'd like to know what you think about it.
Thanks
 
 
Index: b/gzip.c
===================================================================
--- a/gzip.c
+++ b/gzip.c
@@ -1727,6 +1727,7 @@ local void do_list(ifd, method)
     int ifd;     /* input file descriptor */
     int method;  /* compression method */
 {
+    const off_t save_bytes_in = bytes_in;
     ulg crc;  /* original crc */
     static int first_time = 1;
     static char const *const methods[MAX_METHODS] = {
@@ -1772,11 +1773,14 @@ local void do_list(ifd, method)

     if (method == DEFLATED && !last_member) {
         /* Get the crc and uncompressed size for gzip'ed (not zip'ed) files.
-         * If the lseek fails, we could use read() to get to the end, but
-         * --list is used to get quick results.
-         * Use "gunzip < foo.gz | wc -c" to get the uncompressed size if
-         * you are not concerned about speed.
          */
+        if (insize != INBUFSIZ) {
+            /* eof: no need to lseek */
+            /* assert( insize >= 8 ) */
+            bytes_in  = save_bytes_in;
+            crc       = LG(inbuf + insize - 8);
+            bytes_out = LG(inbuf + insize - 4);
+        } else {
         bytes_in = lseek(ifd, (off_t)(-8), SEEK_END);
         if (bytes_in != -1L) {
             uch buf[8];
@@ -1786,6 +1790,62 @@ local void do_list(ifd, method)
             }
             crc       = LG(buf);
             bytes_out = LG(buf+4);
+        } else {
+            /* assert(insize == INBUFSIZ) */
+            /* assert((INBUFSIZ % 2) == 0) */
+            bytes_in = save_bytes_in;
+            const int half_buf_size = INBUFSIZ / 2;
+            /* If present (possibly partially), the last 8 bytes can only
+             * be in the second half of the inbuf buffer,
+             * so the next block to read is the first half. */
+            ssize_t nread;
+            uch *buf;
+            size_t buf_to_read = half_buf_size;
+            int half_idx = 0;
+            errno = 0; /* reset lseek error */
+            insize = 0;
+            while (1) {
+                nread = read_buffer(ifd, inbuf + half_idx * half_buf_size, buf_to_read);
+                if (nread == 0) {
+                    break;
+                }
+                if (nread < 0) {
+                    read_error();
+                }
+                bytes_in += nread;
+                insize += nread;
+                buf_to_read -= nread;
+                if (buf_to_read == 0) {
+                    buf_to_read = half_buf_size;
+                    insize = 0;
+                    half_idx = 1 - half_idx;
+                }
+            }
+            insize = half_buf_size - buf_to_read;
+            if (insize >= 8) {
+                /* All 8 bytes fit in the current half buffer */
+                buf = inbuf + half_idx * half_buf_size + insize - 8;
+            } else if (insize == 0) {
+                /* All 8 bytes are in the other half buffer */
+                buf = inbuf + (1 - half_idx) * half_buf_size + half_buf_size - 8;
+            } else {
+                /* The 8 bytes are partially on the other half buffer */
+                if (half_idx == 1) {
+                    /* The 8 bytes are contiguous */
+                    buf = inbuf + half_buf_size + insize - 8;
+                } else {
+                    /* The end of the 8 bytes is at the beginning of the first half,
+                     * the start of the 8 bytes is at the end of the second half.
+                     * Let's move them both at the start of the first half. */
+                    const size_t start_size = 8 - insize;
+                    memmove(inbuf + start_size, inbuf, insize);
+                    memcpy(inbuf, inbuf + half_buf_size + half_buf_size - start_size, start_size);
+                    buf = inbuf;
+                }
+            }
+            crc       = LG(buf);
+            bytes_out = LG(buf+4);
+        }
         }
     }

Reply via email to