bug-wget
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Bug-wget] [PATCH 3/3] Add on the fly gzip decompression support


From: Tim Schlueter
Subject: [Bug-wget] [PATCH 3/3] Add on the fly gzip decompression support
Date: Fri, 28 Jul 2017 19:41:23 -0700
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Thunderbird/52.2.1

Add on the fly gzip decompression support to wget1.

If the --compression=none argument is given to wget, gzip decompression
is disabled, and wget will function as it has before this patch. The
"Accept-Encoding: identity" header will be sent (unless overridden by
--header) and any compressed response from the remote server will be
written out as-is to the appropriate file.  E.g. index.html without the
-E argument, index.html.gz with -E.

If --compression=auto is given (or no --compression argument is given)
and --continue or --start-pos are given, compression will be disabled
for backwards compatibility.

Otherwise, --compression=auto (or no --compression argument) functions
the same as --compression=gzip.

If the --compression=gzip argument is given, the --continue and
--start-pos arguments will be ignored and a warning will be printed if
they are present (see patch 2). The "Accept-Encoding: gzip" header will
be sent with the request (unless overridden). If the Content-Encoding
response header is set to "gzip" or "x-gzip", wget will decompress the
data on the fly and store them in the appropriate file (and -E will NOT
append .gz).

Incorrect server responses with the Content-Type set to */gzip will
NOT be decompressed and will instead be written out as-is (however, -E
will still ensure the file has a .gz extension).

---
 src/http.c |  39 ++++++++++++++++-
 src/retr.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 src/retr.h |   4 +-
 3 files changed, 180 insertions(+), 6 deletions(-)

diff --git a/src/http.c b/src/http.c
index a8c6e18..08b2ed6 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1581,6 +1581,7 @@ struct http_stat
 #endif

   encoding_t local_encoding;    /* the encoding of the local file */
+  encoding_t remote_encoding;   /* the encoding of the remote file */

   bool temporary;               /* downloading a temporary file */
 };
@@ -1693,6 +1694,9 @@ read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen,
   if (chunked_transfer_encoding)
     flags |= rb_chunked_transfer_encoding;

+  if (hs->remote_encoding == ENC_GZIP)
+    flags |= rb_compressed_gzip;
+
   hs->len = hs->restval;
   hs->rd_size = 0;
   /* Download the response body and write it to fp.
@@ -1886,7 +1890,12 @@ initialize_request (const struct url *u, struct http_stat *hs, int *dt, struct u
                         rel_value);
   SET_USER_AGENT (req);
   request_set_header (req, "Accept", "*/*", rel_none);
-  request_set_header (req, "Accept-Encoding", "identity", rel_none);
+#ifdef HAVE_LIBZ
+  if (opt.compression != compression_none)
+    request_set_header (req, "Accept-Encoding", "gzip", rel_none);
+  else
+#endif
+    request_set_header (req, "Accept-Encoding", "identity", rel_none);

   /* Find the username with priority */
   if (u->user)
@@ -3203,6 +3212,7 @@ gethttp (const struct url *u, struct url *original_url, struct http_stat *hs,
   hs->error = NULL;
   hs->message = NULL;
   hs->local_encoding = ENC_NONE;
+  hs->remote_encoding = ENC_NONE;

   conn = u;

@@ -3694,6 +3704,30 @@ gethttp (const struct url *u, struct url *original_url, struct http_stat *hs,
           DEBUGP (("Unrecognized Content-Encoding: %s\n", hdrval));
           hs->local_encoding = ENC_NONE;
         }
+#ifdef HAVE_LIBZ
+      else if (hs->local_encoding == ENC_GZIP
+               && opt.compression != compression_none)
+        {
+          /* Make sure the Content-Type is not gzip before decompressing */
+          const char * p = strchr (type, '/');
+          if (p == NULL)
+            {
+              hs->remote_encoding = ENC_GZIP;
+              hs->local_encoding = ENC_NONE;
+            }
+          else
+            {
+              p++;
+              if (c_tolower(p[0]) == 'x' && p[1] == '-')
+                p += 2;
+              if (0 != c_strcasecmp (p, "gzip"))
+                {
+                  hs->remote_encoding = ENC_GZIP;
+                  hs->local_encoding = ENC_NONE;
+                }
+            }
+        }
+#endif
     }

   /* 20x responses are counted among successful by default.  */
@@ -3930,6 +3964,9 @@ gethttp (const struct url *u, struct url *original_url, struct http_stat *hs,
     }
   if (contlen == -1)
     hs->contlen = -1;
+  /* If the response is gzipped, the uncompressed size is unknown. */
+  else if (hs->remote_encoding == ENC_GZIP)
+    hs->contlen = -1;
   else
     hs->contlen = contlen + contrange;

diff --git a/src/retr.c b/src/retr.c
index 0cf438e..a27d58a 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -41,6 +41,10 @@ as that of the covered work.  */
 # include <unixio.h>            /* For delete(). */
 #endif

+#ifdef HAVE_LIBZ
+# include <zlib.h>
+#endif
+
 #include "exits.h"
 #include "utils.h"
 #include "retr.h"
@@ -84,6 +88,22 @@ limit_bandwidth_reset (void)
   xzero (limit_data);
 }

+#ifdef HAVE_LIBZ
+static voidpf
+zalloc (voidpf opaque, unsigned int items, unsigned int size)
+{
+  (void) opaque;
+  return (voidpf) xcalloc (items, size);
+}
+
+static void
+zfree (voidpf opaque, voidpf address)
+{
+  (void) opaque;
+  xfree (address);
+}
+#endif
+
 /* Limit the bandwidth by pausing the download for an amount of time.
    BYTES is the number of bytes received from the network, and TIMER
    is the timer that started at the beginning of download.  */
@@ -257,6 +277,44 @@ fd_read_body (const char *downloaded_filename, int fd, FILE *out, wgint toread,
   wgint sum_written = 0;
   wgint remaining_chunk_size = 0;

+#ifdef HAVE_LIBZ
+  /* try to minimize the number of calls to inflate() and write_data() per
+     call to fd_read() */
+  unsigned int gzbufsize = dlbufsize * 4;
+  char *gzbuf = NULL;
+  z_stream gzstream;
+
+  if (flags & rb_compressed_gzip)
+    {
+      gzbuf = xmalloc (gzbufsize);
+      if (gzbuf != NULL)
+        {
+          gzstream.zalloc = zalloc;
+          gzstream.zfree = zfree;
+          gzstream.opaque = Z_NULL;
+          gzstream.next_in = Z_NULL;
+          gzstream.avail_in = 0;
+
+          #define GZIP_DETECT 32 /* gzip format detection */
+          #define GZIP_WINDOW 15 /* logarithmic window size (default: 15) */
+          ret = inflateInit2 (&gzstream, GZIP_DETECT | GZIP_WINDOW);
+          if (ret != Z_OK)
+            {
+              xfree (gzbuf);
+              errno = (ret == Z_MEM_ERROR) ? ENOMEM : EINVAL;
+              ret = -1;
+              goto out;
+            }
+        }
+      else
+        {
+          errno = ENOMEM;
+          ret = -1;
+          goto out;
+        }
+    }
+#endif
+
   if (flags & rb_skip_startpos)
     skip = startpos;

@@ -383,12 +441,64 @@ fd_read_body (const char *downloaded_filename, int fd, FILE *out, wgint toread,
           int write_res;

           sum_read += ret;
-          write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
-          if (write_res < 0)
+
+#ifdef HAVE_LIBZ
+          if (gzbuf != NULL)
             {
-              ret = (write_res == -3) ? -3 : -2;
-              goto out;
+              int err;
+              int towrite;
+              gzstream.avail_in = ret;
+              gzstream.next_in = (unsigned char *) dlbuf;
+
+              do
+                {
+                  gzstream.avail_out = gzbufsize;
+                  gzstream.next_out = (unsigned char *) gzbuf;
+
+                  err = inflate (&gzstream, Z_NO_FLUSH);
+
+                  switch (err)
+                    {
+                    case Z_MEM_ERROR:
+                      errno = ENOMEM;
+                      ret = -1;
+                      goto out;
+                    case Z_NEED_DICT:
+                    case Z_DATA_ERROR:
+                      errno = EINVAL;
+                      ret = -1;
+                      goto out;
+                    case Z_STREAM_END:
+                      if (exact && sum_read != toread)
+                        {
+                          DEBUGP(("zlib stream ended unexpectedly after "
+                                  "%ld/%ld bytes\n", sum_read, toread));
+                        }
+                    }
+
+                  towrite = gzbufsize - gzstream.avail_out;
+                  write_res = write_data (out, out2, gzbuf, towrite, &skip,
+                                          &sum_written);
+                  if (write_res < 0)
+                    {
+                      ret = (write_res == -3) ? -3 : -2;
+                      goto out;
+                    }
+                }
+              while (gzstream.avail_out == 0);
+            }
+          else
+#endif
+            {
+              write_res = write_data (out, out2, dlbuf, ret, &skip,
+                                      &sum_written);
+              if (write_res < 0)
+                {
+                  ret = (write_res == -3) ? -3 : -2;
+                  goto out;
+                }
             }
+
           if (chunked)
             {
               remaining_chunk_size -= ret;
@@ -433,6 +543,31 @@ fd_read_body (const char *downloaded_filename, int fd, FILE *out, wgint toread,
   if (timer)
     ptimer_destroy (timer);

+#ifdef HAVE_LIBZ
+  if (gzbuf != NULL)
+    {
+      int err = inflateEnd (&gzstream);
+      if (ret >= 0)
+        {
+          /* with compression enabled, ret must be 0 if successful */
+          if (err == Z_OK)
+            ret = 0;
+          else
+            {
+              errno = EINVAL;
+              ret = -1;
+            }
+        }
+      xfree (gzbuf);
+
+      if (gzstream.total_in != sum_read)
+        {
+          DEBUGP(("zlib read size differs from raw read size (%lu/%lu)\n",
+                  gzstream.total_in, sum_read));
+        }
+    }
+#endif
+
   if (qtyread)
     *qtyread += sum_read;
   if (qtywritten)
diff --git a/src/retr.h b/src/retr.h
index 5fbbacb..f133c83 100644
--- a/src/retr.h
+++ b/src/retr.h
@@ -49,7 +49,9 @@ enum {
   rb_skip_startpos = 2,

   /* Used by HTTP/HTTPS*/
-  rb_chunked_transfer_encoding = 4
+  rb_chunked_transfer_encoding = 4,
+
+  rb_compressed_gzip = 8
 };

 int fd_read_body (const char *, int, FILE *, wgint, wgint, wgint *, wgint *, double *, int, FILE *);
-- 

Attachment: signature.asc
Description: OpenPGP digital signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]