bug-wget
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Bug-wget] [PATCH 1-3/3] support HTTP compression


From: Ángel González
Subject: Re: [Bug-wget] [PATCH 1-3/3] support HTTP compression
Date: Wed, 17 Dec 2014 23:36:42 +0100
User-agent: Thunderbird

On 16/12/14 16:20, Yuriy M. Kaminskiy wrote:



Thanks!

Comments inline.

> From ccb95548926a0ab8ad4ed3787470d4803efad4ca Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy"<address@hidden>
Date: Tue, 16 Dec 2014 17:19:22 +0300
Subject: [PATCH 1/3] Recognize Content-Encoding header

---
  src/http.c |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
  src/wget.h |    8 +++++++-
  2 files changed, 56 insertions(+), 1 deletions(-)

diff --git a/src/http.c b/src/http.c
index 1a6cd39..c769e95 100644
--- a/src/http.c
+++ b/src/http.c
@@ -2682,6 +2682,39 @@ read_header:
            contlen = last_byte_pos - first_byte_pos + 1;
          }
      }
+
+  if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval)))
+    {
+      const char *p = hdrval;
+      if (p[0] == 'x'&&  p[1] == '-')
+        p += 2;
+      switch (p[0])
+        {
+        case 'g': case 'G':
+          if (0 == c_strcasecmp(p, "gzip"))
+            *dt |= CE_GZIP;
+          break;
+        case 'c': case 'C':
+          if (0 == c_strcasecmp(p, "compress"))
+            *dt |= CE_COMPRESS;
+          break;
+        case 'd': case 'D':
+          if (0 == c_strcasecmp(p, "deflate"))
+            *dt |= CE_DEFLATE;
+          break;
+        case 'b': case 'B':
+          if (0 == c_strcasecmp(p, "bzip2"))
+            *dt |= CE_BZIP2;
+          break;
+        default:
+          break;
+        }
+      if (!(*dt&  CE_ANY))
+        {
+          DEBUGP (("Unrecognized Content-Encoding: %s\n", p));
+        }
+    }
+
    resp_free (resp);

    /* 20x responses are counted among successful by default.  */
@@ -2810,6 +2843,18 @@ read_header:

    if (opt.adjust_extension)
      {
+      const char *ce_ext = ((*dt&  CE_GZIP) ? ".gz" :
+                            (*dt&  CE_COMPRESS) ? ".Z" :
+                            (*dt&  CE_BZIP2) ? ".bz2" :
+                            (*dt&  CE_DEFLATE) ? ".zlib" : NULL);
+      char *last_period;
+      if (ce_ext != NULL&&
+          (last_period = strrchr(hs->local_file, '.')) != NULL&&
+          strcasecmp(last_period, ce_ext) == 0)
+        /* strip Content-Encoding extension (it will be re-added later) */
+        {
+          *last_period = '\0';
+        }
        if (*dt&  TEXTHTML)
          /* -E / --adjust-extension / adjust_extension = on was specified,
             and this is a text/html file.  If some case-insensitive
@@ -2822,6 +2867,10 @@ read_header:
          {
            ensure_extension (hs, ".css", dt);
          }
+      if (ce_ext != NULL)
+        {
+          ensure_extension (hs, ce_ext, dt);
+        }
      }

    if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE

I don't think we should adjust the extension to reflect the compression. We just want the files saved uncompressed, don't we?

diff --git a/src/wget.h b/src/wget.h
index 6edbfb8..be5fe51 100644
--- a/src/wget.h
+++ b/src/wget.h
@@ -331,7 +331,13 @@ enum
    SEND_NOCACHE         = 0x0008,        /* send Pragma: no-cache directive */
    ACCEPTRANGES         = 0x0010,        /* Accept-ranges header was found */
    ADDED_HTML_EXTENSION = 0x0020,        /* added ".html" extension due to -E 
*/
-  TEXTCSS              = 0x0040         /* document is of type text/css */
+  TEXTCSS              = 0x0040,        /* document is of type text/css */
+  CE_COMPRESS          = 0x0100,        /* Content-Encoding: compress */
+  CE_DEFLATE           = 0x0200,        /* Content-Encoding: deflate */
+  CE_GZIP              = 0x0400,        /* Content-Encoding: gzip */
+  CE_BZIP2             = 0x0800,        /* Content-Encoding: bzip2 */
+#define CE_ANY (CE_COMPRESS|CE_DEFLATE|CE_GZIP|CE_BZIP2)
+                                        /* Any known Content-Encoding */
  };

  /* Universal error type -- used almost everywhere.  Error reporting of
--


> From b41339ba8d0e28f9033e83883d33889177ebbdc5 Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy"<address@hidden>
Date: Tue, 16 Dec 2014 17:29:34 +0300
Subject: [PATCH 2/3] Added option to support HTTP compression

Send Accept-Encoding header, decompress gzip-compressed document
(recognize by extension, should work well with --adjust-extension option)
---
  src/http.c    |    6 +++++-
  src/init.c    |    1 +
  src/main.c    |    3 +++
  src/options.h |    1 +
  src/utils.c   |   43 +++++++++++++++++++++++++++++++++++++++++++
  5 files changed, 53 insertions(+), 1 deletions(-)

diff --git a/src/http.c b/src/http.c
index c769e95..5ce6c93 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1798,6 +1798,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, 
struct url *proxy,
                          rel_value);
    SET_USER_AGENT (req);
    request_set_header (req, "Accept", "*/*", rel_none);
+  if (opt.compressed)
+    request_set_header (req, "Accept-Encoding", "gzip, deflate", rel_none);
If we are also supporting compress and bzip2, they should be listed.

+  else
    request_set_header (req, "Accept-Encoding", "identity", rel_none);

You should indent lines when they move into else branches.


    /* Find the username and password for authentication. */
@@ -2683,7 +2686,8 @@ read_header:
          }
      }

-  if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval)))
+  if (opt.compressed&&
+      resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval)))
      {
I would decompress the response even if we didn't ask for compression.

        const char *p = hdrval;
        if (p[0] == 'x'&&  p[1] == '-')
diff --git a/src/init.c b/src/init.c
index 088a6e9..ccf742a 100644
--- a/src/init.c
+++ b/src/init.c
@@ -154,6 +154,7 @@ static const struct {
    { "checkcertificate",&opt.check_cert,        cmd_boolean },
  #endif
    { "chooseconfig",&opt.choose_config,     cmd_file },
+  { "compressed",&opt.compressed,        cmd_boolean },
    { "connecttimeout",&opt.connect_timeout,   cmd_time },
    { "contentdisposition",&opt.content_disposition, cmd_boolean },
    { "contentonerror",&opt.content_on_error,  cmd_boolean },
diff --git a/src/main.c b/src/main.c
index 99c2819..6f0d7b2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -186,6 +186,7 @@ static struct cmdline_option option_data[] =
      { IF_SSL ("certificate-type"), 0, OPT_VALUE, "certificatetype", -1 },
      { IF_SSL ("check-certificate"), 0, OPT_BOOLEAN, "checkcertificate", -1 },
      { "clobber", 0, OPT__CLOBBER, NULL, optional_argument },
+    { "compressed", 'C', OPT_BOOLEAN, "compressed", -1 },
      { "config", 0, OPT_VALUE, "chooseconfig", -1 },
      { "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
      { "continue", 'c', OPT_BOOLEAN, "continue", -1 },
@@ -582,6 +583,8 @@ Directories:\n"),
      N_("\
    -nH, --no-host-directories       don't create host directories.\n"),
      N_("\
+       --compressed                support HTTP compression 
(Content-Encoding).\n"),
Maybe it would be better to name it "compression" (taking either "none" or a list of methods as its value)?

+    N_("\
         --protocol-directories      use protocol name in directories.\n"),
      N_("\
    -P,  --directory-prefix=PREFIX   save files to PREFIX/...\n"),
diff --git a/src/options.h b/src/options.h
index b995126..d1a7faf 100644
--- a/src/options.h
+++ b/src/options.h
@@ -288,6 +288,7 @@ struct options
    bool show_all_dns_entries;    /* Show all the DNS entries when resolving a
                                     name. */
    bool report_bps;              /*Output bandwidth in bits format*/
+  bool compressed;              /* Use Accept-Encoding/Content-Encoding */
  };

  extern struct options opt;
diff --git a/src/utils.c b/src/utils.c
index 42a7c4c..46c23a7 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -63,6 +63,10 @@ as that of the covered work.  */

  #include<sys/stat.h>

+#ifdef HAVE_LIBZ
+# include<zlib.h>
+#endif
+
  /* For TIOCGWINSZ and friends: */
  #include<sys/ioctl.h>
  #include<termios.h>
@@ -1154,6 +1158,9 @@ wget_read_file (const char *file)
    struct file_memory *fm;
    long size;
    bool inhibit_close = false;
+#ifdef HAVE_LIBZ
+  gzFile *zfile = NULL;
+#endif

    /* Some magic in the finest tradition of Perl and its kin: if FILE
       is "-", just use stdin.  */
@@ -1170,6 +1177,29 @@ wget_read_file (const char *file)
      return NULL;
    fm = xnew (struct file_memory);

+#ifdef HAVE_LIBZ
+  do {
+    const char *ext = strrchr(file, '.');
+    int zfd = -1;
+
+    if (!opt.compressed) break;
+    if (ext == NULL) break;
+    if (0 != strcasecmp(ext + 1, "gz")&&
+        0 != strcasecmp(ext + 1, "zlib"))
+      break;
+    if ((zfd = dup(fd)) == -1)
+      break;
+    if ((zfile = gzdopen(zfd, "r")) == NULL)
+      {
+        close(zfd);
+        break;
+      }
+#ifdef HAVE_MMAP
+    goto mmap_lose;
+#endif
+  } while(0);
+#endif
+
  #ifdef HAVE_MMAP
    {
      struct_fstat buf;
@@ -1224,6 +1254,11 @@ wget_read_file (const char *file)
            size<<= 1;
            fm->content = xrealloc (fm->content, size);
          }
+#ifdef HAVE_LIBZ
+      if (zfile != NULL)
+        nread = gzread (zfile, fm->content + fm->length, size - fm->length);
+      else
+#endif
        nread = read (fd, fm->content + fm->length, size - fm->length);
        if (nread>  0)
          /* Successful read. */
@@ -1237,6 +1272,10 @@ wget_read_file (const char *file)
      }
    if (!inhibit_close)
      close (fd);
+#ifdef HAVE_LIBZ
+  if (zfile != NULL)
+    gzclose(zfile);
+#endif
    if (size>  fm->length&&  fm->length != 0)
      /* Due to exponential growth of fm->content, the allocated region
         might be much larger than what is actually needed.  */
@@ -1245,6 +1284,10 @@ wget_read_file (const char *file)
    return fm;

   lose:
+#ifdef HAVE_LIBZ
+  if (zfile != NULL)
+    gzclose(zfile);
+#endif
    if (!inhibit_close)
      close (fd);
    xfree (fm->content);
--

> From d48d72d891773858aa70806787466224c2e8a31f Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy"<address@hidden>
Date: Tue, 16 Dec 2014 17:35:27 +0300
Subject: [PATCH 3/3] Attempt to recognize compressed files by signature

Should improve --compressed without --adjust-extension
---
  src/utils.c |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
  1 files changed, 47 insertions(+), 1 deletions(-)

diff --git a/src/utils.c b/src/utils.c
index 46c23a7..04f56b6 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -1160,6 +1160,7 @@ wget_read_file (const char *file)
    bool inhibit_close = false;
  #ifdef HAVE_LIBZ
    gzFile *zfile = NULL;
+  bool try_compression_once = opt.compressed;
  #endif

    /* Some magic in the finest tradition of Perl and its kin: if FILE
@@ -1182,15 +1183,19 @@ wget_read_file (const char *file)
      const char *ext = strrchr(file, '.');
      int zfd = -1;

-    if (!opt.compressed) break;
+    if (!try_compression_once) break;
      if (ext == NULL) break;
      if (0 != strcasecmp(ext + 1, "gz")&&
          0 != strcasecmp(ext + 1, "zlib"))
        break;
+    DEBUGP (("Decompress: extension matched: %s\n", ext));
+ force_compressed:
+    try_compression_once = false;
      if ((zfd = dup(fd)) == -1)
        break;
      if ((zfile = gzdopen(zfd, "r")) == NULL)
        {
+        DEBUGP (("gzdopen() failed\n"));
          close(zfd);
          break;
        }
@@ -1214,6 +1219,19 @@ wget_read_file (const char *file)
                          MAP_PRIVATE, fd, 0);
      if (fm->content == (char *)MAP_FAILED)
        goto mmap_lose;
+#ifdef HAVE_LIBZ
+    if (try_compression_once&&
+        fm->length>= 10 /* minimal gzip size */&&
+        fm->content[0] == '\x1f'&&  /* gzip signature */
+        fm->content[1] == '\x8b'&&
+        fm->content[2] == '\x08')
+      {
+        DEBUGP (("Automatic decompress check: ok, munmap\n"));
+        munmap (fm->content, fm->length);
+        fm->content = NULL;
+        goto force_compressed;
+      }
+#endif
      if (!inhibit_close)
        close (fd);

@@ -1269,6 +1287,34 @@ wget_read_file (const char *file)
        else
          /* EOF */
          break;
+#ifdef HAVE_LIBZ
+      if (try_compression_once /*&&  zfile == NULL */&&
+          fm->length - nread<  10 /* old length was less than minimum */&&
+          fm->length>= 10 /* new length is longer than minimum */)
+        {
+          try_compression_once = false;
+          DEBUGP (("Automatic decompress check: "));
+          if (!(fm->content[0] == '\x1f'&&  /* gzip signature */
+                fm->content[1] == '\x8b'&&
+                fm->content[2] == '\x08'))
+            {
+              DEBUGP (("gzip signature mismatch\n"));
+              continue;
+            }
+          /* try to seek back */
+          if (lseek (fd, SEEK_CUR, -(fm->length)) == (off_t)-1)
+            {
+              DEBUGP (("lseek(%ld) failed: %s\n", -(long)fm->length,
+                       strerror(errno)));
+              continue;
+            }
+          /* drop content and retry with compression */
+          xfree(fm->content);
+          fm->content = NULL;
+          DEBUGP (("ok, freed\n"));
+          goto force_compressed;
+        }
+#endif
      }
    if (!inhibit_close)
      close (fd);
--
Doesn't this last patch mean that if I download http://ftp.gnu.org/gnu/wget/wget-1.16.tar.gz and the compressed option is enabled, then I get a plain tar, not a tgz?

Cheers


reply via email to

[Prev in Thread] Current Thread [Next in Thread]