bug-wget
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Bug-wget] [PATCH 1-3/3] support HTTP compression (was: HTTP gzip compression)


From: Yuriy M. Kaminskiy
Subject: [Bug-wget] [PATCH 1-3/3] support HTTP compression (was: HTTP gzip compression)
Date: Tue, 16 Dec 2014 18:20:59 +0300
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.6.0

Ángel González wrote:
> No problem for suggesting things. I agree it would be cool. Doesn't even
> look too hard, just detect the Content-Encoding: gzip and filter through gzip
> -d when saving.
> But someone needs to code it :)

Something like attached.

>From ccb95548926a0ab8ad4ed3787470d4803efad4ca Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <address@hidden>
Date: Tue, 16 Dec 2014 17:19:22 +0300
Subject: [PATCH 1/3] Recognize Content-Encoding header

---
 src/http.c |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/wget.h |    8 +++++++-
 2 files changed, 56 insertions(+), 1 deletions(-)

diff --git a/src/http.c b/src/http.c
index 1a6cd39..c769e95 100644
--- a/src/http.c
+++ b/src/http.c
@@ -2682,6 +2682,39 @@ read_header:
           contlen = last_byte_pos - first_byte_pos + 1;
         }
     }
+
+  if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval)))
+    {
+      const char *p = hdrval;
+      if (p[0] == 'x' && p[1] == '-')
+        p += 2;
+      switch (p[0])
+        {
+        case 'g': case 'G':
+          if (0 == c_strcasecmp(p, "gzip"))
+            *dt |= CE_GZIP;
+          break;
+        case 'c': case 'C':
+          if (0 == c_strcasecmp(p, "compress"))
+            *dt |= CE_COMPRESS;
+          break;
+        case 'd': case 'D':
+          if (0 == c_strcasecmp(p, "deflate"))
+            *dt |= CE_DEFLATE;
+          break;
+        case 'b': case 'B':
+          if (0 == c_strcasecmp(p, "bzip2"))
+            *dt |= CE_BZIP2;
+          break;
+        default:
+          break;
+        }
+      if (!(*dt & CE_ANY))
+        {
+          DEBUGP (("Unrecognized Content-Encoding: %s\n", p));
+        }
+    }
+
   resp_free (resp);
 
   /* 20x responses are counted among successful by default.  */
@@ -2810,6 +2843,18 @@ read_header:
 
   if (opt.adjust_extension)
     {
+      const char *ce_ext = ((*dt & CE_GZIP) ? ".gz" :
+                            (*dt & CE_COMPRESS) ? ".Z" :
+                            (*dt & CE_BZIP2) ? ".bz2" :
+                            (*dt & CE_DEFLATE) ? ".zlib" : NULL);
+      char *last_period;
+      if (ce_ext != NULL &&
+          (last_period = strrchr(hs->local_file, '.')) != NULL &&
+          strcasecmp(last_period, ce_ext) == 0)
+        /* strip Content-Encoding extension (it will be re-added later) */
+        {
+          *last_period = '\0';
+        }
       if (*dt & TEXTHTML)
         /* -E / --adjust-extension / adjust_extension = on was specified,
            and this is a text/html file.  If some case-insensitive
@@ -2822,6 +2867,10 @@ read_header:
         {
           ensure_extension (hs, ".css", dt);
         }
+      if (ce_ext != NULL)
+        {
+          ensure_extension (hs, ce_ext, dt);
+        }
     }
 
   if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE
diff --git a/src/wget.h b/src/wget.h
index 6edbfb8..be5fe51 100644
--- a/src/wget.h
+++ b/src/wget.h
@@ -331,7 +331,13 @@ enum
   SEND_NOCACHE         = 0x0008,        /* send Pragma: no-cache directive */
   ACCEPTRANGES         = 0x0010,        /* Accept-ranges header was found */
   ADDED_HTML_EXTENSION = 0x0020,        /* added ".html" extension due to -E */
-  TEXTCSS              = 0x0040         /* document is of type text/css */
+  TEXTCSS              = 0x0040,        /* document is of type text/css */
+  CE_COMPRESS          = 0x0100,        /* Content-Encoding: compress */
+  CE_DEFLATE           = 0x0200,        /* Content-Encoding: deflate */
+  CE_GZIP              = 0x0400,        /* Content-Encoding: gzip */
+  CE_BZIP2             = 0x0800,        /* Content-Encoding: bzip2 */
+#define CE_ANY (CE_COMPRESS|CE_DEFLATE|CE_GZIP|CE_BZIP2)
+                                        /* Any known Content-Encoding */
 };
 
 /* Universal error type -- used almost everywhere.  Error reporting of
-- 

>From b41339ba8d0e28f9033e83883d33889177ebbdc5 Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <address@hidden>
Date: Tue, 16 Dec 2014 17:29:34 +0300
Subject: [PATCH 2/3] Added option to support HTTP compression

Send the Accept-Encoding header and decompress gzip-compressed documents
(recognized by file extension, so this should work well together with the
--adjust-extension option).
---
 src/http.c    |    6 +++++-
 src/init.c    |    1 +
 src/main.c    |    3 +++
 src/options.h |    1 +
 src/utils.c   |   43 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 53 insertions(+), 1 deletions(-)

diff --git a/src/http.c b/src/http.c
index c769e95..5ce6c93 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1798,6 +1798,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
                         rel_value);
   SET_USER_AGENT (req);
   request_set_header (req, "Accept", "*/*", rel_none);
+  if (opt.compressed)
+    request_set_header (req, "Accept-Encoding", "gzip, deflate", rel_none);
+  else
   request_set_header (req, "Accept-Encoding", "identity", rel_none);
 
   /* Find the username and password for authentication. */
@@ -2683,7 +2686,8 @@ read_header:
         }
     }
 
-  if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval)))
+  if (opt.compressed &&
+      resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval)))
     {
       const char *p = hdrval;
       if (p[0] == 'x' && p[1] == '-')
diff --git a/src/init.c b/src/init.c
index 088a6e9..ccf742a 100644
--- a/src/init.c
+++ b/src/init.c
@@ -154,6 +154,7 @@ static const struct {
   { "checkcertificate", &opt.check_cert,        cmd_boolean },
 #endif
   { "chooseconfig",     &opt.choose_config,     cmd_file },
+  { "compressed",       &opt.compressed,        cmd_boolean },
   { "connecttimeout",   &opt.connect_timeout,   cmd_time },
   { "contentdisposition", &opt.content_disposition, cmd_boolean },
   { "contentonerror",   &opt.content_on_error,  cmd_boolean },
diff --git a/src/main.c b/src/main.c
index 99c2819..6f0d7b2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -186,6 +186,7 @@ static struct cmdline_option option_data[] =
     { IF_SSL ("certificate-type"), 0, OPT_VALUE, "certificatetype", -1 },
     { IF_SSL ("check-certificate"), 0, OPT_BOOLEAN, "checkcertificate", -1 },
     { "clobber", 0, OPT__CLOBBER, NULL, optional_argument },
+    { "compressed", 'C', OPT_BOOLEAN, "compressed", -1 },
     { "config", 0, OPT_VALUE, "chooseconfig", -1 },
     { "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
     { "continue", 'c', OPT_BOOLEAN, "continue", -1 },
@@ -582,6 +583,8 @@ Directories:\n"),
     N_("\
   -nH, --no-host-directories       don't create host directories.\n"),
     N_("\
+       --compressed                support HTTP compression 
(Content-Encoding).\n"),
+    N_("\
        --protocol-directories      use protocol name in directories.\n"),
     N_("\
   -P,  --directory-prefix=PREFIX   save files to PREFIX/...\n"),
diff --git a/src/options.h b/src/options.h
index b995126..d1a7faf 100644
--- a/src/options.h
+++ b/src/options.h
@@ -288,6 +288,7 @@ struct options
   bool show_all_dns_entries;    /* Show all the DNS entries when resolving a
                                    name. */
   bool report_bps;              /*Output bandwidth in bits format*/
+  bool compressed;              /* Use Accept-Encoding/Content-Encoding */
 };
 
 extern struct options opt;
diff --git a/src/utils.c b/src/utils.c
index 42a7c4c..46c23a7 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -63,6 +63,10 @@ as that of the covered work.  */
 
 #include <sys/stat.h>
 
+#ifdef HAVE_LIBZ
+# include <zlib.h>
+#endif
+
 /* For TIOCGWINSZ and friends: */
 #include <sys/ioctl.h>
 #include <termios.h>
@@ -1154,6 +1158,9 @@ wget_read_file (const char *file)
   struct file_memory *fm;
   long size;
   bool inhibit_close = false;
+#ifdef HAVE_LIBZ
+  gzFile *zfile = NULL;
+#endif
 
   /* Some magic in the finest tradition of Perl and its kin: if FILE
      is "-", just use stdin.  */
@@ -1170,6 +1177,29 @@ wget_read_file (const char *file)
     return NULL;
   fm = xnew (struct file_memory);
 
+#ifdef HAVE_LIBZ
+  do {
+    const char *ext = strrchr(file, '.');
+    int zfd = -1;
+
+    if (!opt.compressed) break;
+    if (ext == NULL) break;
+    if (0 != strcasecmp(ext + 1, "gz") &&
+        0 != strcasecmp(ext + 1, "zlib"))
+      break;
+    if ((zfd = dup(fd)) == -1)
+      break;
+    if ((zfile = gzdopen(zfd, "r")) == NULL)
+      {
+        close(zfd);
+        break;
+      }
+#ifdef HAVE_MMAP
+    goto mmap_lose;
+#endif
+  } while(0);
+#endif
+
 #ifdef HAVE_MMAP
   {
     struct_fstat buf;
@@ -1224,6 +1254,11 @@ wget_read_file (const char *file)
           size <<= 1;
           fm->content = xrealloc (fm->content, size);
         }
+#ifdef HAVE_LIBZ
+      if (zfile != NULL)
+        nread = gzread (zfile, fm->content + fm->length, size - fm->length);
+      else
+#endif
       nread = read (fd, fm->content + fm->length, size - fm->length);
       if (nread > 0)
         /* Successful read. */
@@ -1237,6 +1272,10 @@ wget_read_file (const char *file)
     }
   if (!inhibit_close)
     close (fd);
+#ifdef HAVE_LIBZ
+  if (zfile != NULL)
+    gzclose(zfile);
+#endif
   if (size > fm->length && fm->length != 0)
     /* Due to exponential growth of fm->content, the allocated region
        might be much larger than what is actually needed.  */
@@ -1245,6 +1284,10 @@ wget_read_file (const char *file)
   return fm;
 
  lose:
+#ifdef HAVE_LIBZ
+  if (zfile != NULL)
+    gzclose(zfile);
+#endif
   if (!inhibit_close)
     close (fd);
   xfree (fm->content);
-- 

>From d48d72d891773858aa70806787466224c2e8a31f Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <address@hidden>
Date: Tue, 16 Dec 2014 17:35:27 +0300
Subject: [PATCH 3/3] Attempt to recognize compressed files by signature

Should improve the behavior of --compressed when it is used without --adjust-extension.
---
 src/utils.c |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 47 insertions(+), 1 deletions(-)

diff --git a/src/utils.c b/src/utils.c
index 46c23a7..04f56b6 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -1160,6 +1160,7 @@ wget_read_file (const char *file)
   bool inhibit_close = false;
 #ifdef HAVE_LIBZ
   gzFile *zfile = NULL;
+  bool try_compression_once = opt.compressed;
 #endif
 
   /* Some magic in the finest tradition of Perl and its kin: if FILE
@@ -1182,15 +1183,19 @@ wget_read_file (const char *file)
     const char *ext = strrchr(file, '.');
     int zfd = -1;
 
-    if (!opt.compressed) break;
+    if (!try_compression_once) break;
     if (ext == NULL) break;
     if (0 != strcasecmp(ext + 1, "gz") &&
         0 != strcasecmp(ext + 1, "zlib"))
       break;
+    DEBUGP (("Decompress: extension matched: %s\n", ext));
+ force_compressed:
+    try_compression_once = false;
     if ((zfd = dup(fd)) == -1)
       break;
     if ((zfile = gzdopen(zfd, "r")) == NULL)
       {
+        DEBUGP (("gzdopen() failed\n"));
         close(zfd);
         break;
       }
@@ -1214,6 +1219,19 @@ wget_read_file (const char *file)
                         MAP_PRIVATE, fd, 0);
     if (fm->content == (char *)MAP_FAILED)
       goto mmap_lose;
+#ifdef HAVE_LIBZ
+    if (try_compression_once &&
+        fm->length >= 10 /* minimal gzip size */ &&
+        fm->content[0] == '\x1f' && /* gzip signature */
+        fm->content[1] == '\x8b' &&
+        fm->content[2] == '\x08')
+      {
+        DEBUGP (("Automatic decompress check: ok, munmap\n"));
+        munmap (fm->content, fm->length);
+        fm->content = NULL;
+        goto force_compressed;
+      }
+#endif
     if (!inhibit_close)
       close (fd);
 
@@ -1269,6 +1287,34 @@ wget_read_file (const char *file)
       else
         /* EOF */
         break;
+#ifdef HAVE_LIBZ
+      if (try_compression_once /* && zfile == NULL */ &&
+          fm->length - nread < 10 /* old length was less than minimum */ &&
+          fm->length >= 10 /* new length is longer than minimum */)
+        {
+          try_compression_once = false;
+          DEBUGP (("Automatic decompress check: "));
+          if (!(fm->content[0] == '\x1f' && /* gzip signature */
+                fm->content[1] == '\x8b' &&
+                fm->content[2] == '\x08'))
+            {
+              DEBUGP (("gzip signature mismatch\n"));
+              continue;
+            }
+          /* try to seek back */
+          if (lseek (fd, SEEK_CUR, -(fm->length)) == (off_t)-1)
+            {
+              DEBUGP (("lseek(%ld) failed: %s\n", -(long)fm->length,
+                       strerror(errno)));
+              continue;
+            }
+          /* drop content and retry with compression */
+          xfree(fm->content);
+          fm->content = NULL;
+          DEBUGP (("ok, freed\n"));
+          goto force_compressed;
+        }
+#endif
     }
   if (!inhibit_close)
     close (fd);
-- 


reply via email to

[Prev in Thread] Current Thread [Next in Thread]