[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Bug-wget] [PATCH 1-3/3] support HTTP compression (was: HTTP gzip compression)
From: |
Yuriy M. Kaminskiy |
Subject: |
[Bug-wget] [PATCH 1-3/3] support HTTP compression (was: HTTP gzip compression) |
Date: |
Tue, 16 Dec 2014 18:20:59 +0300 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.6.0 |
Ángel González wrote:
> No problem for suggesting things. I agree it would be cool. Doesn't even
> look too hard, just detect the Content-Encoding: gzip and filter through gzip
> -d when saving.
> But someone needs to code it :)
Something like attached.
>From ccb95548926a0ab8ad4ed3787470d4803efad4ca Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <address@hidden>
Date: Tue, 16 Dec 2014 17:19:22 +0300
Subject: [PATCH 1/3] Recognize Content-Encoding header
---
src/http.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
src/wget.h | 8 +++++++-
2 files changed, 56 insertions(+), 1 deletions(-)
diff --git a/src/http.c b/src/http.c
index 1a6cd39..c769e95 100644
--- a/src/http.c
+++ b/src/http.c
@@ -2682,6 +2682,39 @@ read_header:
contlen = last_byte_pos - first_byte_pos + 1;
}
}
+
+ if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval)))
+ {
+ const char *p = hdrval;
+ if (p[0] == 'x' && p[1] == '-')
+ p += 2;
+ switch (p[0])
+ {
+ case 'g': case 'G':
+ if (0 == c_strcasecmp(p, "gzip"))
+ *dt |= CE_GZIP;
+ break;
+ case 'c': case 'C':
+ if (0 == c_strcasecmp(p, "compress"))
+ *dt |= CE_COMPRESS;
+ break;
+ case 'd': case 'D':
+ if (0 == c_strcasecmp(p, "deflate"))
+ *dt |= CE_DEFLATE;
+ break;
+ case 'b': case 'B':
+ if (0 == c_strcasecmp(p, "bzip2"))
+ *dt |= CE_BZIP2;
+ break;
+ default:
+ break;
+ }
+ if (!(*dt & CE_ANY))
+ {
+ DEBUGP (("Unrecognized Content-Encoding: %s\n", p));
+ }
+ }
+
resp_free (resp);
/* 20x responses are counted among successful by default. */
@@ -2810,6 +2843,18 @@ read_header:
if (opt.adjust_extension)
{
+ const char *ce_ext = ((*dt & CE_GZIP) ? ".gz" :
+ (*dt & CE_COMPRESS) ? ".Z" :
+ (*dt & CE_BZIP2) ? ".bz2" :
+ (*dt & CE_DEFLATE) ? ".zlib" : NULL);
+ char *last_period;
+ if (ce_ext != NULL &&
+ (last_period = strrchr(hs->local_file, '.')) != NULL &&
+ strcasecmp(last_period, ce_ext) == 0)
+ /* strip Content-Encoding extension (it will be re-added later) */
+ {
+ *last_period = '\0';
+ }
if (*dt & TEXTHTML)
/* -E / --adjust-extension / adjust_extension = on was specified,
and this is a text/html file. If some case-insensitive
@@ -2822,6 +2867,10 @@ read_header:
{
ensure_extension (hs, ".css", dt);
}
+ if (ce_ext != NULL)
+ {
+ ensure_extension (hs, ce_ext, dt);
+ }
}
if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE
diff --git a/src/wget.h b/src/wget.h
index 6edbfb8..be5fe51 100644
--- a/src/wget.h
+++ b/src/wget.h
@@ -331,7 +331,13 @@ enum
SEND_NOCACHE = 0x0008, /* send Pragma: no-cache directive */
ACCEPTRANGES = 0x0010, /* Accept-ranges header was found */
ADDED_HTML_EXTENSION = 0x0020, /* added ".html" extension due to -E */
- TEXTCSS = 0x0040 /* document is of type text/css */
+ TEXTCSS = 0x0040, /* document is of type text/css */
+ CE_COMPRESS = 0x0100, /* Content-Encoding: compress */
+ CE_DEFLATE = 0x0200, /* Content-Encoding: deflate */
+ CE_GZIP = 0x0400, /* Content-Encoding: gzip */
+ CE_BZIP2 = 0x0800, /* Content-Encoding: bzip2 */
+#define CE_ANY (CE_COMPRESS|CE_DEFLATE|CE_GZIP|CE_BZIP2)
+ /* Any known Content-Encoding */
};
/* Universal error type -- used almost everywhere. Error reporting of
--
>From b41339ba8d0e28f9033e83883d33889177ebbdc5 Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <address@hidden>
Date: Tue, 16 Dec 2014 17:29:34 +0300
Subject: [PATCH 2/3] Added option to support HTTP compression
Send Accept-Encoding header, decompress gzip-compressed document
(recognize by extension, should work well with --adjust-extension option)
---
src/http.c | 6 +++++-
src/init.c | 1 +
src/main.c | 3 +++
src/options.h | 1 +
src/utils.c | 43 +++++++++++++++++++++++++++++++++++++++++++
5 files changed, 53 insertions(+), 1 deletions(-)
diff --git a/src/http.c b/src/http.c
index c769e95..5ce6c93 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1798,6 +1798,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
rel_value);
SET_USER_AGENT (req);
request_set_header (req, "Accept", "*/*", rel_none);
+ if (opt.compressed)
+ request_set_header (req, "Accept-Encoding", "gzip, deflate", rel_none);
+ else
request_set_header (req, "Accept-Encoding", "identity", rel_none);
/* Find the username and password for authentication. */
@@ -2683,7 +2686,8 @@ read_header:
}
}
- if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval)))
+ if (opt.compressed &&
+ resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval)))
{
const char *p = hdrval;
if (p[0] == 'x' && p[1] == '-')
diff --git a/src/init.c b/src/init.c
index 088a6e9..ccf742a 100644
--- a/src/init.c
+++ b/src/init.c
@@ -154,6 +154,7 @@ static const struct {
{ "checkcertificate", &opt.check_cert, cmd_boolean },
#endif
{ "chooseconfig", &opt.choose_config, cmd_file },
+ { "compressed", &opt.compressed, cmd_boolean },
{ "connecttimeout", &opt.connect_timeout, cmd_time },
{ "contentdisposition", &opt.content_disposition, cmd_boolean },
{ "contentonerror", &opt.content_on_error, cmd_boolean },
diff --git a/src/main.c b/src/main.c
index 99c2819..6f0d7b2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -186,6 +186,7 @@ static struct cmdline_option option_data[] =
{ IF_SSL ("certificate-type"), 0, OPT_VALUE, "certificatetype", -1 },
{ IF_SSL ("check-certificate"), 0, OPT_BOOLEAN, "checkcertificate", -1 },
{ "clobber", 0, OPT__CLOBBER, NULL, optional_argument },
+ { "compressed", 'C', OPT_BOOLEAN, "compressed", -1 },
{ "config", 0, OPT_VALUE, "chooseconfig", -1 },
{ "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
{ "continue", 'c', OPT_BOOLEAN, "continue", -1 },
@@ -582,6 +583,8 @@ Directories:\n"),
N_("\
-nH, --no-host-directories don't create host directories.\n"),
N_("\
+ --compressed support HTTP compression (Content-Encoding).\n"),
+ N_("\
--protocol-directories use protocol name in directories.\n"),
N_("\
-P, --directory-prefix=PREFIX save files to PREFIX/...\n"),
diff --git a/src/options.h b/src/options.h
index b995126..d1a7faf 100644
--- a/src/options.h
+++ b/src/options.h
@@ -288,6 +288,7 @@ struct options
bool show_all_dns_entries; /* Show all the DNS entries when resolving a
name. */
bool report_bps; /*Output bandwidth in bits format*/
+ bool compressed; /* Use Accept-Encoding/Content-Encoding */
};
extern struct options opt;
diff --git a/src/utils.c b/src/utils.c
index 42a7c4c..46c23a7 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -63,6 +63,10 @@ as that of the covered work. */
#include <sys/stat.h>
+#ifdef HAVE_LIBZ
+# include <zlib.h>
+#endif
+
/* For TIOCGWINSZ and friends: */
#include <sys/ioctl.h>
#include <termios.h>
@@ -1154,6 +1158,9 @@ wget_read_file (const char *file)
struct file_memory *fm;
long size;
bool inhibit_close = false;
+#ifdef HAVE_LIBZ
+ gzFile *zfile = NULL;
+#endif
/* Some magic in the finest tradition of Perl and its kin: if FILE
is "-", just use stdin. */
@@ -1170,6 +1177,29 @@ wget_read_file (const char *file)
return NULL;
fm = xnew (struct file_memory);
+#ifdef HAVE_LIBZ
+ do {
+ const char *ext = strrchr(file, '.');
+ int zfd = -1;
+
+ if (!opt.compressed) break;
+ if (ext == NULL) break;
+ if (0 != strcasecmp(ext + 1, "gz") &&
+ 0 != strcasecmp(ext + 1, "zlib"))
+ break;
+ if ((zfd = dup(fd)) == -1)
+ break;
+ if ((zfile = gzdopen(zfd, "r")) == NULL)
+ {
+ close(zfd);
+ break;
+ }
+#ifdef HAVE_MMAP
+ goto mmap_lose;
+#endif
+ } while(0);
+#endif
+
#ifdef HAVE_MMAP
{
struct_fstat buf;
@@ -1224,6 +1254,11 @@ wget_read_file (const char *file)
size <<= 1;
fm->content = xrealloc (fm->content, size);
}
+#ifdef HAVE_LIBZ
+ if (zfile != NULL)
+ nread = gzread (zfile, fm->content + fm->length, size - fm->length);
+ else
+#endif
nread = read (fd, fm->content + fm->length, size - fm->length);
if (nread > 0)
/* Successful read. */
@@ -1237,6 +1272,10 @@ wget_read_file (const char *file)
}
if (!inhibit_close)
close (fd);
+#ifdef HAVE_LIBZ
+ if (zfile != NULL)
+ gzclose(zfile);
+#endif
if (size > fm->length && fm->length != 0)
/* Due to exponential growth of fm->content, the allocated region
might be much larger than what is actually needed. */
@@ -1245,6 +1284,10 @@ wget_read_file (const char *file)
return fm;
lose:
+#ifdef HAVE_LIBZ
+ if (zfile != NULL)
+ gzclose(zfile);
+#endif
if (!inhibit_close)
close (fd);
xfree (fm->content);
--
>From d48d72d891773858aa70806787466224c2e8a31f Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <address@hidden>
Date: Tue, 16 Dec 2014 17:35:27 +0300
Subject: [PATCH 3/3] Attempt to recognize compressed files by signature
Should improve --compressed without --adjust-extension
---
src/utils.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 47 insertions(+), 1 deletions(-)
diff --git a/src/utils.c b/src/utils.c
index 46c23a7..04f56b6 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -1160,6 +1160,7 @@ wget_read_file (const char *file)
bool inhibit_close = false;
#ifdef HAVE_LIBZ
gzFile *zfile = NULL;
+ bool try_compression_once = opt.compressed;
#endif
/* Some magic in the finest tradition of Perl and its kin: if FILE
@@ -1182,15 +1183,19 @@ wget_read_file (const char *file)
const char *ext = strrchr(file, '.');
int zfd = -1;
- if (!opt.compressed) break;
+ if (!try_compression_once) break;
if (ext == NULL) break;
if (0 != strcasecmp(ext + 1, "gz") &&
0 != strcasecmp(ext + 1, "zlib"))
break;
+ DEBUGP (("Decompress: extension matched: %s\n", ext));
+ force_compressed:
+ try_compression_once = false;
if ((zfd = dup(fd)) == -1)
break;
if ((zfile = gzdopen(zfd, "r")) == NULL)
{
+ DEBUGP (("gzdopen() failed\n"));
close(zfd);
break;
}
@@ -1214,6 +1219,19 @@ wget_read_file (const char *file)
MAP_PRIVATE, fd, 0);
if (fm->content == (char *)MAP_FAILED)
goto mmap_lose;
+#ifdef HAVE_LIBZ
+ if (try_compression_once &&
+ fm->length >= 10 /* minimal gzip size */ &&
+ fm->content[0] == '\x1f' && /* gzip signature */
+ fm->content[1] == '\x8b' &&
+ fm->content[2] == '\x08')
+ {
+ DEBUGP (("Automatic decompress check: ok, munmap\n"));
+ munmap (fm->content, fm->length);
+ fm->content = NULL;
+ goto force_compressed;
+ }
+#endif
if (!inhibit_close)
close (fd);
@@ -1269,6 +1287,34 @@ wget_read_file (const char *file)
else
/* EOF */
break;
+#ifdef HAVE_LIBZ
+ if (try_compression_once /* && zfile == NULL */ &&
+ fm->length - nread < 10 /* old length was less than minimum */ &&
+ fm->length >= 10 /* new length is longer than minimum */)
+ {
+ try_compression_once = false;
+ DEBUGP (("Automatic decompress check: "));
+ if (!(fm->content[0] == '\x1f' && /* gzip signature */
+ fm->content[1] == '\x8b' &&
+ fm->content[2] == '\x08'))
+ {
+ DEBUGP (("gzip signature mismatch\n"));
+ continue;
+ }
+ /* try to seek back */
+ if (lseek (fd, SEEK_CUR, -(fm->length)) == (off_t)-1)
+ {
+ DEBUGP (("lseek(%ld) failed: %s\n", -(long)fm->length,
+ strerror(errno)));
+ continue;
+ }
+ /* drop content and retry with compression */
+ xfree(fm->content);
+ fm->content = NULL;
+ DEBUGP (("ok, freed\n"));
+ goto force_compressed;
+ }
+#endif
}
if (!inhibit_close)
close (fd);
--