>From 6a0960d49adbcc113949720df6ac78d61e2c4c1d Mon Sep 17 00:00:00 2001 From: Tim Ruehsen Date: Thu, 22 Aug 2013 12:28:11 +0200 Subject: [PATCH] added option --https-only --- doc/ChangeLog | 4 +++ doc/wget.texi | 3 ++ src/ChangeLog | 5 +++ src/init.c | 3 ++ src/main.c | 3 ++ src/options.h | 2 +- src/recur.c | 23 +++++++++----- tests/ChangeLog | 6 ++++ tests/Makefile.am | 1 + tests/Test--httpsonly-r.px | 79 ++++++++++++++++++++++++++++++++++++++++++++++ tests/run-px | 1 + 11 files changed, 121 insertions(+), 9 deletions(-) create mode 100755 tests/Test--httpsonly-r.px diff --git a/doc/ChangeLog b/doc/ChangeLog index bc0fb79..d283055 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,7 @@ +2013-08-22 Tim Ruehsen + + * wget.texi: added description for --https-only + 2013-08-13 Hrvoje Niksic * wget.texi (Download Options): Fix misspelling. diff --git a/doc/wget.texi b/doc/wget.texi index ba4612d..cced7ed 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -1606,6 +1606,9 @@ buggy SSL server implementations that make it hard for OpenSSL to choose the correct protocol version. Fortunately, such servers are quite rare. +@item --https-only +When in recursive mode, only HTTPS links are followed. 
+ @cindex SSL certificate, check @item --no-check-certificate Don't check the server certificate against the available certificate diff --git a/src/ChangeLog b/src/ChangeLog index edfb80f..9fb6b97 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,8 @@ +2013-08-22 Tim Ruehsen + + * added new option --https-only (main.c, options.h) + * recur.c (download_child_p): checking for HTTPS + 2013-08-09 Tim Ruehsen * gnutls.c (ssl_init): Prevent CA files from being loaded twice diff --git a/src/init.c b/src/init.c index 1c4432b..033da4f 100644 --- a/src/init.c +++ b/src/init.c @@ -194,6 +194,9 @@ static const struct { { "httppasswd", &opt.http_passwd, cmd_string }, /* deprecated */ { "httppassword", &opt.http_passwd, cmd_string }, { "httpproxy", &opt.http_proxy, cmd_string }, +#ifdef HAVE_SSL + { "httpsonly", &opt.https_only, cmd_boolean }, +#endif { "httpsproxy", &opt.https_proxy, cmd_string }, { "httpuser", &opt.http_user, cmd_string }, { "ignorecase", &opt.ignore_case, cmd_boolean }, diff --git a/src/main.c b/src/main.c index 6b71a20..8414f5e 100644 --- a/src/main.c +++ b/src/main.c @@ -217,6 +217,7 @@ static struct cmdline_option option_data[] = { "http-passwd", 0, OPT_VALUE, "httppassword", -1 }, /* deprecated */ { "http-password", 0, OPT_VALUE, "httppassword", -1 }, { "http-user", 0, OPT_VALUE, "httpuser", -1 }, + { IF_SSL ("https-only"), 0, OPT_BOOLEAN, "httpsonly", -1 }, { "ignore-case", 0, OPT_BOOLEAN, "ignorecase", -1 }, { "ignore-length", 0, OPT_BOOLEAN, "ignorelength", -1 }, { "ignore-tags", 0, OPT_VALUE, "ignoretags", -1 }, @@ -636,6 +637,8 @@ HTTPS (SSL/TLS) options:\n"), --secure-protocol=PR choose secure protocol, one of auto, SSLv2,\n\ SSLv3, and TLSv1.\n"), N_("\ + --https-only only follow secure HTTPS links\n"), + N_("\ --no-check-certificate don't validate the server's certificate.\n"), N_("\ --certificate=FILE client certificate file.\n"), diff --git a/src/options.h b/src/options.h index 0a10c9b..4460c6c 100644 --- a/src/options.h +++ 
b/src/options.h @@ -215,9 +215,9 @@ struct options char *ca_directory; /* CA directory (hash files) */ char *ca_cert; /* CA certificate file to use */ - char *random_file; /* file with random data to seed the PRNG */ char *egd_file; /* file name of the egd daemon socket */ + bool https_only; /* whether to follow HTTPS only */ #endif /* HAVE_SSL */ bool cookies; /* whether cookies are used. */ diff --git a/src/recur.c b/src/recur.c index b6ba1d9..edf34d4 100644 --- a/src/recur.c +++ b/src/recur.c @@ -505,15 +505,16 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, } /* Several things to check for: - 1. if scheme is not http, and we don't load it - 2. check for relative links (if relative_only is set) - 3. check for domain - 4. check for no-parent - 5. check for excludes && includes - 6. check for suffix - 7. check for same host (if spanhost is unset), with possible + 1. if scheme is not https and https_only requested + 2. if scheme is not http, and we don't load it + 3. check for relative links (if relative_only is set) + 4. check for domain + 5. check for no-parent + 6. check for excludes && includes + 7. check for suffix + 8. check for same host (if spanhost is unset), with possible gethostbyname baggage - 8. check for robots.txt + 9. check for robots.txt Addendum: If the URL is FTP, and it is to be loaded, only the domain and suffix settings are "stronger". @@ -525,6 +526,15 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, More time- and memory- consuming tests should be put later on the list. */ +#ifdef HAVE_SSL + /* opt.https_only is declared only when HAVE_SSL is defined (options.h). */ + if (opt.https_only && u->scheme != SCHEME_HTTPS) + { + DEBUGP (("Not following non-HTTPS links.\n")); + goto out; + } +#endif + /* Determine whether URL under consideration has a HTTP-like scheme. 
*/ u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP); diff --git a/tests/ChangeLog b/tests/ChangeLog index 8cd4864..9a58797 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,9 @@ +2013-08-22 Tim Ruehsen + + * Makefile.am (EXTRA_DIST): Add Test--httpsonly-r.px. + * run-px (tests): Likewise. + * Test--httpsonly-r.px: New file. + 2013-03-12 Darshit Shah * Makefile.am (EXTRA_DIST): Add Test--post-file.px. diff --git a/tests/Makefile.am b/tests/Makefile.am index ac6a663..a494787 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -132,6 +132,7 @@ EXTRA_DIST = FTPServer.pm FTPTest.pm HTTPServer.pm HTTPTest.pm \ Test--spider-r--no-content-disposition.px \ Test--spider-r--no-content-disposition-trivial.px \ Test--spider-r.px \ + Test--httpsonly-r.px \ run-px certs check_PROGRAMS = unit-tests diff --git a/tests/Test--httpsonly-r.px b/tests/Test--httpsonly-r.px new file mode 100755 index 0000000..019df1a --- /dev/null +++ b/tests/Test--httpsonly-r.px @@ -0,0 +1,79 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use HTTPTest; + + +############################################################################### + +my $mainpage = < + + Main Page + + +

+ Some text and a link to a second page. +

+ + +EOF + +my $secondpage = < + + Second Page + + +

+ Anything. +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Dontcare", + headers => { + "Content-type" => "text/html", + }, + content => $mainpage, + }, + '/secondpage.html' => { + code => "200", + msg => "Dontcare", + headers => { + "Content-type" => "text/html", + }, + content => $secondpage, + } +); + +my $cmdline = $WgetTest::WGETPATH . " --https-only -r -nH http://localhost:{{port}}/"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'index.html' => { + content => $mainpage, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test--httpsonly-r", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +print $expected_error_code."\n"; + +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/run-px b/tests/run-px index 3c35d6f..14f5e7c 100755 --- a/tests/run-px +++ b/tests/run-px @@ -81,6 +81,7 @@ my @tests = ( 'Test--spider-r--no-content-disposition.px', 'Test--spider-r--no-content-disposition-trivial.px', 'Test--spider-r.px', + 'Test--httpsonly-r.px', ); foreach my $var (qw(SYSTEM_WGETRC WGETRC)) { -- 1.8.4.rc3