diff -turN wget-1.18/src/convert.c wget-1.18.0.1thb/src/convert.c
--- wget-1.18/src/convert.c 2016-03-03 08:38:45.000000000 +0000
+++ wget-1.18.0.1thb/src/convert.c 2016-12-14 20:59:15.913959715 +0000
@@ -32,6 +32,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -47,6 +48,7 @@
 #include "css-url.h"
 #include "iri.h"
 #include "xstrndup.h"
+#include "exits.h"
 
 static struct hash_table *dl_file_url_map;
 struct hash_table *dl_url_file_map;
@@ -168,6 +170,8 @@
     }
 }
 
+static inline void add_specified_links_to_hashtables (void);
+
 /* This function is called when the retrieval is done to convert the
    links that have been downloaded.  It has to be called at the end of
    the retrieval, because only then does Wget know conclusively which
@@ -191,6 +195,7 @@
 
   struct ptimer *timer = ptimer_new ();
 
+  add_specified_links_to_hashtables ();
   convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
   convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
 
@@ -993,6 +998,158 @@
   string_set_add (downloaded_css_set, file);
 }
 
+/* Build and return the path "STEM.FILEEXT", where STEM is the value of the
+   --convert-specified-links option (opt.convert_specified_links).  This
+   function merely aids the implementation of
+   add_specified_links_to_hashtables(), defined later; it serves no other
+   purpose.  It is optimized for security auditability rather than for speed.
+   It allocates the string it returns; so, when the caller is done with the
+   string, the caller should xfree() the string.  */
+
+static char *
+add_specified_links_to_hashtables_impl_path (const char *const fileext)
+{
+  /* It is slightly wasteful to compute strings' lengths each time this
+     function is called, but the function is not meant to be called from the
+     program's main loop, so the waste ought to be negligible.  More important
+     is that the function be easy for a human security auditor to read and to
+     reason about.  By constructing a file path, the function exposes a
+     potential security attack surface, so its logic needs to be correct.
+
+     The "+ 2" below makes room for the '.' separator and for the
+     terminating NUL.  */
+  const size_t buflen_path =
+    strlen (opt.convert_specified_links) + strlen (fileext) + 2;
+  char *const path = xmalloc (buflen_path); /* returns only if successful */
+  const char *const bufend_path = path + buflen_path;
+  char *q = path;
+  q = stpncpy (q, opt.convert_specified_links, bufend_path-q);
+  /* assert(!()) would be an alternative to if(){} */
+  if (q < bufend_path)
+    *q++ = '.';
+  else /* impossible */
+    exit (WGET_EXIT_UNKNOWN);
+  q = stpncpy (q, fileext, bufend_path-q);
+  /* q must now rest on the buffer's last byte, which must be the NUL.  */
+  if (q != bufend_path-1 || *q) /* impossible */
+    exit (WGET_EXIT_UNKNOWN);
+  return path; /* The caller is to free() the returned string. */
+}
+
+/* Report on stderr that the file at PATH cannot be parsed, then end the
+   program with WGET_EXIT_IO_FAIL.  This function does not return.  */
+
+static void
+add_specified_links_to_hashtables_impl_exit_because_cannot_parse
+  (const char *const path)
+{
+  fprintf (stderr, _("Cannot parse %s.\n"), path);
+  exit (WGET_EXIT_IO_FAIL);
+}
+
+/* Normally, the wget(1) command automatically determines which links to
+   convert, and that only if the -k/--convert-links option is issued.  However,
+   by issuing the --convert-specified-links option, the user can manually
+   specify links to convert.  This function registers the user's manually
+   specified links for later conversion.
+
+   Two files are read.  Their names are the option's value followed by
+   ".urls" and ".links" respectively.  Each nonblank line of the first holds
+   two whitespace-delimited words (presumably a URL and its local file
+   name): the first word keys dl_url_file_map, the second keys
+   dl_file_url_map.  Each nonblank line of the second holds a type word,
+   HTML or CSS in any letter case, followed by a name to add to
+   downloaded_html_set or downloaded_css_set.  Words beyond the second on
+   a line are ignored.  A line with exactly one word, an unrecognized type
+   word, or a failure to open or close either file ends the program with
+   WGET_EXIT_IO_FAIL.  */
+
+static inline void
+add_specified_links_to_hashtables (void)
+{
+  if (opt.convert_specified_links) {
+    char *path_urls =
+      add_specified_links_to_hashtables_impl_path
+      (CONVERT_SPECIFIED_LINKS_FILEEXT_URLS);
+    char *path_links =
+      add_specified_links_to_hashtables_impl_path
+      (CONVERT_SPECIFIED_LINKS_FILEEXT_LINKS);
+    FILE *const file_urls  = fopen (path_urls , "r");
+    FILE *const file_links = fopen (path_links, "r");
+    if (!(file_urls && file_links))
+      {
+        fprintf (stderr,
+          _("Cannot open %s and/or %s.\n"), path_urls, path_links);
+        exit (WGET_EXIT_IO_FAIL);
+      }
+    /* NOTE(review): the calls below presume that dl_file_url_map,
+       dl_url_file_map, downloaded_html_set and downloaded_css_set have
+       already been created; confirm that none can still be NULL here.  */
+    {
+      /* A single getline() buffer serves every line of both files and is
+         released once, after the last read.  */
+      char *line = NULL;
+      size_t len_line = 0;
+      while (getline (&line, &len_line, file_urls) >= 0)
+        {
+          char *words[2];
+          const size_t no_of_words = split_string_into_words (line, words, 2);
+          if (no_of_words == 0)
+            continue;
+          else if (no_of_words != 2)
+            add_specified_links_to_hashtables_impl_exit_because_cannot_parse
+              (path_urls);
+          /* dl_url_file_map stores the two words themselves, so they are
+             not freed here; dl_file_url_map receives copies.  */
+          hash_table_put (dl_file_url_map,
+            xstrdup (words[1]), xstrdup (words[0]));
+          hash_table_put (dl_url_file_map, words[0], words[1]);
+        }
+      while (getline (&line, &len_line, file_links) >= 0)
+        {
+          /* This block should probably not be merged with the earlier block.
+             The two blocks coincidentally appear more similar than they ought.
+             The two differ conceptually.  */
+          char *words[2];
+          const size_t no_of_words = split_string_into_words (line, words, 2);
+          if (no_of_words == 0)
+            continue;
+          else if (no_of_words != 2)
+            add_specified_links_to_hashtables_impl_exit_because_cannot_parse
+              (path_links);
+          {
+            /* Upper-case the type word so that it matches in any letter
+               case.  toupper() is defined only for EOF and unsigned char
+               values, hence the cast.  */
+            char *p;
+            for (p = words[0]; *p; ++p)
+              *p = (char) toupper ((unsigned char) *p);
+          }
+          /* Both tests examine the type word, words[0].  */
+          if (!strcmp (words[0], CONVERT_SPECIFIED_LINKS_HTML))
+            string_set_add (downloaded_html_set, words[1]);
+          else if (!strcmp (words[0], CONVERT_SPECIFIED_LINKS_CSS))
+            string_set_add (downloaded_css_set, words[1]);
+          else
+            add_specified_links_to_hashtables_impl_exit_because_cannot_parse
+              (path_links);
+          /* string_set_add() stores its own copy of the name (see its
+             definition in utils.c), so neither word is needed any longer.  */
+          xfree (words[0]);
+          xfree (words[1]);
+        }
+      xfree (line);
+    }
+    if (fclose (file_links) || fclose (file_urls))
+      {
+        fprintf (stderr,
+          _("Cannot close %s and/or %s.\n"), path_urls, path_links);
+        exit (WGET_EXIT_IO_FAIL);
+      }
+    xfree (path_links);
+    xfree (path_urls);
+  }
+}
+
 static void downloaded_files_free (void);
 
 /* Cleanup the data structures associated with this file.  */
diff -turN wget-1.18/src/convert.h wget-1.18.0.1thb/src/convert.h
--- wget-1.18/src/convert.h 2016-03-03 08:38:45.000000000 +0000
+++ wget-1.18.0.1thb/src/convert.h 2016-12-14 16:05:16.937628793 +0000
@@ -36,6 +36,11 @@
 extern struct hash_table *downloaded_html_set;
 extern struct hash_table *downloaded_css_set;
 
+#define CONVERT_SPECIFIED_LINKS_FILEEXT_URLS "urls"
+#define CONVERT_SPECIFIED_LINKS_FILEEXT_LINKS "links"
+#define CONVERT_SPECIFIED_LINKS_HTML "HTML"
+#define CONVERT_SPECIFIED_LINKS_CSS "CSS"
+
 enum convert_options {
   CO_NOCONVERT = 0,             /* don't convert this URL */
   CO_CONVERT_TO_RELATIVE,       /* convert to relative, e.g. to
diff -turN wget-1.18/src/exits.c wget-1.18.0.1thb/src/exits.c
--- wget-1.18/src/exits.c 2016-01-19 09:26:56.000000000 +0000
+++ wget-1.18.0.1thb/src/exits.c 2016-12-14 16:05:16.937628793 +0000
@@ -18,6 +18,8 @@
    along with Wget.  If not, see .
    */
 
+#include
+
 #include "wget.h"
 #include "exits.h"
 
@@ -91,3 +93,17 @@
       ?
1
       : final_exit_status;
 }
+
+/* End the program when P is NULL.  Pass in a pointer obtained from malloc()
+   [or from xmalloc() or the like]; control comes back to the caller only if
+   that allocation has succeeded.  */
+
+void die_if_memory_allocation_has_failed (const void *const p)
+{
+  if (p != NULL)
+    return;
+  /* The allocation failed: say so and leave without returning.  */
+  fprintf (stderr, _("Memory allocation problem\n"));
+  exit (WGET_EXIT_PARSE_ERROR);
+}
+
diff -turN wget-1.18/src/exits.h wget-1.18.0.1thb/src/exits.h
--- wget-1.18/src/exits.h 2016-01-19 09:26:56.000000000 +0000
+++ wget-1.18.0.1thb/src/exits.h 2016-12-14 16:05:16.937628793 +0000
@@ -44,5 +44,7 @@
 
 int get_exit_status (void);
 
+void die_if_memory_allocation_has_failed (const void *);
+
 
 #endif /* WGET_EXITS_H */
diff -turN wget-1.18/src/hash.c wget-1.18.0.1thb/src/hash.c
--- wget-1.18/src/hash.c 2016-01-19 09:26:56.000000000 +0000
+++ wget-1.18.0.1thb/src/hash.c 2016-12-14 20:21:54.133390238 +0000
@@ -82,6 +82,7 @@
      hash_table_iter_next -- return next element during iteration.
      hash_table_clear     -- clear hash table contents.
      hash_table_count     -- return the number of entries in the table.
+     hash_table_print     -- print all string keys and values.
 
    The hash table grows internally as new entries are added and is not
    limited in size, except by available memory.  The table doubles
@@ -587,6 +588,28 @@
   return ht->count;
 }
 
+/* Write every entry of HT to F, one entry per line, in the form
+   KEY == "..."; VALUE == "...".  Keys and values are printed with %s, so
+   the table is expected to hold NUL-terminated strings.  Nothing is
+   written when HT is NULL or has no entries.  */
+
+void
+hash_table_print (FILE *const f, struct hash_table *const ht)
+{
+  hash_table_iterator iter;
+
+  if (!ht || !hash_table_count (ht))
+    return;
+
+  hash_table_iterate (ht, &iter);
+  while (hash_table_iter_next (&iter))
+    {
+      const char *const key = (const char *) iter.key;
+      const char *const value = (const char *) iter.value;
+      fprintf (f, "KEY == \"%s\"; VALUE == \"%s\"\n", key, value);
+    }
+}
+
 /* Functions from this point onward are meant for convenience and
    don't strictly belong to this file.  However, this is as good a
    place for them as any.
*/ diff -turN wget-1.18/src/hash.h wget-1.18.0.1thb/src/hash.h --- wget-1.18/src/hash.h 2016-01-19 09:26:56.000000000 +0000 +++ wget-1.18.0.1thb/src/hash.h 2016-12-14 20:03:13.809407596 +0000 @@ -58,6 +58,8 @@ int hash_table_count (const struct hash_table *); +void hash_table_print (FILE *, struct hash_table *); + struct hash_table *make_string_hash_table (int); struct hash_table *make_nocase_string_hash_table (int); diff -turN wget-1.18/src/init.c wget-1.18.0.1thb/src/init.c --- wget-1.18/src/init.c 2016-06-03 11:55:13.000000000 +0000 +++ wget-1.18.0.1thb/src/init.c 2016-12-14 16:05:16.937628793 +0000 @@ -165,6 +165,7 @@ { "continue", &opt.always_rest, cmd_boolean }, { "convertfileonly", &opt.convert_file_only, cmd_boolean }, { "convertlinks", &opt.convert_links, cmd_boolean }, + { "convertspecifiedlinks", &opt.convert_specified_links, cmd_file }, { "cookies", &opt.cookies, cmd_boolean }, #ifdef HAVE_SSL { "crlfile", &opt.crl_file, cmd_file_once }, diff -turN wget-1.18/src/main.c wget-1.18.0.1thb/src/main.c --- wget-1.18/src/main.c 2016-06-06 15:36:04.000000000 +0000 +++ wget-1.18.0.1thb/src/main.c 2016-12-14 16:05:16.937628793 +0000 @@ -278,6 +278,7 @@ { "continue", 'c', OPT_BOOLEAN, "continue", -1 }, { "convert-file-only", 0, OPT_BOOLEAN, "convertfileonly", -1 }, { "convert-links", 'k', OPT_BOOLEAN, "convertlinks", -1 }, + { "convert-specified-links", 0, OPT_VALUE, "convertspecifiedlinks", -1 }, { "content-disposition", 0, OPT_BOOLEAN, "contentdisposition", -1 }, { "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 }, { "cookies", 0, OPT_BOOLEAN, "cookies", -1 }, @@ -905,6 +906,9 @@ -k, --convert-links make links in downloaded HTML or CSS point to\n\ local files\n"), N_("\ + --convert-specified-links=FILESTEM like -k, but let the user say\n\ + which links are to be converted\n"), + N_("\ --convert-file-only convert the file part of the URLs only (usually known as the basename)\n"), N_("\ --backups=N before writing file X, rotate up to N backup files\n"), 
@@ -1218,11 +1222,7 @@ for (argstring_length = 1, i = 1; i < argc; i++) argstring_length += strlen (argv[i]) + 3 + 1; program_argstring = p = malloc (argstring_length); - if (p == NULL) - { - fprintf (stderr, _("Memory allocation problem\n")); - exit (WGET_EXIT_PARSE_ERROR); - } + die_if_memory_allocation_has_failed (p); for (i = 1; i < argc; i++) { int arglen; @@ -1417,12 +1417,15 @@ /* All user options have now been processed, so it's now safe to do interoption dependency checks. */ - if (opt.noclobber && (opt.convert_links || opt.convert_file_only)) + if (opt.noclobber && (opt.convert_links || opt.convert_specified_links || opt.convert_file_only)) { fprintf (stderr, opt.convert_links ? _("Both --no-clobber and --convert-links were specified," " only --convert-links will be used.\n") : + opt.convert_specified_links ? + _("Both --no-clobber and --convert-specified-links were specified," + " only --convert-specified-links will be used.\n") : _("Both --no-clobber and --convert-file-only were specified," " only --convert-file-only will be used.\n")); opt.noclobber = false; @@ -1478,11 +1481,12 @@ #endif if (opt.output_document) { - if ((opt.convert_links || opt.convert_file_only) + if ((opt.convert_links || opt.convert_specified_links || opt.convert_file_only) && (nurl > 1 || opt.page_requisites || opt.recursive)) { fputs (_("\ -Cannot specify both -k or --convert-file-only and -O if multiple URLs are given, or in combination\n\ +Cannot specify both -k (or a related option like --convert-specified-links\n\ +or --convert-file-only) and -O if multiple URLs are given, or in combination\n\ with -p or -r. See the manual for details.\n\n"), stderr); print_usage (1); exit (WGET_EXIT_GENERIC_ERROR); @@ -1718,11 +1722,7 @@ /* Fill in the arguments. 
*/ url = alloca_array (char *, nurl + 1); - if (url == NULL) - { - fprintf (stderr, _("Memory allocation problem\n")); - exit (WGET_EXIT_PARSE_ERROR); - } + die_if_memory_allocation_has_failed (url); for (i = 0; i < nurl; i++, optind++) { char *rewritten = rewrite_shorthand_url (argv[optind]); @@ -1787,16 +1787,16 @@ if (fstat (fileno (output_stream), &st) == 0 && S_ISREG (st.st_mode)) output_stream_regular = true; } - if (!output_stream_regular && (opt.convert_links || opt.recursive)) + if (!output_stream_regular && (opt.convert_links || opt.convert_specified_links || opt.recursive)) { - fprintf (stderr, _("-k or -r can be used together with -O only if \ + fprintf (stderr, _("-k, --convert-specified-links or -r can be used together with -O only if \ outputting to a regular file.\n")); exit (WGET_EXIT_GENERIC_ERROR); } - if (!output_stream_regular && (opt.convert_links || opt.convert_file_only)) + if (!output_stream_regular && (opt.convert_links || opt.convert_specified_links || opt.convert_file_only)) { - fprintf (stderr, _("--convert-links or --convert-file-only can be used together \ -only if outputting to a regular file.\n")); + fprintf (stderr, _("--convert-links, --convert-specified-links or --convert-file-only can be used together \ +with -O only if outputting to a regular file.\n")); exit (WGET_EXIT_GENERIC_ERROR); } } @@ -2069,7 +2069,7 @@ save_hsts (); #endif - if ((opt.convert_links || opt.convert_file_only) && !opt.delete_after) + if ((opt.convert_links || opt.convert_specified_links || opt.convert_file_only) && !opt.delete_after) convert_all_links (); cleanup (); diff -turN wget-1.18/src/options.h wget-1.18.0.1thb/src/options.h --- wget-1.18/src/options.h 2016-06-03 11:55:13.000000000 +0000 +++ wget-1.18.0.1thb/src/options.h 2016-12-14 16:05:16.941628793 +0000 @@ -194,6 +194,9 @@ NULL. */ bool convert_links; /* Will the links be converted locally? 
*/
+  char *convert_specified_links; /* The single stem of the names of the two
+                                    files that, together, say what is to
+                                    be converted.  */
   bool convert_file_only;       /* Convert only the file portion of the URI (i.e. basename).
                                    Leave everything else untouched. */
 
diff -turN wget-1.18/src/utils.c wget-1.18.0.1thb/src/utils.c
--- wget-1.18/src/utils.c 2016-06-09 16:10:14.000000000 +0000
+++ wget-1.18.0.1thb/src/utils.c 2016-12-14 20:46:00.133367835 +0000
@@ -32,9 +32,11 @@
 #include "wget.h"
 
 #include "sha256.h"
+#include "xstrndup.h"
 #include
 #include
 #include
+#include
 #include
 #include
 #ifdef HAVE_PROCESS_H
@@ -2531,6 +2533,37 @@
   str_buffer[2 * i] = '\0';
 }
 
+/* The caller passes a string s containing whitespace-delimited words.  The
+   function copies the individual words for the caller's use.  The caller is
+   left to xfree() the copies later.  The function returns the number of
+   words copied.  If s contains more than max_no_of_words words, then the
+   extra words are not copied but are silently ignored.  For example, if s
+   were "  foo bar  baz" but max_no_of_words == 2, then words[0] would
+   be "foo" and words[1] would be "bar"; the "baz" would be ignored; and the
+   function would return not 3 but 2.  The string s itself is not modified.
+   [See also sepstring() in this file, which works differently but serves
+   a related purpose.]  */
+
+size_t
+split_string_into_words (char *s,
+                         char **const words,
+                         const size_t max_no_of_words)
+{
+  size_t no_of_words = 0;
+  while (no_of_words < max_no_of_words && *s)
+    {
+      const char *word;
+      /* The <ctype.h> classifiers are defined only for EOF and for values
+         representable as unsigned char, so each char is cast first.  */
+      while (*s && isspace ((unsigned char) *s)) ++s;
+      if (!*s) break;
+      word = s++;
+      while (*s && !isspace ((unsigned char) *s)) ++s;
+      words[no_of_words++] = xstrndup (word, s-word);
+    }
+  return no_of_words;
+}
+
 #ifdef HAVE_SSL
 
 /*
diff -turN wget-1.18/src/utils.h wget-1.18.0.1thb/src/utils.h
--- wget-1.18/src/utils.h 2016-06-03 11:55:13.000000000 +0000
+++ wget-1.18.0.1thb/src/utils.h 2016-12-14 16:05:16.941628793 +0000
@@ -163,6 +163,8 @@
 
 void wg_hex_to_string (char *str_buffer, const char *hex_buffer, size_t hex_len);
 
+size_t split_string_into_words(char *s, char **words, size_t max_no_of_words);
+
 extern unsigned char char_prop[];
 
 #ifdef HAVE_SSL