>From e3e69bc551a0f3794f8da3fdb421c2868d6e8326 Mon Sep 17 00:00:00 2001 From: Matthew White Date: Thu, 28 Jul 2016 17:10:46 +0200 Subject: [PATCH 03/25] Bugfix: Fix NULL filename and output_stream in Metalink module * src/metalink.c (retrieve_from_metalink): Fix NULL filename, set filename to the right "path/file" value * src/metalink.c (retrieve_from_metalink): Fix NULL output_stream, set output_stream to filename when it is created by retrieve_url() * src/metalink.c (retrieve_from_metalink): Add RFC5854 comments about proper metalink:file "path/file" name format handling * doc/metalink.txt: Update document. Remove resolved bugs If unique_create() cannot create/open the destination file, filename and output_stream remain NULL. If fopen() is used instead, filename always remains NULL. Both functions cannot create "path/file" trees. Setting filename to the right value is sufficient to prevent SIGSEGV generating from testing a NULL value. This also allows retrieve_url() to create a "path/file" tree through opt.output_document. Reading NULL as output_stream, when it shall not be, leads to wrong results. For instance, a non-NULL output_stream tells when a stream was interrupted, reading NULL instead means to assume the contrary. This patch conforms to the RFC5854 specification: The Metalink Download Description Format 4.1.2.1. The "name" Attribute https://tools.ietf.org/html/rfc5854#section-4.1.2.1 --- doc/metalink.txt | 139 ++++++++++++++++++++++++++++++++++--------------------- src/metalink.c | 33 ++++++++++++- 2 files changed, 117 insertions(+), 55 deletions(-) diff --git a/doc/metalink.txt b/doc/metalink.txt index 31734a3..0f3706a 100644 --- a/doc/metalink.txt +++ b/doc/metalink.txt @@ -1,24 +1,26 @@ -GNU Wget Metalink module (--input-metalink) +GNU Wget Metalink module - Evaluation of "Directory Options" on the command line + Evaluation of the Metalink/XML and Metalink/HTTP implementations 1. Introduction *************** This document, and the results contained in it, is focused over the -testing of the metalink:file "path/file" name format. +evaluation of the Metalink/XML and Metalink/HTTP implementations. The "Directory Options" mentioned here are used on the command line in -conjunction with the option '--input-metalink=file': +conjunction with the option '--input-metalink=file' for Metalink/XML, +and '--metalink-over-http' for Metalink/HTTP. -$ wget --input-metalink=file +$ wget --input-metalink= [directory options] +$ wget --metalink-over-http [directory options] 2. Notes ******** -Tests containing a metalink:file "/path/file", "./path/file", or -"../path/file" name shall be run manually due to security concerns. +Tests for metalink:file names beginning with '/', '~/', './', or '../' +(e.g. "/path/file") shall be run manually due to security concerns. 3. Metalink files used as reference *********************************** @@ -47,17 +49,30 @@ EOF 4.1 Implemented safety features =============================== -Do not follow relative or absolute paths: "/path/file", "./path/file", -and "../path/file" as metalink:file name formats are all ignored (wget -refuses to start). The options --trust-server-names changes nothing. +Any metalink:file name containing an absolute, relative, or home path +(see '2. Notes') parsed from Metalink/XML files is rejected. -4.2 Actual behaviour -==================== +This is a libmetalink's design decision implemented in the function +metalink_check_safe_path(). This feature shall not be modified. -Given a metalink:file "path/file" name, if "path" exists, download -"path/file", then compute its checksum. If "path" doesn't exist, -download the url's file in the working directory; then the checksum -fails: cannot find "path/file". +All the above conform to the RFC5854 standard. + +References: + https://tools.ietf.org/html/rfc5854#section-4.1.2.1 + https://tools.ietf.org/html/rfc5854#section-4.2.8.3 + +4.2 File download behaviour +=========================== + +When a Metalink/XML file is parsed: +1. create the metalink:file "path/file" tree; +2. download the metalink:url file as "path/file"; +3. verify the "path/file" checksum. + +All the above conform to the RFC5854 standard. + +References: + https://tools.ietf.org/html/rfc5854 4.3 Questionable behaviours =========================== @@ -69,69 +84,85 @@ If more metalink:file elements are the same, wget downloads them all. The download is OK even when metalink:file size is wrong. -5. Directory Options +5. `wget --metalink-over-http` +****************************** + +5.1 Implemented safety features +=============================== + +The function url_file_name() is responsible of parsing the url's file +name and mixing in the "Directory Options" wrote on the command line. + +The use of libmetalink's metalink_check_safe_path() shouldn't be +necessary (see '4.1 Implemented safety features'). + +All the above comform to the usual Wget's download behaviour. + +References: + wget(1) + +5.2 File download behaviour +=========================== + +When a Metalink/HTTP header is parsed: +1. extract metalink metadata from the header; +2. download the file from the mirror with the highest priority; +3. verify the file's checksum. + +All the above comform to the usual Wget's download behaviour and to +the RFC6249 standard. + +References: + wget(1) + https://tools.ietf.org/html/rfc6249 + +6. Directory Options ******************** '-nd' '--no-directories' - Used alone has no effect (see `wget --input-metalink=test.meta4`). + Do not apply to Metalink/XML files (aka --input-metalink=). - Used in conjunction with --recursive, given "path/file", if "path" - exists, download "path/file" and compute its checksum. If "path" - doesnt' exist, download the url's file in the working directory, - then the checksum fails: cannot find "path/file". + Apply to Metalink/HTTP urls as described in the Wget's manual, see + wget(1). The target url is the url wrote on the command line. '-x' '--force-directories' - Given "path/file", if "path" exists, download "path/file", then - compute its checksum. If "path" doesn't exist, create the url - hierarchy, then the checksum fails: cannot find "path/file". + Do not apply to Metalink/XML files (aka --input-metalink=). + + Apply to Metalink/HTTP urls as described in the Wget's manual, see + wget(1). The target url is the url wrote on the command line. '-nH' '--no-host-directories' - Given "path/file", if "path" exists, download "path/file", then - compute its checksum. If "path" doesn't exist, download the url's - file in the working directory, then the checksum fails: cannot - find "path/file"; in this context, if --force-directories is - present, create the url hierarchy omitting the host component. + Do not apply to Metalink/XML files (aka --input-metalink=). + + Apply to Metalink/HTTP urls as described in the Wget's manual, see + wget(1). The target url is the url wrote on the command line. '--protocol-directories' - Used alone has no effect (see `wget --input-metalink=test.meta4`). + Do not apply to Metalink/XML files (aka --input-metalink=). - In conjunction with --force-directories, use the protocol name as - the first directory component (see --force-directories). + Apply to Metalink/HTTP urls as described in the Wget's manual, see + wget(1). The target url is the url wrote on the command line. '--cut-dirs=number' - Used alone has no effect (see `wget --input-metalink=test.meta4`). + Do not apply to Metalink/XML files (aka --input-metalink=). - In conjunction with --force-directories, ignore 'number' directory - components after the domain (see --force-directories). + Apply to Metalink/HTTP urls as described in the Wget's manual, see + wget(1). The target url is the url wrote on the command line. '-P prefix' '--directory-prefix=prefix' - This is buggy or non-intuitive. - - Given "path/file", and more metalink:url uris for the same file, - if '-P path' is specified, the first url's file is downloaded as - "path/", and the second url's file as "path/file". The - first file fails the checksum: cannot find "path/file". The file - "path/file" passes the checksum verification. - - Given "path/file", and more metalink:url uris for the same file, - if '-P newp' is specified, all the urls' files are downloaded as - "newp/. A suffix counter is added to the file names to - not overwrite existing files. Then all the checksums fail: cannot - find "path/file". + Do not apply to Metalink/XML files (aka --input-metalink=). - Given "path/file", and more metalink:url uris for the same file, - if '-P ../path' is specified, the same things as if '-P ../newp' - or '-P newp' will happen, e.g. "newp/ and checksums - failures. + Apply to Metalink/HTTP downloads. - [write here more wrong things happening] + The directory prefix is the directory where all other files and + subdirectories will be saved to, see wget(1). diff --git a/src/metalink.c b/src/metalink.c index fd6d0e2..4a0a56e 100644 --- a/src/metalink.c +++ b/src/metalink.c @@ -170,7 +170,26 @@ retrieve_from_metalink (const metalink_t* metalink) output_stream_regular = true; - /* Store the real file name for displaying in messages. */ + /* + At this point, if output_stream is NULL, the file + couldn't be created/opened. + + This happens when the metalink:file has a "path/file" + name format and its directory tree cannot be created: + * stdio.h (fopen) + * src/utils.c (unique_create) + + RFC5854 requires a proper "path/file" format handling, + this can be achieved setting opt.output_document while + output_stream is left to NULL: + * src/http.c (open_output_stream): If output_stream is + NULL, create the opt.output_document "path/file" + */ + if (!filename) + filename = xstrdup (mfile->name); + + /* Store the real file name for displaying in messages, + and for proper RFC5854 "path/file" handling. */ opt.output_document = filename; opt.metalink_over_http = false; @@ -178,6 +197,18 @@ retrieve_from_metalink (const metalink_t* metalink) retr_err = retrieve_url (url, mres->url, NULL, NULL, NULL, NULL, opt.recursive, iri, false); opt.metalink_over_http = _metalink_http; + + /* + Bug: output_stream is NULL, but retrieve_url() somehow + created filename. + + Bugfix: point output_stream to filename if it exists. + */ + if (!output_stream && (output_stream = fopen (filename, "rb"))) + { + fclose (output_stream); + output_stream = fopen (filename, "ab"); + } } url_free (url); iri_free (iri); -- 2.7.3