bug-wget
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Bug-wget] download inline elements from remote hosts (page requisites) PATCH


From: Jonas Gasser
Subject: Re: [Bug-wget] download inline elements from remote hosts (page requisites) PATCH
Date: Wed, 18 Apr 2012 04:22:00 -0700 (PDT)

> From: Paul Wratt <address@hidden>
> To: Jonas Gasser <address@hidden>; wget <address@hidden>
> Cc: 
> Sent: Wednesday, January 4, 2012 12:53 PM
> Subject: Re: [Bug-wget] download inline elements from remote hosts (page 
> requisites)
> 
> yeah I am interested in it - what are the repercussions of not having
> an option for this...
> 
> 1) you cant turn it off
> 2) ..
> 
> host limiting should still work tho
> 
> also note that you can no longer use --convert-links if you require
> --no-clobber and a valid windows filename (cant update mirror)
> 
> Paul
> 
> On Fri, Dec 30, 2011 at 1:59 AM, Jonas Gasser <address@hidden> 
> wrote:
>>  Hello everybody
>> 
>>  As I use wget quite often for mirrors, it's very annoying to be limited 
> to the host for inline elements (without the span-host option).
>>  So I modified recur.c with 2-3 lines
>> 
>>    /* 7. */
>>    if (schemes_are_similar_p (u->scheme, parent->scheme)) {
>>      if (!opt.spanhost && 0 != strcasecmp (parent->host, 
> u->host))
>>        {
>>      if (!(opt.page_requisites && upos->link_inline_p))
>>      {
>>        DEBUGP (("This is not the same hostname as the parent's (%s 
> and %s).\n",
>>                   u->host, parent->host));
>>        goto out;
>>      }
>>        }
>>    }
>>    /* 8. */
>> 
>>  and I use it like:
>> 
>>  $ ./wget --output-file=test.log -k -E -K --no-check-certificate -p -e 
> robots=off -m URL
>> 
>>  First tests are done (simple examples - inline images and iframes included 
> from amazon) and seem to work.
>>  My question : Is there interest in this and would an additional option be 
> the better way?
>> 
>> 
>>  Greets from Switzerland, Jonas
>> 
>> 
>

Hey

I have attached a patch for the "download all inline elements" feature. I 
tested it with single pages and with whole websites.
I'm not happy with the long name "page-requisites-extended".
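Description:
- Added the option "page-requisites-extended" (init.c, main.c, options.h).
- Added cmd_spec_page_requisites_extended, which sets both page-requisites 
  and page-requisites-extended to true (init.c).
- Modified step /* 7 */ (the same-host check) in recur.c so that inline 
  elements are exempt from it when page-requisites-extended is set.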

Greets, Jonas

=== modified file 'src/init.c'
--- old/src/init.c    2012-03-08 09:00:51 +0000
+++ new/src/init.c    2012-04-18 09:51:47 +0000
@@ -91,6 +91,7 @@
 CMD_DECLARE (cmd_spec_warc_header);
 CMD_DECLARE (cmd_spec_htmlify);
 CMD_DECLARE (cmd_spec_mirror);
+CMD_DECLARE (cmd_spec_page_requisites_extended);
 CMD_DECLARE (cmd_spec_prefer_family);
 CMD_DECLARE (cmd_spec_progress);
 CMD_DECLARE (cmd_spec_recursive);
@@ -210,6 +211,7 @@
   { "numtries",         &opt.ntry,              cmd_number_inf },/* 
deprecated*/
   { "outputdocument",   &opt.output_document,   cmd_file },
   { "pagerequisites",   &opt.page_requisites,   cmd_boolean },
+  { "pagerequisitesextended",   &opt.page_requisites_extended,   
cmd_spec_page_requisites_extended },
   { "passiveftp",       &opt.ftp_pasv,          cmd_boolean },
   { "passwd",           &opt.ftp_passwd,        cmd_string },/* deprecated*/
   { "password",         &opt.passwd,            cmd_string },
@@ -1312,6 +1314,23 @@
   return true;
 }
 
+/* Set the "page-requisites-extended" mode.  Enabling it also enables
+   "page-requisites"; disabling it leaves "page-requisites" untouched.  */
+
+static bool
+cmd_spec_page_requisites_extended (const char *com, const char *val,
+                                   void *place_ignored)
+{
+  bool enabled;  /* bool, not int: cmd_boolean is used on bool fields.  */
+
+  if (!cmd_boolean (com, val, &enabled))
+    return false;
+  opt.page_requisites_extended = enabled;  /* Honor an explicit "off".  */
+  if (enabled)
+    opt.page_requisites = true;
+  return true;
+}
+
 /* Validate --prefer-family and set the choice.  Allowed values are
    "IPv4", "IPv6", and "none".  */
 

=== modified file 'src/main.c'
--- old/src/main.c    2012-03-05 21:23:06 +0000
+++ new/src/main.c    2012-04-18 08:54:36 +0000
@@ -239,6 +239,7 @@
     { "output-document", 'O', OPT_VALUE, "outputdocument", -1 },
     { "output-file", 'o', OPT_VALUE, "logfile", -1 },
     { "page-requisites", 'p', OPT_BOOLEAN, "pagerequisites", -1 },
+    { "page-requisites-extended", 0, OPT_BOOLEAN, "pagerequisitesextended", -1 
},
     { "parent", 0, OPT__PARENT, NULL, optional_argument },
     { "passive-ftp", 0, OPT_BOOLEAN, "passiveftp", -1 },
     { "password", 0, OPT_VALUE, "password", -1 },
@@ -713,6 +714,8 @@
     N_("\
   -p,  --page-requisites    get all images, etc. needed to display HTML 
page.\n"),
     N_("\
+       --page-requisites-extended    get all inline elements (images, videos 
etc.) needed to display HTML page.\n"),
+    N_("\
        --strict-comments    turn on strict (SGML) handling of HTML 
comments.\n"),
     "\n",
 

=== modified file 'src/options.h'
--- old/src/options.h    2012-03-05 21:23:06 +0000
+++ new/src/options.h    2012-04-18 08:55:32 +0000
@@ -180,6 +180,9 @@
 
   bool page_requisites;        /* Whether we need to download all files
                    necessary to display a page properly. */
+  bool page_requisites_extended; /* Whether inline links are exempt from
+                                   the same-host check in recur.c. */
+
   char *bind_address;        /* What local IP address to bind to. */
 
 #ifdef HAVE_SSL

=== modified file 'src/recur.c'
--- old/src/recur.c    2011-03-30 23:37:12 +0000
+++ new/src/recur.c    2012-04-18 09:11:33 +0000
@@ -616,9 +616,12 @@
   if (schemes_are_similar_p (u->scheme, parent->scheme))
     if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
       {
+        if (!(opt.page_requisites_extended && upos->link_inline_p))
+          {
         DEBUGP (("This is not the same hostname as the parent's (%s and 
%s).\n",
                  u->host, parent->host));
         goto out;
+          }
       }
 
   /* 8. */

Attachment: page-requisites-extended.patch
Description: Binary data


reply via email to

[Prev in Thread] Current Thread [Next in Thread]