
[Bug-wget] Patch for bug #20382


From: gerel
Subject: [Bug-wget] Patch for bug #20382
Date: Sun, 01 Feb 2009 12:06:07 -0800 (PST)

Hi everyone,

Here I attach a patch series (three changesets) for bug #20382, "Wget parses
the same URI multiple times throughout the code".
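
For reviewers skimming the series: the idea is to parse each URL once at the
call site, hand the resulting struct url * down to retrieve_url /
retrieve_tree, and free it in the same scope. Condensed from the hunks below,
the recurring call-site pattern is roughly the following sketch (not a
verbatim excerpt; url, file, redirected, referer, dt and status stand for
whatever the call site has in scope):

  int url_err;
  struct url *url_parsed = url_parse (url, &url_err);

  if (!url_parsed)
    {
      /* Report the parse error once, here, instead of deep inside
         retrieve_url as before.  */
      char *error = url_error (url, url_err);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
      xfree (error);
      status = URLERROR;
    }
  else
    {
      /* retrieve_url now takes the already-parsed URL plus the original
         string (still needed for logging and redirect handling).  */
      status = retrieve_url (url_parsed, url, &file, &redirected,
                             referer, &dt, false);
      url_free (url_parsed);
    }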

Hope all is fine.

cheers
-gerel

##
Exporting patches:
# HG changeset patch
# User address@hidden
# Date 1233504392 10800
# Node ID 565d1abd73d20c891eadec22d87bb2c791f24e5b
# Parent  1b4062e241879e6718b08d88b2832fd358757e4f
one less call

diff -r 1b4062e241879e6718b08d88b2832fd358757e4f -r 565d1abd73d20c891eadec22d87bb2c791f24e5b src/recur.c
--- a/src/recur.c       Mon Dec 01 22:59:03 2008 -0800
+++ b/src/recur.c       Sun Feb 01 13:06:32 2009 -0300
@@ -154,7 +154,7 @@ url_dequeue (struct url_queue *queue,
 
 static bool download_child_p (const struct urlpos *, struct url *, int,
                               struct url *, struct hash_table *);
-static bool descend_redirect_p (const char *, const char *, int,
+static bool descend_redirect_p (const char *, struct url *, int,
                                 struct url *, struct hash_table *);
 
 
@@ -264,10 +264,21 @@ retrieve_tree (const char *start_url)
         }
       else
         {
-          int dt = 0;
+          int dt = 0, url_err;
           char *redirected = NULL;
-
-          status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+          struct url *url_parsed = url_parse (url, &url_err);
+
+          if (!url_parsed)
+            {
+              char *error = url_error (url, url_err);
+              logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+              xfree (error);
+              status = URLERROR;
+            }
+          else
+            {
+              status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+            }
 
           if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
@@ -294,7 +305,7 @@ retrieve_tree (const char *start_url)
                  want to follow it.  */
               if (descend)
                 {
-                  if (!descend_redirect_p (redirected, url, depth,
+                  if (!descend_redirect_p (redirected, url_parsed, depth,
                                            start_url_parsed, blacklist))
                     descend = false;
                   else
@@ -306,6 +317,7 @@ retrieve_tree (const char *start_url)
               xfree (url);
               url = redirected;
             }
+          url_free(url_parsed);
         }
 
       if (opt.spider)
@@ -656,14 +668,13 @@ download_child_p (const struct urlpos *u
    it is merely a simple-minded wrapper around download_child_p.  */
 
 static bool
-descend_redirect_p (const char *redirected, const char *original, int depth,
+descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
                     struct url *start_url_parsed, struct hash_table *blacklist)
 {
-  struct url *orig_parsed, *new_parsed;
+  struct url *new_parsed;
   struct urlpos *upos;
   bool success;
 
-  orig_parsed = url_parse (original, NULL);
   assert (orig_parsed != NULL);
 
   new_parsed = url_parse (redirected, NULL);
@@ -675,7 +686,6 @@ descend_redirect_p (const char *redirect
   success = download_child_p (upos, orig_parsed, depth,
                               start_url_parsed, blacklist);
 
-  url_free (orig_parsed);
   url_free (new_parsed);
   xfree (upos);
 
# HG changeset patch
# User "gerel <address@hidden>"
# Date 1233511431 10800
# Node ID 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4
# Parent  565d1abd73d20c891eadec22d87bb2c791f24e5b
removed some more calls

diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/main.c
--- a/src/main.c        Sun Feb 01 13:06:32 2009 -0300
+++ b/src/main.c        Sun Feb 01 15:03:51 2009 -0300
@@ -1178,34 +1178,45 @@ WARNING: Can't reopen standard output in
   for (t = url; *t; t++)
     {
       char *filename = NULL, *redirected_URL = NULL;
-      int dt;
-
-      if ((opt.recursive || opt.page_requisites)
-          && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
-        {
-          int old_follow_ftp = opt.follow_ftp;
-
-          /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
-          if (url_scheme (*t) == SCHEME_FTP) 
-            opt.follow_ftp = 1;
+      int dt, url_err;
+      struct url *url_parsed = url_parse (*t, &url_err);
+
+      if (!url_parsed)
+        {
+          char *error = url_error (*t, url_err);
+          logprintf (LOG_NOTQUIET, "%s: %s.\n",*t, error);
+          xfree (error);
+          status = URLERROR;
+        }
+      else
+        {
+          if ((opt.recursive || opt.page_requisites)
+              && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (url_parsed)))
+            {
+              int old_follow_ftp = opt.follow_ftp;
+
+              /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
+              if (url_scheme (*t) == SCHEME_FTP) 
+                opt.follow_ftp = 1;
           
-          status = retrieve_tree (*t);
-
-          opt.follow_ftp = old_follow_ftp;
-        }
-      else
-        status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
-
-      if (opt.delete_after && file_exists_p(filename))
-        {
-          DEBUGP (("Removing file due to --delete-after in main():\n"));
-          logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
-          if (unlink (filename))
-            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
-        }
-
-      xfree_null (redirected_URL);
-      xfree_null (filename);
+              status = retrieve_tree (url_parsed);
+
+              opt.follow_ftp = old_follow_ftp;
+            }
+          else
+            status = retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+
+          if (opt.delete_after && file_exists_p(filename))
+            {
+              DEBUGP (("Removing file due to --delete-after in main():\n"));
+              logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
+              if (unlink (filename))
+                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+            }
+          xfree_null (redirected_URL);
+          xfree_null (filename);
+          url_free (url_parsed);
+        }
     }
 
   /* And then from the input file, if any.  */
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/recur.c
--- a/src/recur.c       Sun Feb 01 13:06:32 2009 -0300
+++ b/src/recur.c       Sun Feb 01 15:03:51 2009 -0300
@@ -180,7 +180,7 @@ static bool descend_redirect_p (const ch
           options, add it to the queue. */
 
 uerr_t
-retrieve_tree (const char *start_url)
+retrieve_tree (struct url *start_url_parsed)
 {
   uerr_t status = RETROK;
 
@@ -190,17 +190,6 @@ retrieve_tree (const char *start_url)
   /* The URLs we do not wish to enqueue, because they are already in
      the queue, but haven't been downloaded yet.  */
   struct hash_table *blacklist;
-
-  int up_error_code;
-  struct url *start_url_parsed = url_parse (start_url, &up_error_code);
-
-  if (!start_url_parsed)
-    {
-      char *error = url_error (start_url, up_error_code);
-      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, error);
-      xfree (error);
-      return URLERROR;
-    }
 
   queue = url_queue_new ();
   blacklist = make_string_hash_table (0);
@@ -277,7 +266,8 @@ retrieve_tree (const char *start_url)
             }
           else
             {
-              status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+              status = retrieve_url (url_parsed, url, &file, &redirected,
+                                     referer, &dt, false);
             }
 
           if (html_allowed && file && status == RETROK
@@ -451,8 +441,6 @@ retrieve_tree (const char *start_url)
   }
   url_queue_delete (queue);
 
-  if (start_url_parsed)
-    url_free (start_url_parsed);
   string_set_free (blacklist);
 
   if (opt.quota && total_downloaded_bytes > opt.quota)
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/recur.h
--- a/src/recur.h       Sun Feb 01 13:06:32 2009 -0300
+++ b/src/recur.h       Sun Feb 01 15:03:51 2009 -0300
@@ -31,6 +31,8 @@ as that of the covered work.  */
 #ifndef RECUR_H
 #define RECUR_H
 
+#include "url.h"
+
 /* For most options, 0 means no limits, but with -p in the picture,
    that causes a problem on the maximum recursion depth variable.  To
    retain backwards compatibility we allow users to consider "0" to be
@@ -42,6 +44,6 @@ struct urlpos;
 struct urlpos;
 
 void recursive_cleanup (void);
-uerr_t retrieve_tree (const char *);
+uerr_t retrieve_tree (struct url *);
 
 #endif /* RECUR_H */
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/res.c
--- a/src/res.c Sun Feb 01 13:06:32 2009 -0300
+++ b/src/res.c Sun Feb 01 15:03:51 2009 -0300
@@ -537,13 +537,29 @@ res_retrieve_file (const char *url, char
   uerr_t err;
   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
   int saved_ts_val = opt.timestamping;
-  int saved_sp_val = opt.spider;
+  int saved_sp_val = opt.spider, url_err;
+  struct url * url_parsed;
 
   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
   *file = NULL;
   opt.timestamping = false;
   opt.spider       = false;
-  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+
+  url_parsed = url_parse (robots_url, &url_err);
+  if (!url_parsed)
+    {
+      char *error = url_error (robots_url, url_err);
+      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
+      xfree (error);
+      err = URLERROR;
+    }
+  else
+    {
+      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
+                          false);
+      url_free(url_parsed);
+    }
+
   opt.timestamping = saved_ts_val;
   opt.spider       = saved_sp_val;  
   xfree (robots_url);
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/retr.c
--- a/src/retr.c        Sun Feb 01 13:06:32 2009 -0300
+++ b/src/retr.c        Sun Feb 01 15:03:51 2009 -0300
@@ -596,15 +596,15 @@ static char *getproxy (struct url *);
    multiple points. */
 
 uerr_t
-retrieve_url (const char *origurl, char **file, char **newloc,
-              const char *refurl, int *dt, bool recursive)
+retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
+              char **newloc, const char *refurl, int *dt, bool recursive)
 {
   uerr_t result;
   char *url;
   bool location_changed;
   int dummy;
   char *mynewloc, *proxy;
-  struct url *u, *proxy_url;
+  struct url *u = orig_parsed, *proxy_url;
   int up_error_code;            /* url parse error code */
   char *local_file;
   int redirection_count = 0;
@@ -624,16 +624,6 @@ retrieve_url (const char *origurl, char 
     *newloc = NULL;
   if (file)
     *file = NULL;
-
-  u = url_parse (url, &up_error_code);
-  if (!u)
-    {
-      char *error = url_error (url, up_error_code);
-      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
-      xfree (url);
-      xfree (error);
-      return URLERROR;
-    }
 
   if (!refurl)
     refurl = opt.referer;
@@ -733,7 +723,10 @@ retrieve_url (const char *origurl, char 
           char *error = url_error (mynewloc, up_error_code);
           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
                      error);
-          url_free (u);
+          if (orig_parsed != u)
+            {
+              url_free (u);
+            }
           xfree (url);
           xfree (mynewloc);
           xfree (error);
@@ -753,7 +746,10 @@ retrieve_url (const char *origurl, char 
           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
                      opt.max_redirect);
           url_free (newloc_parsed);
-          url_free (u);
+          if (orig_parsed != u)
+            {
+              url_free (u);
+            }
           xfree (url);
           xfree (mynewloc);
           RESTORE_POST_DATA;
@@ -762,7 +758,10 @@ retrieve_url (const char *origurl, char 
 
       xfree (url);
       url = mynewloc;
-      url_free (u);
+      if (orig_parsed != u)
+        {
+          url_free (u);
+        }
       u = newloc_parsed;
 
       /* If we're being redirected from POST, we don't want to POST
@@ -795,7 +794,10 @@ retrieve_url (const char *origurl, char 
   else
     xfree_null (local_file);
 
-  url_free (u);
+  if (orig_parsed != u)
+    {
+      url_free (u);
+    }
 
   if (redirection_count)
     {
@@ -836,13 +838,22 @@ retrieve_from_file (const char *file, bo
   
   if (url_has_scheme (url))
     {
-      int dt;
+      int dt,url_err;
       uerr_t status;
+      struct url * url_parsed = url_parse(url, &url_err);
+
+      if (!url_parsed)
+        {
+          char *error = url_error (url, url_err);
+          logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+          xfree (error);
+          return URLERROR;
+        }
 
       if (!opt.base_href)
         opt.base_href = xstrdup (url);
 
-      status = retrieve_url (url, &input_file, NULL, NULL, &dt, false);
+      status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, false);
       if (status != RETROK)
         return status;
 
@@ -877,12 +888,15 @@ retrieve_from_file (const char *file, bo
           if (cur_url->url->scheme == SCHEME_FTP) 
             opt.follow_ftp = 1;
           
-          status = retrieve_tree (cur_url->url->url);
+          status = retrieve_tree (cur_url->url);
 
           opt.follow_ftp = old_follow_ftp;
         }
       else
-        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+        {
+          status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
+                                 &new_file, NULL, &dt, opt.recursive);
+        }
 
       if (filename && opt.delete_after && file_exists_p (filename))
         {
@@ -1050,14 +1064,12 @@ getproxy (struct url *u)
 /* Returns true if URL would be downloaded through a proxy. */
 
 bool
-url_uses_proxy (const char *url)
+url_uses_proxy (struct url * u)
 {
   bool ret;
-  struct url *u = url_parse (url, NULL);
   if (!u)
     return false;
   ret = getproxy (u) != NULL;
-  url_free (u);
   return ret;
 }
 
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/retr.h
--- a/src/retr.h        Sun Feb 01 13:06:32 2009 -0300
+++ b/src/retr.h        Sun Feb 01 15:03:51 2009 -0300
@@ -31,6 +31,8 @@ as that of the covered work.  */
 #ifndef RETR_H
 #define RETR_H
 
+#include "url.h"
+
 /* These global vars should be made static to retr.c and exported via
    functions! */
 extern SUM_SIZE_INT total_downloaded_bytes;
@@ -51,7 +53,7 @@ char *fd_read_hunk (int, hunk_terminator
 char *fd_read_hunk (int, hunk_terminator_t, long, long);
 char *fd_read_line (int);
 
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
+uerr_t retrieve_url (struct url *, const char *, char **, char **, const char *, int *, bool);
 uerr_t retrieve_from_file (const char *, bool, int *);
 
 const char *retr_rate (wgint, double);
@@ -62,6 +64,6 @@ void sleep_between_retrievals (int);
 
 void rotate_backups (const char *);
 
-bool url_uses_proxy (const char *);
+bool url_uses_proxy (struct url *);
 
 #endif /* RETR_H */
# HG changeset patch
# User "gerel <address@hidden>"
# Date 1233513848 10800
# Node ID 292aea0c3956e94d74d4dfabd464fad6af375425
# Parent  2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4
added changelog entry

diff -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 -r 292aea0c3956e94d74d4dfabd464fad6af375425 ChangeLog
--- a/ChangeLog Sun Feb 01 15:03:51 2009 -0300
+++ b/ChangeLog Sun Feb 01 15:44:08 2009 -0300
@@ -1,3 +1,17 @@ 2008-11-10  Micah Cowan  <address@hidden
+2009-02-01  Gerardo E. Gidoni  <address@hidden>
+
+       * src/main.c: restructured code to avoid multiple 'url_parse' calls.
+
+       * src/recur.c: same.
+
+       * src/recur.h: same.
+
+       * src/res.c: same.
+
+       * src/retr.c: same.
+
+       * src/retr.h: same.
+
 2008-11-10  Micah Cowan  <address@hidden>
 
        * MAILING-LIST: Mention Gmane, introduce subsections.

###
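
A note on ownership, summarized in the sketch below: with this series
retrieve_url no longer owns the parsed URL it receives. It still frees the
intermediate struct url objects it parses itself while following redirects,
but the caller-supplied orig_parsed is left for the caller to free. That is
what the repeated guards added in the retr.c hunks enforce:

  /* Inside retrieve_url: u starts out as the caller's orig_parsed and is
     re-pointed at each redirect's freshly parsed URL.  Free only URLs
     that retrieve_url parsed itself.  */
  if (orig_parsed != u)
    url_free (u);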