From: Jookia
Subject: [Bug-wget] [PATCH] rejected-log: Add option to dump URL rejections to a log.
Date: Tue, 21 Jul 2015 16:33:43 -0000

This allows you to figure out why URLs are being rejected, along with some
context around each rejection.
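
A quick usage sketch (the URL here is only an example):

  wget --recursive --rejected-log=rejected.csv http://example.com/

Every URL the crawl decides not to follow is then recorded in rejected.csv
together with the reason it was rejected.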
---
 doc/wget.texi |   5 ++
 src/init.c    |   2 +
 src/main.c    |   3 ++
 src/options.h |   2 +
 src/recur.c   | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++----------
 5 files changed, 144 insertions(+), 26 deletions(-)
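
As a sketch of the log format, here is a hand-written record matching
write_reject_reason_csv below (not actual output; one record per line):

  ROBOTS,http://example.com/a.html,SCHEME_HTTP,example.com,80,a.html,,,,http://example.com/,SCHEME_HTTP,example.com,80,,,,

The first field is the rejection reason, followed by the fields of the
rejected URL and then those of the parent URL it was found in.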

diff --git a/doc/wget.texi b/doc/wget.texi
index 0b683d8..d2ff7dc 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -551,6 +551,11 @@ would be resolved to @samp{http://foo/baz/b.html}.
 @cindex specify config 
 @item --config=@var{FILE}
 Specify the location of a startup file you wish to use.
+
+@item --rejected-log=@var{logfile}
+Logs all URL rejections to @var{logfile} as comma separated values.  The values
+include the reason for rejection, the URL, and the parent URL it was found in.
+
 @end table
 
 @node Download Options, Directory Options, Logging and Input File Options, Invoking
diff --git a/src/init.c b/src/init.c
index da17614..ea074cc 100644
--- a/src/init.c
+++ b/src/init.c
@@ -274,6 +274,7 @@ static const struct {
   { "referer",          &opt.referer,           cmd_string },
   { "regextype",        &opt.regex_type,        cmd_spec_regex_type },
   { "reject",           &opt.rejects,           cmd_vector },
+  { "rejectedlog",      &opt.rejected_log,      cmd_file },
   { "rejectregex",      &opt.rejectregex_s,     cmd_string },
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
@@ -1856,6 +1857,7 @@ cleanup (void)
   xfree (opt.post_data);
   xfree (opt.body_data);
   xfree (opt.body_file);
+  xfree (opt.rejected_log);
 
 #endif /* DEBUG_MALLOC */
 }
diff --git a/src/main.c b/src/main.c
index 6490711..880057b 100644
--- a/src/main.c
+++ b/src/main.c
@@ -317,6 +317,7 @@ static struct cmdline_option option_data[] =
     { "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
     { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
     { "local-encoding", 0, OPT_VALUE, "localencoding", -1 },
+    { "rejected-log", 0, OPT_VALUE, "rejectedlog", -1 },
     { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
 #ifdef HAVE_METALINK
     { "metalink-over-http", 0, OPT_BOOLEAN, "metalink-over-http", -1 },
@@ -572,6 +573,8 @@ Logging and input file:\n"),
        --config=FILE               specify config file to use\n"),
     N_("\
        --no-config                 do not read any config file\n"),
+    N_("\
+       --rejected-log=FILE         log reasons for URL rejection to FILE\n"),
     "\n",
 
     N_("\
diff --git a/src/options.h b/src/options.h
index f8244ea..24ddbb5 100644
--- a/src/options.h
+++ b/src/options.h
@@ -296,6 +296,8 @@ struct options
                                    name. */
   bool report_bps;              /*Output bandwidth in bits format*/
 
+  char *rejected_log;           /* The file to log rejected URLs to. */
+
 #ifdef HAVE_HSTS
   bool hsts;
   char *hsts_file;
diff --git a/src/recur.c b/src/recur.c
index a000d54..5d16561 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -182,11 +182,18 @@ static int blacklist_contains (struct hash_table *blacklist, const char *url)
   return ret;
 }
 
-static bool download_child_p (const struct urlpos *, struct url *, int,
-                              struct url *, struct hash_table *, struct iri *);
-static bool descend_redirect_p (const char *, struct url *, int,
-                                struct url *, struct hash_table *, struct iri *);
+typedef enum
+{
+  SUCCESS, BLACKLIST, NOTHTTPS, NONHTTP, ABSOLUTE, DOMAIN, PARENT, LIST, REGEX,
+  RULES, SPANNEDHOST, ROBOTS
+} reject_reason;
 
+static reject_reason download_child_p (const struct urlpos *, struct url *, int,
+                              struct url *, struct hash_table *, struct iri *);
+static reject_reason descend_redirect_p (const char *, struct url *, int,
+                              struct url *, struct hash_table *, struct iri *);
+static void write_reject_reason_csv (FILE *, reject_reason, struct url *,
+                              struct url *);
 
 /* Retrieve a part of the web beginning with START_URL.  This used to
    be called "recursive retrieval", because the old function was
@@ -244,6 +251,14 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
                false);
   blacklist_add (blacklist, start_url_parsed->url);
 
+  FILE *rejectedlog = NULL;
+  if (opt.rejected_log)
+    {
+      rejectedlog = fopen (opt.rejected_log, "w");
+      if (!rejectedlog)
+        logprintf (LOG_NOTQUIET, "%s: %s\n", opt.rejected_log, strerror (errno));
+    }
+
   while (1)
     {
       bool descend = false;
@@ -337,13 +352,19 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
                      want to follow it.  */
                   if (descend)
                     {
-                      if (!descend_redirect_p (redirected, url_parsed, depth,
-                                               start_url_parsed, blacklist, i))
-                        descend = false;
+                      reject_reason r = descend_redirect_p (redirected, url_parsed,
+                                        depth, start_url_parsed, blacklist, i);
+                      if (r == SUCCESS)
+                        {
+                          /* Make sure that the old pre-redirect form gets
+                             blacklisted. */
+                          blacklist_add (blacklist, url);
+                        }
                       else
-                        /* Make sure that the old pre-redirect form gets
-                           blacklisted. */
-                        blacklist_add (blacklist, url);
+                        {
+                          descend = false;
+                          write_reject_reason_csv (rejectedlog, r, url_parsed, start_url_parsed);
+                        }
                     }
 
                   xfree (url);
@@ -425,8 +446,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
                     continue;
                   if (dash_p_leaf_HTML && !child->link_inline_p)
                     continue;
-                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
-                                        blacklist, i))
+                  reject_reason r = download_child_p (child, url_parsed, depth,
+                                    start_url_parsed, blacklist, i);
+                  if (r == SUCCESS)
                     {
                       ci = iri_new ();
                       set_uri_encoding (ci, i->content_encoding, false);
@@ -439,6 +461,10 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
                          same URL twice.  */
                       blacklist_add (blacklist, child->url->url);
                     }
+                  else
+                    {
+                      write_reject_reason_csv (rejectedlog, r, child->url, url_parsed);
+                    }
                 }
 
               if (strip_auth)
@@ -478,6 +504,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
       iri_free (i);
     }
 
+  if (rejectedlog)
+    fclose (rejectedlog);
+
   /* If anything is left of the queue due to a premature exit, free it
      now.  */
   {
@@ -513,7 +542,7 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
    by storing these URLs to BLACKLIST.  This may or may not help.  It
    will help if those URLs are encountered many times.  */
 
-static bool
+static reject_reason
 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                   struct url *start_url_parsed, struct hash_table *blacklist,
                   struct iri *iri)
@@ -521,6 +550,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   struct url *u = upos->url;
   const char *url = u->url;
   bool u_scheme_like_http;
+  reject_reason reason = SUCCESS;
 
   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 
@@ -534,6 +564,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
           xfree (referrer);
         }
       DEBUGP (("Already on the black list.\n"));
+      reason = BLACKLIST;
       goto out;
     }
 
@@ -563,6 +594,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (opt.https_only && u->scheme != SCHEME_HTTPS)
     {
       DEBUGP (("Not following non-HTTPS links.\n"));
+      reason = NOTHTTPS;
       goto out;
     }
 #endif
@@ -574,6 +606,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
     {
       DEBUGP (("Not following non-HTTP schemes.\n"));
+      reason = NONHTTP;
       goto out;
     }
 
@@ -583,6 +616,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
     if (opt.relative_only && !upos->link_relative_p)
       {
         DEBUGP (("It doesn't really look like a relative link.\n"));
+        reason = ABSOLUTE;
         goto out;
       }
 
@@ -591,6 +625,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (!accept_domain (u))
     {
       DEBUGP (("The domain was not accepted.\n"));
+      reason = DOMAIN;
       goto out;
     }
 
@@ -610,6 +645,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
         {
           DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                    u->dir, start_url_parsed->dir));
+          reason = PARENT;
           goto out;
         }
     }
@@ -622,12 +658,14 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!accdir (u->dir))
         {
           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
+          reason = LIST;
           goto out;
         }
     }
   if (!accept_url (url))
     {
       DEBUGP (("%s is excluded/not-included through regex.\n", url));
+      reason = REGEX;
       goto out;
     }
 
@@ -652,6 +690,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
         {
           DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                    url, u->file));
+          reason = RULES;
           goto out;
         }
     }
@@ -662,6 +701,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
       {
         DEBUGP (("This is not the same hostname as the parent's (%s and 
%s).\n",
                  u->host, parent->host));
+        reason = SPANNEDHOST;
         goto out;
       }
 
@@ -704,20 +744,21 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
         {
           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
           blacklist_add (blacklist, url);
+          reason = ROBOTS;
           goto out;
         }
     }
 
-  /* The URL has passed all the tests.  It can be placed in the
-     download queue. */
-  DEBUGP (("Decided to load it.\n"));
+ out:
 
-  return true;
-
- out:
-  DEBUGP (("Decided NOT to load it.\n"));
+  if (reason == SUCCESS)
+    /* The URL has passed all the tests.  It can be placed in the
+       download queue. */
+    DEBUGP (("Decided to load it.\n"));
+  else
+    DEBUGP (("Decided NOT to load it.\n"));
 
-  return false;
+  return reason;
 }
 
 /* This function determines whether we will consider downloading the
@@ -725,14 +766,14 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
    possibly to another host, etc.  It is needed very rarely, and thus
    it is merely a simple-minded wrapper around download_child_p.  */
 
-static bool
+static reject_reason
 descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
                     struct url *start_url_parsed, struct hash_table *blacklist,
                     struct iri *iri)
 {
   struct url *new_parsed;
   struct urlpos *upos;
-  bool success;
+  reject_reason reason;
 
   assert (orig_parsed != NULL);
 
@@ -742,10 +783,10 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
   upos = xnew0 (struct urlpos);
   upos->url = new_parsed;
 
-  success = download_child_p (upos, orig_parsed, depth,
+  reason = download_child_p (upos, orig_parsed, depth,
                               start_url_parsed, blacklist, iri);
 
-  if (success)
+  if (reason == SUCCESS)
     blacklist_add (blacklist, upos->url->url);
   else
     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
@@ -753,7 +794,72 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
   url_free (new_parsed);
   xfree (upos);
 
-  return success;
+  return reason;
+}
+
+
+/* Write a URL as unquoted comma separated fields to F. */
+static void
+write_url_csv (FILE *f, struct url *url)
+{
+  const char *scheme_str = NULL;
+  switch (url->scheme)
+    {
+      case SCHEME_HTTP:    scheme_str = "SCHEME_HTTP";    break;
+#ifdef HAVE_SSL
+      case SCHEME_HTTPS:   scheme_str = "SCHEME_HTTPS";   break;
+#endif
+      case SCHEME_FTP:     scheme_str = "SCHEME_FTP";     break;
+      case SCHEME_INVALID: scheme_str = "SCHEME_INVALID"; break;
+    }
+
+  /* The params, query and fragment parts are optional and may be NULL;
+     print them as empty fields. */
+  fprintf (f, "%s,%s,%s,%i,%s,%s,%s,%s",
+    url->url,
+    scheme_str,
+    url->host,
+    url->port,
+    url->path,
+    url->params ? url->params : "",
+    url->query ? url->query : "",
+    url->fragment ? url->fragment : "");
+}
+
+/* This function writes out information on why a URL was rejected and its
+   context from download_child_p, such as the URL being rejected and its
+   parent's URL.  The format it uses is comma separated values. */
+static void
+write_reject_reason_csv (FILE *f, reject_reason r, struct url *url,
+                         struct url *parent)
+{
+  const char *reason_str = NULL;
+
+  /* Do nothing unless a rejected-log was requested and could be opened. */
+  if (!f)
+    return;
+
+  switch (r)
+    {
+      case SUCCESS:     reason_str = "SUCCESS";     break;
+      case BLACKLIST:   reason_str = "BLACKLIST";   break;
+      case NOTHTTPS:    reason_str = "NOTHTTPS";    break;
+      case NONHTTP:     reason_str = "NONHTTP";     break;
+      case ABSOLUTE:    reason_str = "ABSOLUTE";    break;
+      case DOMAIN:      reason_str = "DOMAIN";      break;
+      case PARENT:      reason_str = "PARENT";      break;
+      case LIST:        reason_str = "LIST";        break;
+      case REGEX:       reason_str = "REGEX";       break;
+      case RULES:       reason_str = "RULES";       break;
+      case SPANNEDHOST: reason_str = "SPANNEDHOST"; break;
+      case ROBOTS:      reason_str = "ROBOTS";      break;
+    }
+
+  fprintf (f, "%s,", reason_str);
+  write_url_csv (f, url);
+  fprintf (f, ",");
+  write_url_csv (f, parent);
+  fprintf (f, "\n");
 }
 
 /* vim:set sts=2 sw=2 cino+={s: */
-- 
2.4.6



