From: Jookia
Subject: [Bug-wget] [PATCH] rejected-log: Add option to dump URL rejections to a log.
Date: Tue, 21 Jul 2015 16:33:43 -0000
This allows you to figure out why URLs are being rejected, along with some
context around each rejection.
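
Each record in the log is the rejection reason followed by the components of
the rejected URL and then the components of the parent URL it was found in.
For example (hypothetical URLs):

ROBOTS,http://example.com/private/,SCHEME_HTTP,example.com,80,private/,,,,http://example.com/,SCHEME_HTTP,example.com,80,,,,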
---
doc/wget.texi | 5 ++
src/init.c | 2 +
src/main.c | 3 ++
src/options.h | 2 +
src/recur.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
5 files changed, 143 insertions(+), 26 deletions(-)
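
A quick usage sketch (hypothetical host and log name):

  wget -r --rejected-log=rejected.csv http://example.com/
  cut -d, -f1 rejected.csv | sort | uniq -c

The first field of each record is the rejection reason, so the pipeline above
tallies how often each reason occurred. Note that the values are written out
unescaped, so a URL that itself contains a comma (which is legal) will produce
an ambiguous record.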
diff --git a/doc/wget.texi b/doc/wget.texi
index 0b683d8..d2ff7dc 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -551,6 +551,11 @@ would be resolved to @samp{http://foo/baz/b.html}.
@cindex specify config
@item --config=@var{file}
Specify the location of a startup file you wish to use.
+
+@item --rejected-log=@var{logfile}
+Logs all URL rejections to @var{logfile} as comma separated values. The values
+include the reason of rejection, the URL and the parent URL it was found in.
+
@end table
@node Download Options, Directory Options, Logging and Input File Options, Invoking
diff --git a/src/init.c b/src/init.c
index da17614..ea074cc 100644
--- a/src/init.c
+++ b/src/init.c
@@ -274,6 +274,7 @@ static const struct {
{ "referer", &opt.referer, cmd_string },
{ "regextype", &opt.regex_type, cmd_spec_regex_type },
{ "reject", &opt.rejects, cmd_vector },
+ { "rejectedlog", &opt.rejected_log, cmd_file },
{ "rejectregex", &opt.rejectregex_s, cmd_string },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
@@ -1856,6 +1857,7 @@ cleanup (void)
xfree (opt.post_data);
xfree (opt.body_data);
xfree (opt.body_file);
+ xfree (opt.rejected_log);
#endif /* DEBUG_MALLOC */
}
diff --git a/src/main.c b/src/main.c
index 6490711..880057b 100644
--- a/src/main.c
+++ b/src/main.c
@@ -317,6 +317,7 @@ static struct cmdline_option option_data[] =
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
{ "local-encoding", 0, OPT_VALUE, "localencoding", -1 },
+ { "rejected-log", 0, OPT_VALUE, "rejectedlog", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
#ifdef HAVE_METALINK
{ "metalink-over-http", 0, OPT_BOOLEAN, "metalink-over-http", -1 },
@@ -572,6 +573,8 @@ Logging and input file:\n"),
--config=FILE specify config file to use\n"),
N_("\
--no-config do not read any config file\n"),
+ N_("\
+ --rejected-log=FILE log reasons for URL rejection to FILE\n"),
"\n",
N_("\
diff --git a/src/options.h b/src/options.h
index f8244ea..24ddbb5 100644
--- a/src/options.h
+++ b/src/options.h
@@ -296,6 +296,8 @@ struct options
name. */
bool report_bps; /*Output bandwidth in bits format*/
+ char *rejected_log; /* The file to log rejected URLS to. */
+
#ifdef HAVE_HSTS
bool hsts;
char *hsts_file;
diff --git a/src/recur.c b/src/recur.c
index a000d54..5d16561 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -182,11 +182,19 @@ static int blacklist_contains (struct hash_table *blacklist, const char *url)
return ret;
}
-static bool download_child_p (const struct urlpos *, struct url *, int,
- struct url *, struct hash_table *, struct iri *);
-static bool descend_redirect_p (const char *, struct url *, int,
- struct url *, struct hash_table *, struct iri *);
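+/* Reasons why download_child_p rejects a URL (SUCCESS means it was accepted). */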
+typedef enum
+{
+ SUCCESS, BLACKLIST, NOTHTTPS, NONHTTP, ABSOLUTE, DOMAIN, PARENT, LIST, REGEX,
+ RULES, SPANNEDHOST, ROBOTS
+} reject_reason;
+static reject_reason download_child_p (const struct urlpos *, struct url *, int,
+ struct url *, struct hash_table *, struct iri *);
+static reject_reason descend_redirect_p (const char *, struct url *, int,
+ struct url *, struct hash_table *, struct iri *);
+static void write_reject_reason_csv (FILE *, reject_reason, struct url *,
+ struct url *);
/* Retrieve a part of the web beginning with START_URL. This used to
be called "recursive retrieval", because the old function was
@@ -244,6 +251,14 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
false);
blacklist_add (blacklist, start_url_parsed->url);
+ FILE *rejectedlog = NULL;
+ if (opt.rejected_log)
+ {
+ rejectedlog = fopen (opt.rejected_log, "w");
+ if (!rejectedlog)
+ logprintf (LOG_NOTQUIET, "%s: %s\n", opt.rejected_log, strerror (errno));
+ }
+
while (1)
{
bool descend = false;
@@ -337,13 +352,19 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
want to follow it. */
if (descend)
{
- if (!descend_redirect_p (redirected, url_parsed, depth,
- start_url_parsed, blacklist, i))
- descend = false;
+ reject_reason r = descend_redirect_p (redirected, url_parsed,
+ depth, start_url_parsed, blacklist, i);
+ if (r == SUCCESS)
+ {
+ /* Make sure that the old pre-redirect form gets
+ blacklisted. */
+ blacklist_add (blacklist, url);
+ }
else
- /* Make sure that the old pre-redirect form gets
- blacklisted. */
- blacklist_add (blacklist, url);
+ {
+ descend = false;
+ write_reject_reason_csv (rejectedlog, r, url_parsed, start_url_parsed);
+ }
}
xfree (url);
@@ -425,8 +446,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
continue;
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
- if (download_child_p (child, url_parsed, depth, start_url_parsed,
- blacklist, i))
+ reject_reason r = download_child_p (child, url_parsed, depth,
+ start_url_parsed, blacklist, i);
+ if (r == SUCCESS)
{
ci = iri_new ();
set_uri_encoding (ci, i->content_encoding, false);
@@ -439,6 +461,10 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
same URL twice. */
blacklist_add (blacklist, child->url->url);
}
+ else
+ {
+ write_reject_reason_csv (rejectedlog, r, child->url, url_parsed);
+ }
}
if (strip_auth)
@@ -478,6 +504,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
iri_free (i);
}
+ if (rejectedlog)
+ fclose (rejectedlog);
+
/* If anything is left of the queue due to a premature exit, free it
now. */
{
@@ -513,7 +542,7 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
by storing these URLs to BLACKLIST. This may or may not help. It
will help if those URLs are encountered many times. */
-static bool
+static reject_reason
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
@@ -521,6 +550,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
struct url *u = upos->url;
const char *url = u->url;
bool u_scheme_like_http;
+ reject_reason reason = SUCCESS;
DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
@@ -534,6 +564,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
xfree (referrer);
}
DEBUGP (("Already on the black list.\n"));
+ reason = BLACKLIST;
goto out;
}
@@ -563,6 +594,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.https_only && u->scheme != SCHEME_HTTPS)
{
DEBUGP (("Not following non-HTTPS links.\n"));
+ reason = NOTHTTPS;
goto out;
}
#endif
@@ -574,6 +606,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
{
DEBUGP (("Not following non-HTTP schemes.\n"));
+ reason = NONHTTP;
goto out;
}
@@ -583,6 +616,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.relative_only && !upos->link_relative_p)
{
DEBUGP (("It doesn't really look like a relative link.\n"));
+ reason = ABSOLUTE;
goto out;
}
@@ -591,6 +625,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!accept_domain (u))
{
DEBUGP (("The domain was not accepted.\n"));
+ reason = DOMAIN;
goto out;
}
@@ -610,6 +645,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
u->dir, start_url_parsed->dir));
+ reason = PARENT;
goto out;
}
}
@@ -622,12 +658,14 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!accdir (u->dir))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
+ reason = LIST;
goto out;
}
}
if (!accept_url (url))
{
DEBUGP (("%s is excluded/not-included through regex.\n", url));
+ reason = REGEX;
goto out;
}
@@ -652,6 +690,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
url, u->file));
+ reason = RULES;
goto out;
}
}
@@ -662,6 +701,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("This is not the same hostname as the parent's (%s and
%s).\n",
u->host, parent->host));
+ reason = SPANNEDHOST;
goto out;
}
@@ -704,20 +744,21 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
blacklist_add (blacklist, url);
+ reason = ROBOTS;
goto out;
}
}
- /* The URL has passed all the tests. It can be placed in the
- download queue. */
- DEBUGP (("Decided to load it.\n"));
+ out:
- return true;
-
- out:
- DEBUGP (("Decided NOT to load it.\n"));
+ if (reason == SUCCESS)
+ /* The URL has passed all the tests. It can be placed in the
+ download queue. */
+ DEBUGP (("Decided to load it.\n"));
+ else
+ DEBUGP (("Decided NOT to load it.\n"));
- return false;
+ return reason;
}
/* This function determines whether we will consider downloading the
@@ -725,14 +766,14 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
possibly to another host, etc. It is needed very rarely, and thus
it is merely a simple-minded wrapper around download_child_p. */
-static bool
+static reject_reason
descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *new_parsed;
struct urlpos *upos;
- bool success;
+ reject_reason reason;
assert (orig_parsed != NULL);
@@ -742,10 +783,10 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
- success = download_child_p (upos, orig_parsed, depth,
+ reason = download_child_p (upos, orig_parsed, depth,
start_url_parsed, blacklist, iri);
- if (success)
+ if (reason == SUCCESS)
blacklist_add (blacklist, upos->url->url);
else
DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
@@ -753,7 +794,70 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
url_free (new_parsed);
xfree (upos);
- return success;
+ return reason;
+}
+
+
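+/* Write the components of URL to F as comma separated values. */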
+static void
+write_url_csv (FILE *f, struct url *url)
+{
+ const char *scheme_str = NULL;
+ switch (url->scheme)
+ {
+ case SCHEME_HTTP: scheme_str = "SCHEME_HTTP"; break;
+ #ifdef HAVE_SSL
+ case SCHEME_HTTPS: scheme_str = "SCHEME_HTTPS"; break;
+ #endif
+ case SCHEME_FTP: scheme_str = "SCHEME_FTP"; break;
+ case SCHEME_INVALID: scheme_str = "SCHEME_INVALID"; break;
+ }
+
+ /* params, query and fragment may be NULL; never pass NULL to %s. */
+ fprintf (f, "%s,%s,%s,%i,%s,%s,%s,%s",
+ url->url,
+ scheme_str,
+ url->host,
+ url->port,
+ url->path,
+ url->params ? url->params : "",
+ url->query ? url->query : "",
+ url->fragment ? url->fragment : "");
+}
+
+/* This function writes out information on why a URL was rejected and its
+ context from download_child_p, such as the URL being rejected and its
+ parent's URL. The format it uses is comma separated values. */
+static void
+write_reject_reason_csv (FILE *f, reject_reason r, struct url *url,
+ struct url *parent)
+{
+ const char *reason_str = NULL;
+
+ /* Do nothing unless --rejected-log was given and successfully opened. */
+ if (!f)
+ return;
+ switch (r)
+ {
+ case SUCCESS: reason_str = "SUCCESS"; break;
+ case BLACKLIST: reason_str = "BLACKLIST"; break;
+ case NOTHTTPS: reason_str = "NOTHTTPS"; break;
+ case NONHTTP: reason_str = "NONHTTP"; break;
+ case ABSOLUTE: reason_str = "ABSOLUTE"; break;
+ case DOMAIN: reason_str = "DOMAIN"; break;
+ case PARENT: reason_str = "PARENT"; break;
+ case LIST: reason_str = "LIST"; break;
+ case REGEX: reason_str = "REGEX"; break;
+ case RULES: reason_str = "RULES"; break;
+ case SPANNEDHOST: reason_str = "SPANNEDHOST"; break;
+ case ROBOTS: reason_str = "ROBOTS"; break;
+ }
+
+ fprintf(f, "%s,", reason_str);
+ write_url_csv(f, url);
+ fprintf(f, ",");
+ write_url_csv(f, parent);
+ fprintf(f, "\n");
}
/* vim:set sts=2 sw=2 cino+={s: */
--
2.4.6