=== modified file 'configure.ac' --- configure.ac 2012-03-25 11:47:53 +0000 +++ configure.ac 2012-03-30 12:15:48 +0000 @@ -532,6 +532,18 @@ ]) ) +dnl +dnl Check for PCRE +dnl + +AC_CHECK_HEADER(pcre.h, + AC_CHECK_LIB(pcre, pcre_compile, + [LIBS="${LIBS} -lpcre" + AC_DEFINE([HAVE_LIBPCRE], 1, + [Define if libpcre is available.]) + ]) +) + dnl Needed by src/Makefile.am AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"]) === modified file 'src/init.c' --- src/init.c 2012-03-08 09:00:51 +0000 +++ src/init.c 2012-03-30 13:06:42 +0000 @@ -80,6 +80,9 @@ CMD_DECLARE (cmd_directory_vector); CMD_DECLARE (cmd_number); CMD_DECLARE (cmd_number_inf); +#ifdef HAVE_LIBPCRE +CMD_DECLARE (cmd_regex); +#endif CMD_DECLARE (cmd_string); CMD_DECLARE (cmd_file); CMD_DECLARE (cmd_directory); @@ -116,6 +119,9 @@ } commands[] = { /* KEEP THIS LIST ALPHABETICALLY SORTED */ { "accept", &opt.accepts, cmd_vector }, +#ifdef HAVE_LIBPCRE + { "acceptregex", &opt.acceptregex, cmd_regex }, +#endif { "addhostdir", &opt.add_hostdir, cmd_boolean }, { "adjustextension", &opt.adjust_extension, cmd_boolean }, { "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */ @@ -237,6 +243,9 @@ { "recursive", NULL, cmd_spec_recursive }, { "referer", &opt.referer, cmd_string }, { "reject", &opt.rejects, cmd_vector }, +#ifdef HAVE_LIBPCRE + { "rejectregex", &opt.rejectregex, cmd_regex }, +#endif { "relativeonly", &opt.relative_only, cmd_boolean }, { "remoteencoding", &opt.encoding_remote, cmd_string }, { "removelisting", &opt.remove_listing, cmd_boolean }, @@ -943,6 +952,30 @@ return true; } +#ifdef HAVE_LIBPCRE +/* Compile the PCRE regular expression and place a + pointer to *PLACE. 
*/
+static bool
+cmd_regex (const char *com, const char *val, void *place)
+{
+  pcre **pregex = (pcre **) place;
+  const char *error;
+  int erroffset;
+
+  /* On failure pcre_compile returns NULL and describes why in ERROR.  */
+  *pregex = pcre_compile (val, 0, &error, &erroffset, 0);
+  /* Test the compiled pattern, not PREGEX: PREGEX is the address of the
+     option slot, so testing it would never detect a bad pattern.  */
+  if (!*pregex)
+    {
+      fprintf (stderr, _("%s: %s: Invalid regular expression %s, %s\n"),
+               exec_name, com, quote (val), error);
+      return false;
+    }
+  return true;
+}
+#endif
+
 /* Like the above, but handles tilde-expansion when reading a user's `.wgetrc'. In that case, and if VAL begins with `~', the tilde === modified file 'src/main.c' --- src/main.c 2012-03-05 21:23:06 +0000 +++ src/main.c 2012-03-30 12:20:40 +0000 @@ -158,6 +158,9 @@ static struct cmdline_option option_data[] = { { "accept", 'A', OPT_VALUE, "accept", -1 }, +#ifdef HAVE_LIBPCRE + { "acceptregex", 0, OPT_VALUE, "acceptregex", -1 }, +#endif { "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 }, { "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument }, { "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 }, @@ -263,6 +266,9 @@ { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 }, { "referer", 0, OPT_VALUE, "referer", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, +#ifdef HAVE_LIBPCRE + { "rejectregex", 0, OPT_VALUE, "rejectregex", -1 }, +#endif { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 }, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, @@ -722,6 +728,12 @@ -A, --accept=LIST comma-separated list of accepted extensions.\n"), N_("\ -R, --reject=LIST comma-separated list of rejected extensions.\n"), +#ifdef HAVE_LIBPCRE + N_("\ + --acceptregex=REGEX PCRE-compatible regex matching accepted URLs.\n"), + N_("\ + --rejectregex=REGEX PCRE-compatible regex matching rejected URLs.\n"), +#endif N_("\ -D, --domains=LIST comma-separated list of accepted domains.\n"), N_("\ === modified file 'src/options.h' --- src/options.h 2012-03-05 21:23:06 +0000 +++ src/options.h 2012-03-30 12:23:09 +0000 @@ -29,6
+29,10 @@
 shall include the source code for the parts of OpenSSL used as well
 as that of the covered work. */
 
+#ifdef HAVE_LIBPCRE
+#include <pcre.h>
+#endif
+
 struct options
 {
   int verbose;                  /* Are we verbose? (First set to -1,
@@ -74,6 +78,11 @@
   bool ignore_case;             /* Whether to ignore case when matching dirs
                                    and files */
 
+#ifdef HAVE_LIBPCRE
+  pcre *acceptregex;            /* Patterns to accept. */
+  pcre *rejectregex;            /* Patterns to reject. */
+#endif
+
   char **domains;               /* See host.c */
   char **exclude_domains;
   bool dns_cache;               /* whether we cache DNS lookups. */

=== modified file 'src/recur.c'
--- src/recur.c 2011-03-30 23:37:12 +0000
+++ src/recur.c 2012-04-04 17:49:20 +0000
@@ -586,6 +586,14 @@
           goto out;
         }
     }
+#ifdef HAVE_LIBPCRE
+  /* Apply the acceptregex/rejectregex URL filters (PCRE builds only).  */
+  if (!accept_url (url))
+    {
+      DEBUGP (("%s is excluded/not-included through regex.\n", url));
+      goto out;
+    }
+#endif
 
   /* 6. Check for acceptance/rejection rules. We ignore these rules
      for directories (no file name to match) and for non-leaf HTMLs,

=== modified file 'src/utils.c'
--- src/utils.c 2012-03-29 18:13:27 +0000
+++ src/utils.c 2012-03-30 13:26:44 +0000
@@ -917,6 +917,51 @@
   return true;
 }
 
+#ifdef HAVE_LIBPCRE
+#define OVECCOUNT 30
+/* Determine whether an URL is acceptable to be followed, according to
+   regex patterns to accept/reject.
+
+   S is accepted when it matches opt.acceptregex (if set) and does not
+   match opt.rejectregex (if set).  A matching error on the accept
+   pattern rejects S; one on the reject pattern is only logged.
+   Returns true if S may be followed.  */
+bool
+accept_url (const char *s)
+{
+  int l = strlen (s);
+  int rc;
+  int ovector[OVECCOUNT];
+  bool accept = true;
+
+  if (opt.acceptregex)
+    {
+      rc = pcre_exec (opt.acceptregex, 0, s, l, 0, 0, ovector, OVECCOUNT);
+      if (rc == PCRE_ERROR_NOMATCH)
+        accept = false;
+      else if (rc < 0)
+        {
+          logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
+                     quote (s), rc);
+          accept = false;
+        }
+    }
+  if (accept && opt.rejectregex)
+    {
+      rc = pcre_exec (opt.rejectregex, 0, s, l, 0, 0, ovector, OVECCOUNT);
+      /* PCRE_ERROR_NOMATCH is the ordinary "not rejected" result, so
+         only the other negative codes are reported as errors.  */
+      if (rc >= 0)
+        accept = false;
+      else if (rc != PCRE_ERROR_NOMATCH)
+        logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
+                   quote (s), rc);
+    }
+  return accept;
+}
+#undef OVECCOUNT
+#endif
+
 /* Check if D2 is a subdirectory of D1. E.g.
if D1 is `/something', subdir_p() will return true if and only if D2 begins with `/something/' or is exactly '/something'. */ === modified file 'src/utils.h' --- src/utils.h 2011-01-01 12:19:37 +0000 +++ src/utils.h 2012-03-30 13:30:25 +0000 @@ -90,6 +90,9 @@ int fnmatch_nocase (const char *, const char *, int); bool acceptable (const char *); +#ifdef HAVE_LIBPCRE +bool accept_url (const char *); +#endif bool accdir (const char *s); char *suffix (const char *s); bool match_tail (const char *, const char *, bool);