=== modified file 'ChangeLog' --- ChangeLog 2012-03-25 11:47:53 +0000 +++ ChangeLog 2012-04-10 22:28:11 +0000 @@ -1,3 +1,8 @@ +2012-04-11 Gijs van Tulder + + * bootstrap.conf (gnulib_modules): Include module `regex'. + * configure.ac: Check for PCRE library. + 2012-03-25 Ray Satiro * configure.ac: Fix build under mingw when OpenSSL is used. === modified file 'bootstrap.conf' --- bootstrap.conf 2012-03-20 19:41:14 +0000 +++ bootstrap.conf 2012-04-04 15:09:08 +0000 @@ -58,6 +58,7 @@ quote quotearg recv +regex select send setsockopt === modified file 'configure.ac' --- configure.ac 2012-03-25 11:47:53 +0000 +++ configure.ac 2012-04-10 21:59:48 +0000 @@ -532,6 +532,18 @@ ]) ) +dnl +dnl Check for PCRE +dnl + +AC_CHECK_HEADER(pcre.h, + AC_CHECK_LIB(pcre, pcre_compile, + [LIBS="${LIBS} -lpcre" + AC_DEFINE([HAVE_LIBPCRE], 1, + [Define if libpcre is available.]) + ]) +) + dnl Needed by src/Makefile.am AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"]) === modified file 'src/ChangeLog' --- src/ChangeLog 2012-04-01 14:30:59 +0000 +++ src/ChangeLog 2012-04-10 22:30:28 +0000 @@ -1,3 +1,12 @@ +2012-04-11 Gijs van Tulder + + * init.c: Add --accept-regex, --reject-regex and --regex-type. + * main.c: Likewise. + * options.h: Likewise. + * recur.c: Likewise. + * utils.c: Add regex-related functions. + * utils.h: Add regex-related functions. + 2012-04-01 Giuseppe Scrivano * gnutls.c (wgnutls_read_timeout): Ensure timer is freed. 
=== modified file 'src/init.c' --- src/init.c 2012-03-08 09:00:51 +0000 +++ src/init.c 2012-04-10 22:10:10 +0000 @@ -46,6 +46,10 @@ # endif #endif +#include <regex.h> +#ifdef HAVE_LIBPCRE +# include <pcre.h> +#endif #ifdef HAVE_PWD_H # include <pwd.h> @@ -94,6 +98,7 @@ CMD_DECLARE (cmd_spec_prefer_family); CMD_DECLARE (cmd_spec_progress); CMD_DECLARE (cmd_spec_recursive); +CMD_DECLARE (cmd_spec_regex_type); CMD_DECLARE (cmd_spec_restrict_file_names); #ifdef HAVE_SSL CMD_DECLARE (cmd_spec_secure_protocol); @@ -116,6 +121,7 @@ } commands[] = { /* KEEP THIS LIST ALPHABETICALLY SORTED */ { "accept", &opt.accepts, cmd_vector }, + { "acceptregex", &opt.acceptregex_s, cmd_string }, { "addhostdir", &opt.add_hostdir, cmd_boolean }, { "adjustextension", &opt.adjust_extension, cmd_boolean }, { "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */ @@ -236,7 +242,9 @@ { "reclevel", &opt.reclevel, cmd_number_inf }, { "recursive", NULL, cmd_spec_recursive }, { "referer", &opt.referer, cmd_string }, + { "regextype", &opt.regex_type, cmd_spec_regex_type }, { "reject", &opt.rejects, cmd_vector }, + { "rejectregex", &opt.rejectregex_s, cmd_string }, { "relativeonly", &opt.relative_only, cmd_boolean }, { "remoteencoding", &opt.encoding_remote, cmd_string }, { "removelisting", &opt.remove_listing, cmd_boolean }, @@ -361,6 +369,8 @@ opt.restrict_files_nonascii = false; opt.restrict_files_case = restrict_no_case_restriction; + opt.regex_type = regex_type_posix; + opt.max_redirect = 20; opt.waitretry = 10; @@ -1368,6 +1378,25 @@ return true; } +/* Validate the --regex-type value VAL and store the choice in opt.regex_type; an unrecognized value is reported on stderr. 
*/ + +static bool +cmd_spec_regex_type (const char *com, const char *val, void *place_ignored) +{ + static const struct decode_item choices[] = { + { "posix", regex_type_posix }, +#ifdef HAVE_LIBPCRE + { "pcre", regex_type_pcre }, +#endif + }; + int regex_type = regex_type_posix; + int ok = decode_string (val, choices, countof (choices), &regex_type); + if (!ok) + fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val)); + opt.regex_type = regex_type; + return ok; +} + static bool cmd_spec_restrict_file_names (const char *com, const char *val, void *place_ignored) { === modified file 'src/main.c' --- src/main.c 2012-03-05 21:23:06 +0000 +++ src/main.c 2012-04-10 22:25:56 +0000 @@ -158,6 +158,7 @@ static struct cmdline_option option_data[] = { { "accept", 'A', OPT_VALUE, "accept", -1 }, + { "accept-regex", 0, OPT_VALUE, "acceptregex", -1 }, { "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 }, { "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument }, { "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 }, @@ -262,7 +263,9 @@ { "read-timeout", 0, OPT_VALUE, "readtimeout", -1 }, { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 }, { "referer", 0, OPT_VALUE, "referer", -1 }, + { "regex-type", 0, OPT_VALUE, "regextype", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, + { "reject-regex", 0, OPT_VALUE, "rejectregex", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 }, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, @@ -723,6 +726,17 @@ N_("\ -R, --reject=LIST comma-separated list of rejected extensions.\n"), N_("\ + --accept-regex=REGEX regex matching accepted URLs.\n"), + N_("\ + --reject-regex=REGEX regex matching rejected URLs.\n"), +#ifdef HAVE_LIBPCRE + N_("\ + --regex-type=TYPE regex type (posix|pcre).\n"), +#else + N_("\ + --regex-type=TYPE regex type (posix).\n"), +#endif + N_("\ -D, --domains=LIST comma-separated list of accepted domains.\n"), 
N_("\ --exclude-domains=LIST comma-separated list of rejected domains.\n"), @@ -1323,6 +1337,35 @@ exit (1); } + /* Compile the regular expressions. */ + switch (opt.regex_type) + { +#ifdef HAVE_LIBPCRE + case regex_type_pcre: + opt.regex_compile_fun = compile_pcre_regex; + opt.regex_match_fun = match_pcre_regex; + break; +#endif + + case regex_type_posix: + default: + opt.regex_compile_fun = compile_posix_regex; + opt.regex_match_fun = match_posix_regex; + break; + } + if (opt.acceptregex_s) + { + opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s); + if (!opt.acceptregex) + exit (1); + } + if (opt.rejectregex_s) + { + opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s); + if (!opt.rejectregex) + exit (1); + } + #ifdef ENABLE_IRI if (opt.enable_iri) { === modified file 'src/options.h' --- src/options.h 2012-03-05 21:23:06 +0000 +++ src/options.h 2012-04-10 22:20:26 +0000 @@ -74,6 +74,19 @@ bool ignore_case; /* Whether to ignore case when matching dirs and files */ + char *acceptregex_s; /* Patterns to accept (a regex string). */ + char *rejectregex_s; /* Patterns to reject (a regex string). */ + void *acceptregex; /* Patterns to accept (a regex struct). */ + void *rejectregex; /* Patterns to reject (a regex struct). */ + enum { +#ifdef HAVE_LIBPCRE + regex_type_pcre, +#endif + regex_type_posix + } regex_type; /* The regex library. */ + void *(*regex_compile_fun)(const char *); /* Function to compile a regex. */ + bool (*regex_match_fun)(const void *, const char *); /* Function to match a string to a regex. */ + char **domains; /* See host.c */ char **exclude_domains; bool dns_cache; /* whether we cache DNS lookups. */ === modified file 'src/recur.c' --- src/recur.c 2011-03-30 23:37:12 +0000 +++ src/recur.c 2012-04-04 17:48:34 +0000 @@ -586,6 +586,11 @@ goto out; } } + if (!accept_url (url)) + { + DEBUGP (("%s is excluded/not-included through regex.\n", url)); + goto out; + } /* 6. Check for acceptance/rejection rules. 
We ignore these rules for directories (no file name to match) and for non-leaf HTMLs, === modified file 'src/utils.c' --- src/utils.c 2012-03-29 18:13:27 +0000 +++ src/utils.c 2012-04-10 22:22:10 +0000 @@ -73,6 +73,11 @@ #include #include +#include <regex.h> +#ifdef HAVE_LIBPCRE +# include <pcre.h> +#endif + #ifndef HAVE_SIGSETJMP /* If sigsetjmp is a macro, configure won't pick it up. */ # ifdef sigsetjmp @@ -917,6 +922,19 @@ return true; } +/* Determine whether an URL is acceptable to be followed, according to + regex patterns to accept/reject. */ +bool +accept_url (const char *s) +{ + if (opt.acceptregex && !opt.regex_match_fun (opt.acceptregex, s)) + return false; + if (opt.rejectregex && opt.regex_match_fun (opt.rejectregex, s)) + return false; + + return true; +} + /* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p() will return true if and only if D2 begins with `/something/' or is exactly '/something'. */ @@ -2309,6 +2327,92 @@ return q - (char *) dest; } +#ifdef HAVE_LIBPCRE +/* Compile STR as a PCRE pattern. Returns the compiled pattern, or NULL after printing the error to stderr. */ +void * +compile_pcre_regex (const char *str) +{ + const char *errbuf; + int erroffset; + pcre *regex = pcre_compile (str, 0, &errbuf, &erroffset, 0); + + if (!regex) + { + fprintf (stderr, _("Invalid regular expression %s, %s\n"), + quote (str), errbuf); + return NULL; + } + return regex; +} +#endif + +/* Compile STR as a POSIX extended regex (REG_EXTENDED | REG_NOSUB). Returns a newly allocated regex_t, or NULL after printing the error to stderr. */ +void * +compile_posix_regex (const char *str) +{ + regex_t *regex = xmalloc (sizeof (regex_t)); + + int errcode = regcomp ((regex_t *) regex, str, REG_EXTENDED | REG_NOSUB); + + if (errcode != 0) + { + size_t errbuf_size = regerror (errcode, (regex_t *) regex, NULL, 0); + char *errbuf = xmalloc (errbuf_size); + errbuf_size = regerror (errcode, (regex_t *) regex, errbuf, errbuf_size); + fprintf (stderr, _("Invalid regular expression %s, %s\n"), + quote (str), errbuf); + xfree (errbuf); + return NULL; + } + + return regex; +} + +#ifdef HAVE_LIBPCRE +#define OVECCOUNT 30 +/* Matches a PCRE regex. 
*/ +bool +match_pcre_regex (const void *regex, const char *str) +{ + int l = strlen (str); + int ovector[OVECCOUNT]; + + int rc = pcre_exec ((pcre *) regex, 0, str, l, 0, 0, ovector, OVECCOUNT); + if (rc == PCRE_ERROR_NOMATCH) + return false; + else if (rc < 0) + { + logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"), + quote (str), rc); + return false; + } + else + return true; +} +#undef OVECCOUNT +#endif + +/* Match STR against the compiled POSIX regex REGEX. Returns true on a match; false on no match or on a matching error, whose regerror text is logged. */ +bool +match_posix_regex (const void *regex, const char *str) +{ + int rc = regexec ((regex_t *) regex, str, 0, NULL, 0); + if (rc == REG_NOMATCH) + return false; + else if (rc == 0) + return true; + else + { + size_t errbuf_size = regerror (rc, (const regex_t *) regex, NULL, 0); + char *errbuf = xmalloc (errbuf_size); + errbuf_size = regerror (rc, (const regex_t *) regex, errbuf, errbuf_size); + logprintf (LOG_VERBOSE, _("Error while matching %s: %s\n"), + quote (str), errbuf); + xfree (errbuf); + return false; + } +} + #undef IS_ASCII #undef NEXT_CHAR === modified file 'src/utils.h' --- src/utils.h 2011-01-01 12:19:37 +0000 +++ src/utils.h 2012-04-10 22:10:39 +0000 @@ -90,6 +90,7 @@ int fnmatch_nocase (const char *, const char *, int); bool acceptable (const char *); +bool accept_url (const char *); bool accdir (const char *s); char *suffix (const char *s); bool match_tail (const char *, const char *, bool); @@ -141,6 +142,14 @@ int base64_encode (const void *, int, char *); int base64_decode (const char *, void *); +#ifdef HAVE_LIBPCRE +void *compile_pcre_regex (const char *); +bool match_pcre_regex (const void *, const char *); +#endif + +void *compile_posix_regex (const char *); +bool match_posix_regex (const void *, const char *); + void stable_sort (void *, size_t, size_t, int (*) (const void *, const void *)); const char *print_decimal (double);