# # # add_file "pcrewrap.cc" # content [cf2674222370e2902e15901902961e4e603c12c0] # # add_file "pcrewrap.hh" # content [206922378c80ba9fd283ccd510af1e7857328ef6] # # patch "ChangeLog" # from [172919140a9f04e8ebbd1d628c93915e951dc8f1] # to [5e7be0ecfa379320d569de711c255345b07a74a7] # # patch "Makefile.am" # from [c77994d06248ca5e37577009cb6445132cb1fe64] # to [cdd7815c1e675ef9cb034b2a0fd9d2bdb36a1f55] # # patch "configure.ac" # from [dc18dea85941b866f1520a9c787296cd87417369] # to [661fc6adca568307e46e8d53d8dbd51c6ead71f0] # # patch "diff_patch.cc" # from [a8b1d3481a4ff79239887399a16f224df14a408a] # to [11ffd8cefc404f68b6488d385807739bf5b287ca] # # patch "globish.cc" # from [85d971be4e285160674cbb8cd1ae6e948f433037] # to [de92057faf87bd383e7c587f1dec11c16dce53e4] # # patch "globish.hh" # from [0cc1789ca486ca25778d7c88b5e935c916503091] # to [0ce72827a54b9f14bb186be32cf0731ab40e86ac] # # patch "lua.cc" # from [e0af7e037ef78d315f4fa61c58f7f11c4ec25768] # to [6e59fe049bc11b83e177f02d9bf5620335c80488] # # patch "luaext_globish.cc" # from [ab1acf27ba10c8676e264f6046be753f4884d590] # to [506249a961f0209231535ac1c696bda6442d22da] # # patch "netsync.cc" # from [8473129fbbfc6f05efcb2471f7cfbce8e89f51ef] # to [f286f2af714d0047ff6de97b163860ee2d9118c4] # # patch "packet.cc" # from [a59b48af0d11f15f912c85e5cc53a04f7f17b5f9] # to [5042d84abe71bbf3e1027a4475e51955370bef79] # # patch "pch.hh" # from [e561236387f43aa08076cda7b35e4af7be163be9] # to [86ad5ca677a37ec6382332441b926c3458b77b4e] # # patch "uri.cc" # from [72edc51b298f51cac2c9b431ac72e5987f6d53f9] # to [81b7b978e2ed0e02f6edee4088ae324940e9b52d] # ============================================================ --- pcrewrap.cc cf2674222370e2902e15901902961e4e603c12c0 +++ pcrewrap.cc cf2674222370e2902e15901902961e4e603c12c0 @@ -0,0 +1,182 @@ +#include +#include +#include +#include + +#include "pcrewrap.hh" + +#define pcre pcre_t +#include "pcre.h" +#undef pcre + +static unsigned int +flags_to_internal(pcre::flags f) +{ + using 
namespace pcre; +#define C(f_, x) (((f_) & (x)) ? PCRE_##x : 0) + unsigned int i = 0; + i |= C(f, NEWLINE_CR); + i |= C(f, NEWLINE_LF); + // NEWLINE_CRLF is handled above + i |= C(f, ANCHORED); + i |= C(f, NOTBOL); + i |= C(f, NOTEOL); + i |= C(f, NOTEMPTY); + i |= C(f, CASELESS); + i |= C(f, DOLLAR_ENDONLY); + i |= C(f, DOTALL); + i |= C(f, DUPNAMES); + i |= C(f, EXTENDED); + i |= C(f, FIRSTLINE); + i |= C(f, MULTILINE); + i |= C(f, UNGREEDY); + return i; +} + +namespace pcre +{ + void regex::init(const char *pattern, pcre::flags options) + { + int erroff; + const char *err; + basedat = static_cast + (pcre_compile(pattern, flags_to_internal(options), &err, &erroff, 0)); + if (!basedat) + throw compile_error(err, erroff, pattern); + + int errcode = pcre_fullinfo(static_cast(basedat), 0, + PCRE_INFO_CAPTURECOUNT, + static_cast(&capturecount)); + if (errcode < 0) + throw compile_error((F("pcre_fullinfo error %d") % errcode).str().c_str(), + 0, pattern); + } + + regex::regex(const char *pattern, pcre::flags options) + : basedat(0), extradat(0), capturecount(0) + { + this->init(pattern, options); + } + + regex::regex(const std::string &pattern, pcre::flags options) + : basedat(0), extradat(0), capturecount(0) + { + this->init(pattern.c_str(), options); + } + + regex::~regex() + { + if (basedat) + pcre_free(const_cast(basedat)); + if (extradat) + pcre_free(const_cast(extradat)); + } + + void regex::study() + { + const char *err; + extradat = static_cast + (pcre_study(static_cast(basedat), 0, &err)); + if (err) + throw study_error(err); + } + + bool + regex::match(const std::string &subject, matches &result, + std::string::const_iterator startptr, + pcre::flags options) const + { + int startoffset = 0; + if (startptr != std::string::const_iterator(0)) + startoffset = &*startptr - &*subject.data(); + + // pcre_exec has a bizarro calling convention. It wants ovec to + // provide three integer slots per capturing paren, plus three + // more (for the whole-pattern match). 
The first two-thirds of + // the vector will contain useful pairs of integers on exit from + // pcre_exec; the last third will be used as scribble space by + // pcre_exec. (Why can't it allocate its own damn scribble space?) + std::vector ovec((capturecount + 1) * 3); + int rc = pcre_exec(static_cast(basedat), + static_cast(extradat), + subject.data(), subject.size(), + startoffset, + flags_to_internal(options), + &ovec.front(), ovec.size()); // ??? ovec.data() + if (rc >= 0) + { + // If the return value is nonnegative, the pattern matched, + // and rc is one more than the number of pairs of integers in + // ovec that are meaningful. + result.clear(); + result.reserve(capturecount + 1); + for (int i = 0; i < rc * 2; i += 2) + { + if (ovec[i] == -1 && ovec[i+1] == -1) + result.push_back(capture(std::string::const_iterator(0), + std::string::const_iterator(0))); + else if (ovec[i] == -1 || ovec[i+1] == -1) + throw match_error(PCRE_ERROR_INTERNAL); // should never happen + else + result.push_back(capture(subject.begin() + ovec[i], + subject.begin() + ovec[i+1])); + } + for (int i = rc; i < capturecount + 1; i++) + result.push_back(capture(std::string::const_iterator(0), + std::string::const_iterator(0))); + I(result.size() == capturecount + 1); + return true; + } + else if (rc == PCRE_ERROR_NOMATCH) + { + result = matches(capturecount + 1, + capture(std::string::const_iterator(0), + std::string::const_iterator(0))); + I(result.size() == capturecount + 1); + return false; + } + else + throw match_error(rc); + } + + // This overload is for when you don't care about captures, only + // whether or not it matched. 
+ bool + regex::match(const std::string &subject, + std::string::const_iterator startptr, + pcre::flags options) const + { + int startoffset = 0; + if (startptr != std::string::const_iterator(0)) + startoffset = &*startptr - &*subject.data(); + + int rc = pcre_exec(static_cast(basedat), + static_cast(extradat), + subject.data(), subject.size(), + startoffset, flags_to_internal(options), 0, 0); + if (rc == 0) + return true; + else if (rc == PCRE_ERROR_NOMATCH) + return false; + else + throw match_error(rc); + } + + // error handling. + + static std::string + compile_error_message(const char *err, int offset, const char *pattern) + { + return (F("parse error at char %d in pattern '%s': %s") + % offset % pattern % err).str(); + } + + compile_error::compile_error(const char *err, int offset, const char *pattern) + : std::runtime_error(compile_error_message(err, offset, pattern)) + {} + + match_error::match_error(int code) + : std::runtime_error((F("Error during matching, code %d") % code).str()) + {} + +} // namespace pcre ============================================================ --- pcrewrap.hh 206922378c80ba9fd283ccd510af1e7857328ef6 +++ pcrewrap.hh 206922378c80ba9fd283ccd510af1e7857328ef6 @@ -0,0 +1,141 @@ +#ifndef _PCREWRAP_HH +#define _PCREWRAP_HH + +#include +#include +#include +#include "sanity.hh" + +// This is a sensible C++ wrapper interface around the bare C API exported +// by pcre.h. Note that pcre.h is a very "noisy" header in terms of macro +// definitions and so we don't actually expose it here. 
+ +namespace pcre +{ + enum flags + { + // flags usable with both pcre_compile and pcre_exec + DEFAULT = 0x0000, // no special behavior + NEWLINE_CR = 0x0001, // newline is \r + NEWLINE_LF = 0x0002, // newline is \n + NEWLINE_CRLF = (NEWLINE_CR|NEWLINE_LF), // newline is \r\n + ANCHORED = 0x0004, // match only at beginning + // of string (\A in pat) + // flags usable only with pcre_exec + NOTBOL = 0x0008, // beginning of string isn't beginning of line + NOTEOL = 0x0010, // end of string isn't end of line + NOTEMPTY = 0x0020, // an empty match is a match failure + + // flags usable only with pcre_compile + CASELESS = 0x0040, // case insensitive match (?i) + DOLLAR_ENDONLY = 0x0080, // only in !MULTILINE mode, $ equiv to \Z + DOTALL = 0x0100, // dot matches newline (?s) + DUPNAMES = 0x0200, // permit duplicate names for named captures + EXTENDED = 0x0400, // whitespace permitted in syntax (?x) + FIRSTLINE = 0x0800, // match must begin before first newline + MULTILINE = 0x1000, // ^ and $ match at internal newlines (?m) + UNGREEDY = 0x4000, // quantifiers aren't greedy unless + // followed with ? (opposite of default) + }; + + // A capture object is a pair of string iterators, such that + // [C.first, C.second) is the range of characters in the "subject" + // string captured by either a full match or some pair of capturing + // parentheses. If both C.first and C.second are null, then the + // associated part of the regex did not match. It is an invariant + // that either both or neither C.first and C.second are null. The + // object provides a couple of helper operations, matched() and str(), + // for common use cases. 
+ struct capture : public std::pair + { + capture(std::string::const_iterator a, + std::string::const_iterator b) + : std::pair + (a, b) + { I((a == std::string::const_iterator(0) + && b == std::string::const_iterator(0)) + || (a != std::string::const_iterator(0) + && b != std::string::const_iterator(0))); } + + bool matched() { return (this->first != std::string::const_iterator(0)); } + std::string str() { return std::string(this->first, this->second); } + }; + + // A matches object stores the result of a PCRE match operation. It + // is a vector of capture objects (see above) such that element N + // corresponds to capture group N of the regexp. Per usual, + // match[0] encompasses the string matched by the entire regular + // expression. + typedef std::vector matches; + + // A regex object is the compiled form of a PCRE regular expression. + class regex + { + const void *basedat; + const void *extradat; + int capturecount; + + // default and copy constructors are restricted + regex(); + regex(const regex &); + regex &operator=(const regex &); + + // thanks to silly C++ we have to have an internal "initialize" method + void init(const char *, pcre::flags); + + public: + regex(const char *pattern, pcre::flags options = DEFAULT); + regex(const std::string &pattern, pcre::flags options = DEFAULT); + ~regex(); + + void study(); // do extra upfront work to speed up subsequent matches + + bool match(const std::string &subject, matches &result, + std::string::const_iterator startoffset + = std::string::const_iterator(), + pcre::flags options = DEFAULT) const; + + bool match(const std::string &subject, + std::string::const_iterator startoffset + = std::string::const_iterator(), + pcre::flags options = DEFAULT) const; + + + // helper function which starts successive matches at the position + // where the last match left off. 
+ bool nextmatch(const std::string &subject, matches &result, + pcre::flags options = DEFAULT) const + { + std::string::const_iterator startoffset(0); + if (result.size() > 0 && result[0].matched()) + startoffset = result[0].second; + return match(subject, result, startoffset, options); + } + }; + + // For later: regex variant that takes monotone's "utf8" pseudostrings and + // sets PCRE_UTF8; named capture support. + + // exceptions thrown for errors from PCRE APIs + struct compile_error : public std::runtime_error + { + explicit compile_error(char const * error, int offset, + char const * pattern); + virtual ~compile_error() throw() {}; + }; + + struct study_error : public std::runtime_error + { + explicit study_error(char const * error) : runtime_error(error) {}; + virtual ~study_error() throw() {}; + }; + + struct match_error : public std::runtime_error + { + explicit match_error(int code); + virtual ~match_error() throw() {}; + }; + +} // namespace pcre +#endif ============================================================ --- ChangeLog 172919140a9f04e8ebbd1d628c93915e951dc8f1 +++ ChangeLog 5e7be0ecfa379320d569de711c255345b07a74a7 @@ -1,3 +1,15 @@ +2006-11-25 Zack Weinberg + + * pcrewrap.cc, pcrewrap.hh: New files. + * Makefile.am: Add them to MOST_SOURCES. Add pcrewrap.cc to + tester_SOURCES. + * configure.ac: Look for libpcre, not libboost-regex. + * pch.hh: Don't include boost/regex.hpp. + * diff_patch.cc, globish.cc, lua.cc, luaext_globish.cc + * packet.cc, uri.cc: Update to new regex interface. + * netsync.cc: Remove vestigial include of boost/regex.hpp + (there are no uses of regular expressions in this file). 
+ 2006-11-20 Matt Johnston * tests/mtn_execute_attr_respects_umask/: use ls -l rather than @@ -8,7 +20,7 @@ 2006-11-20 Thomas Keller @@ -53,7 +65,7 @@ 2006-11-17 Thomas Keller - * cmd_automate.cc, cmd.hh, ...: renamed member variable + * cmd_automate.cc, cmd.hh, ...: renamed member variable "options" to "opts" to make (hopefully) the gcc-3.3 build slaves happy @@ -116,7 +128,7 @@ 2006-11-11 Thomas Moschny * notes/: New directory for developer docs. @@ -146,7 +158,7 @@ 2006-11-10 Nathaniel Smith Improved error message for invalid uri's (per bug #17736) @@ -307,7 +319,7 @@ 2006-10-03 Evan Deaubl * po/sv.po: 35 fuzzies cleared, @@ -407,7 +419,7 @@ 2006-09-23 Grahame Bowland - * automate.cc (get_content_changed): use the file_content + * automate.cc (get_content_changed): use the file_content make rather than the parent_name mark. 2006-09-23 Nathaniel Smith @@ -425,7 +437,7 @@ 2006-09-22 Nathaniel Smith * graph.cc: I dunno, maybe this will make the buildslaves happier? - + 2006-09-22 Nathaniel Smith * graph.cc: Add randomized tests for get_reconstruction_path. 
============================================================ --- Makefile.am c77994d06248ca5e37577009cb6445132cb1fe64 +++ Makefile.am cdd7815c1e675ef9cb034b2a0fd9d2bdb36a1f55 @@ -68,6 +68,7 @@ MOST_SOURCES = \ graph.cc graph.hh \ roster_delta.cc roster_delta.hh \ sha1.hh sha1.cc sha1_engine.hh \ + pcrewrap.cc pcrewrap.hh \ rev_height.cc rev_height.hh \ \ lru_writeback_cache.hh \ @@ -278,7 +279,7 @@ unit_tests_SOURCES = $(MOST_SOURCES) uni mtn_SOURCES = $(MOST_SOURCES) monotone.cc usher_SOURCES = contrib/usher.cc unit_tests_SOURCES = $(MOST_SOURCES) unit_tests.cc crypto_tests.cc -tester_SOURCES = $(SANITY_CORE_SOURCES) $(LUAEXT_SOURCES) tester.cc +tester_SOURCES = $(SANITY_CORE_SOURCES) $(LUAEXT_SOURCES) pcrewrap.cc tester.cc txt2c_SOURCES = txt2c.cc ============================================================ --- configure.ac dc18dea85941b866f1520a9c787296cd87417369 +++ configure.ac 661fc6adca568307e46e8d53d8dbd51c6ead71f0 @@ -97,13 +97,15 @@ AC_SEARCH_LIBS([deflate], [z], , AC_MSG_ # simple library checks AC_SEARCH_LIBS([deflate], [z], , AC_MSG_FAILURE([zlib is required])) +# FIXME Use pcre-config; bundled pcre support. 
+AC_SEARCH_LIBS([pcre_exec], [pcre], , AC_MSG_FAILURE([libpcre is required])) + # check for all things boost-related BOOST_VERSION_CHECK BOOST_VERSION_SPECIFIC_BUGS MTN_BOOST_LIB_FILESYSTEM MTN_BOOST_LIB_DATE_TIME -MTN_BOOST_LIB_REGEX MTN_BOOST_LIB_UNIT_TEST_FRAMEWORK # more complex library checks ============================================================ --- diff_patch.cc a8b1d3481a4ff79239887399a16f224df14a408a +++ diff_patch.cc 11ffd8cefc404f68b6488d385807739bf5b287ca @@ -18,7 +18,6 @@ #include #include -#include #include "diff_patch.hh" #include "interner.hh" #include "lcs.hh" @@ -32,6 +31,7 @@ #include "revision.hh" #include "constants.hh" #include "localized_file_io.hh" +#include "pcrewrap.hh" using std::endl; using std::make_pair; @@ -782,7 +782,7 @@ struct hunk_consumer vector const & b; size_t ctx; ostream & ost; - boost::scoped_ptr encloser_re; + boost::scoped_ptr encloser_re; size_t a_begin, b_begin, a_len, b_len; long skew; @@ -805,7 +805,7 @@ struct hunk_consumer encloser_last_match(a.rend()), encloser_last_search(a.rend()) { if (encloser_pattern != "") - encloser_re.reset(new boost::regex(encloser_pattern)); + encloser_re.reset(new pcre::regex(encloser_pattern)); } }; @@ -835,7 +835,7 @@ hunk_consumer::find_encloser(size_t pos, // i is a reverse_iterator, so this loop goes backward through the vector. for (; i != last; i++) - if (boost::regex_search (*i, *encloser_re)) + if (encloser_re->match(*i)) { encloser_last_match = i; break; ============================================================ --- globish.cc 85d971be4e285160674cbb8cd1ae6e948f433037 +++ globish.cc de92057faf87bd383e7c587f1dec11c16dce53e4 @@ -13,11 +13,9 @@ using std::vector; using std::string; using std::vector; -using boost::regex_match; - -// this converts a globish pattern to a regex. The regex should be usable by -// the Boost regex library operating in default mode, i.e., it should be a -// valid ECMAscript regex. +// this converts a globish pattern to a regex. 
The regex should be usable +// by the PCRE library operating in default mode, i.e., it should be a valid +// (close approximation to) Perl regex. // // Pattern tranformation: // @@ -46,21 +44,20 @@ maybe_quote(char c, string & re) re += c; } -static void -checked_globish_to_regex(string const & glob, string & regex) +static string +checked_globish_to_regex(string const & glob) { int in_braces = 0; // counter for levels if {} + string regex; - regex.clear(); regex.reserve(glob.size() * 2); L(FL("checked_globish_to_regex: input = '%s'") % glob); if (glob == "") - { - regex = "$.^"; - // and the below loop will do nothing - } + return "$.^"; + + regex += '^'; for (string::const_iterator i = glob.begin(); i != glob.end(); ++i) { char c = *i; @@ -77,7 +74,7 @@ checked_globish_to_regex(string const & break; case '{': in_braces++; - regex += '('; + regex += "(?:"; break; case '}': N(in_braces != 0, @@ -100,11 +97,14 @@ checked_globish_to_regex(string const & break; } } + regex += '$'; N(in_braces == 0, F("run-away brace expression in pattern '%s'") % glob); L(FL("checked_globish_to_regex: output = '%s'") % regex); + + return regex; } void @@ -116,9 +116,8 @@ combine_and_check_globish(vector c bool first = true; for (vector::const_iterator i = patterns.begin(); i != patterns.end(); ++i) { - string tmp; // run for the checking it does - checked_globish_to_regex((*i)(), tmp); + checked_globish_to_regex((*i)()); if (!first) p += ','; first = false; @@ -129,13 +128,14 @@ combine_and_check_globish(vector c pattern = utf8(p); } -globish_matcher::globish_matcher(utf8 const & include_pat, utf8 const & exclude_pat) +globish_matcher::globish_matcher(utf8 const & include_pat, + utf8 const & exclude_pat) + : iglob(include_pat()), xglob(exclude_pat()), + ipat(checked_globish_to_regex(iglob)), + xpat(checked_globish_to_regex(xglob)), + r_inc(ipat), + r_exc(xpat) { - string re; - checked_globish_to_regex(include_pat(), re); - r_inc = re; - checked_globish_to_regex(exclude_pat(), re); - 
r_exc = re; } bool @@ -143,11 +143,11 @@ globish_matcher::operator()(string const { // regex_match may throw a runtime_error, if the regex turns out to be // really pathological - bool inc_match = regex_match(s, r_inc); - bool exc_match = regex_match(s, r_exc); + bool inc_match = r_inc.match(s); + bool exc_match = r_exc.match(s); bool result = inc_match && !exc_match; L(FL("matching '%s' against '%s' excluding '%s': %s, %s: %s") - % s % r_inc % r_exc + % s % ipat % xpat % (inc_match ? "included" : "not included") % (exc_match ? "excluded" : "not excluded") % (result ? "matches" : "does not match")); @@ -161,31 +161,31 @@ UNIT_TEST(globish, checked_globish_to_re { string pat; - checked_globish_to_regex("*", pat); - BOOST_CHECK(pat == ".*"); - checked_globish_to_regex("?", pat); - BOOST_CHECK(pat == "."); - checked_globish_to_regex("{a,b,c}d", pat); - BOOST_CHECK(pat == "(a|b|c)d"); - checked_globish_to_regex("foo{a,{b,c},?*}d", pat); - BOOST_CHECK(pat == "foo(a|(b|c)|..*)d"); - checked_globish_to_regex("\\a\\b\\|\\{\\*", pat); - BOOST_CHECK(pat == "ab\\|\\{\\*"); - checked_globish_to_regex(".+$^{}", pat); - BOOST_CHECK(pat == "\\.\\+\\$\\^()"); - checked_globish_to_regex(",", pat); + pat = checked_globish_to_regex("*"); + BOOST_CHECK(pat == "^.*$"); + pat = checked_globish_to_regex("?"); + BOOST_CHECK(pat == "^.$"); + pat = checked_globish_to_regex("{a,b,c}d"); + BOOST_CHECK(pat == "^(?:a|b|c)d$"); + pat = checked_globish_to_regex("foo{a,{b,c},?*}d"); + BOOST_CHECK(pat == "^foo(?:a|(?:b|c)|..*)d$"); + pat = checked_globish_to_regex("\\a\\b\\|\\{\\*"); + BOOST_CHECK(pat == "^ab\\|\\{\\*$"); + pat = checked_globish_to_regex(".+$^{}"); + BOOST_CHECK(pat == "^\\.\\+\\$\\^(?:)$"); + pat = checked_globish_to_regex(","); // we're very conservative about metacharacters, and quote all // non-alphanumerics, hence the backslash - BOOST_CHECK(pat == "\\,"); - checked_globish_to_regex("\\.\\+\\$\\^\\(\\)", pat); - BOOST_CHECK(pat == "\\.\\+\\$\\^\\(\\)"); + BOOST_CHECK(pat == 
"^\\,$"); + pat = checked_globish_to_regex("\\.\\+\\$\\^\\(\\)"); + BOOST_CHECK(pat == "^\\.\\+\\$\\^\\(\\)$"); - BOOST_CHECK_THROW(checked_globish_to_regex("foo\\", pat), informative_failure); - BOOST_CHECK_THROW(checked_globish_to_regex("{foo", pat), informative_failure); - BOOST_CHECK_THROW(checked_globish_to_regex("{foo,bar{baz,quux}", pat), informative_failure); - BOOST_CHECK_THROW(checked_globish_to_regex("foo}", pat), informative_failure); - BOOST_CHECK_THROW(checked_globish_to_regex("foo,bar{baz,quux}}", pat), informative_failure); - BOOST_CHECK_THROW(checked_globish_to_regex("{{{{{{{{{{a,b},c},d},e},f},g},h},i},j},k}", pat), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("foo\\"), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("{foo"), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("{foo,bar{baz,quux}"), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("foo}"), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("foo,bar{baz,quux}}"), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("{{{{{{{{{{a,b},c},d},e},f},g},h},i},j},k}"), informative_failure); } UNIT_TEST(globish, combine_and_check_globish) ============================================================ --- globish.hh 0cc1789ca486ca25778d7c88b5e935c916503091 +++ globish.hh 0ce72827a54b9f14bb186be32cf0731ab40e86ac @@ -28,11 +28,11 @@ // matches nothing, not even the empty string. this hardly ever matters, but // it's nice to have some way to say "don't exclude anything", for instance. 
-#include +#include #include -#include #include "vocab.hh" +#include "pcrewrap.hh" void combine_and_check_globish(std::vector const &patterns, utf8 & pattern); @@ -45,7 +45,8 @@ private: // pathological bool operator()(std::string const & s); private: - boost::regex r_inc, r_exc; + std::string iglob, xglob, ipat, xpat; + pcre::regex r_inc, r_exc; }; // Local Variables: ============================================================ --- lua.cc e0af7e037ef78d315f4fa61c58f7f11c4ec25768 +++ lua.cc 6e59fe049bc11b83e177f02d9bf5620335c80488 @@ -5,6 +5,7 @@ #include "globish.hh" #include "sanity.hh" +#include "pcrewrap.hh" #include #include @@ -13,7 +14,6 @@ #include #include -#include using std::pair; using std::set; @@ -546,15 +546,14 @@ LUAEXT(search, regex) { const char *re = luaL_checkstring(L, -2); const char *str = luaL_checkstring(L, -1); - boost::cmatch what; bool result = false; try { - result = boost::regex_search(str, what, boost::regex(re)); - } catch (boost::bad_pattern e) { - lua_pushstring(L, e.what()); - lua_error(L); - return 0; + result = pcre::regex(re).match(str); + } catch (pcre::compile_error e) { + return luaL_error(L, (string("error parsing regex: ") + e.what()).c_str()); + } catch (pcre::match_error e) { + return luaL_error(L, (string("error during match: ") + e.what()).c_str()); } lua_pushboolean(L, result); return 1; ============================================================ --- luaext_globish.cc ab1acf27ba10c8676e264f6046be753f4884d590 +++ luaext_globish.cc 506249a961f0209231535ac1c696bda6442d22da @@ -18,8 +18,10 @@ LUAEXT(match, globish) result = globish_matcher(r, n)(s); } catch (informative_failure & e) { return luaL_error(L, e.what()); - } catch (boost::bad_pattern & e) { - return luaL_error(L, e.what()); + } catch (pcre::compile_error e) { + return luaL_error(L, (string("error parsing regex: ") + e.what()).c_str()); + } catch (pcre::match_error e) { + return luaL_error(L, (string("error during match: ") + e.what()).c_str()); } catch (...) 
{ return luaL_error(L, "Unknown error."); } ============================================================ --- netsync.cc 8473129fbbfc6f05efcb2471f7cfbce8e89f51ef +++ netsync.cc f286f2af714d0047ff6de97b163860ee2d9118c4 @@ -21,7 +21,6 @@ #include #include #include -#include #include "app_state.hh" #include "cert.hh" ============================================================ --- packet.cc a59b48af0d11f15f912c85e5cc53a04f7f17b5f9 +++ packet.cc 5042d84abe71bbf3e1027a4475e51955370bef79 @@ -10,7 +10,6 @@ #include #include -#include #include #include "app_state.hh" @@ -23,6 +22,7 @@ #include "simplestring_xform.hh" #include "keys.hh" #include "cert.hh" +#include "pcrewrap.hh" using std::endl; using std::istream; @@ -33,9 +33,6 @@ using boost::lexical_cast; using std::string; using boost::lexical_cast; -using boost::match_default; -using boost::match_results; -using boost::regex; using boost::shared_ptr; void @@ -330,15 +327,14 @@ feed_packet_consumer feed_packet_consumer { app_state & app; - size_t & count; packet_consumer & cons; string ident; string key; string certname; string base; string sp; - feed_packet_consumer(size_t & count, packet_consumer & c, app_state & app_) - : app(app_), count(count), cons(c), + feed_packet_consumer(packet_consumer & c, app_state & app_) + : app(app_), cons(c), ident(constants::regex_legal_id_bytes), key(constants::regex_legal_key_name_bytes), certname(constants::regex_legal_cert_name_bytes), @@ -349,23 +345,23 @@ feed_packet_consumer { E(x, F("malformed packet")); } - bool operator()(match_results const & res) const + void operator()(pcre::matches &res) const { if (res.size() != 4) throw oops("matched impossible packet with " + lexical_cast(res.size()) + " matching parts: " + string(res[0].first, res[0].second)); - I(res[1].matched); - I(res[2].matched); - I(res[3].matched); + I(res[1].matched()); + I(res[2].matched()); + I(res[3].matched()); string type(res[1].first, res[1].second); string args(res[2].first, res[2].second); string 
body(res[3].first, res[3].second); - if (regex_match(type, regex("[fr]data"))) + if (pcre::regex("[fr]data").match(type)) { L(FL("read data packet")); - require(regex_match(args, regex(ident))); - require(regex_match(body, regex(base))); + require(pcre::regex(ident).match(args)); + require(pcre::regex(base).match(body)); base64 > body_packed(trim_ws(body)); data contents; unpack(body_packed, contents); @@ -381,11 +377,11 @@ feed_packet_consumer else if (type == "fdelta") { L(FL("read delta packet")); - match_results matches; - require(regex_match(args, matches, regex(ident + sp + ident))); - string src_id(matches[1].first, matches[1].second); - string dst_id(matches[2].first, matches[2].second); - require(regex_match(body, regex(base))); + pcre::matches matches; + require(pcre::regex(ident + sp + ident).match(args, matches)); + string src_id(matches[1].str()); + string dst_id(matches[2].str()); + require(pcre::regex(base).match(body)); base64 > body_packed(trim_ws(body)); delta contents; unpack(body_packed, contents); @@ -396,13 +392,13 @@ feed_packet_consumer else if (type == "rcert") { L(FL("read cert packet")); - match_results matches; - require(regex_match(args, matches, regex(ident + sp + certname - + sp + key + sp + base))); - string certid(matches[1].first, matches[1].second); - string name(matches[2].first, matches[2].second); - string keyid(matches[3].first, matches[3].second); - string val(matches[4].first, matches[4].second); + pcre::matches matches; + require(pcre::regex(ident + sp + certname + sp + key + sp + base) + .match(args, matches)); + string certid(matches[1].str()); + string name(matches[2].str()); + string keyid(matches[3].str()); + string val(matches[4].str()); string contents(trim_ws(body)); // canonicalize the base64 encodings to permit searches @@ -416,8 +412,8 @@ feed_packet_consumer else if (type == "pubkey") { L(FL("read pubkey data packet")); - require(regex_match(args, regex(key))); - require(regex_match(body, regex(base))); + 
require(pcre::regex(key).match(args)); + require(pcre::regex(base).match(body)); string contents(trim_ws(body)); cons.consume_public_key(rsa_keypair_id(args), base64(contents)); @@ -425,18 +421,19 @@ feed_packet_consumer else if (type == "keypair") { L(FL("read keypair data packet")); - require(regex_match(args, regex(key))); - match_results matches; - require(regex_match(body, matches, regex(base + "#" + base))); - string pub_dat(trim_ws(string(matches[1].first, matches[1].second))); - string priv_dat(trim_ws(string(matches[2].first, matches[2].second))); + require(pcre::regex(key).match(args)); + + pcre::matches matches; + require(pcre::regex(base + "#" + base).match(body, matches)); + string pub_dat(trim_ws(matches[1].str())); + string priv_dat(trim_ws(matches[2].str())); cons.consume_key_pair(rsa_keypair_id(args), keypair(pub_dat, priv_dat)); } else if (type == "privkey") { L(FL("read pubkey data packet")); - require(regex_match(args, regex(key))); - require(regex_match(body, regex(base))); + require(pcre::regex(key).match(args)); + require(pcre::regex(base).match(body)); string contents(trim_ws(body)); keypair kp; migrate_private_key(app, @@ -448,23 +445,25 @@ feed_packet_consumer else { W(F("unknown packet type: '%s'") % type); - return true; } - ++count; - return true; } }; static size_t extract_packets(string const & s, packet_consumer & cons, app_state & app) { - static string const head("\\[([a-z]+)[[:space:]]+([^\\[\\]]+)\\]"); - static string const body("([^\\[\\]]+)"); - static string const tail("\\[end\\]"); - static string const whole = head + body + tail; - regex expr(whole); + static pcre::regex expr("\\[([a-z]+)[[:space:]]+([^\\[\\]]+)\\]" + "([^\\[\\]]+)" + "\\[end\\]"); size_t count = 0; - regex_grep(feed_packet_consumer(count, cons, app), s, expr, match_default); + pcre::matches m; + feed_packet_consumer fcons(cons, app); + + while (expr.nextmatch(s, m)) + { + fcons(m); + count++; + } return count; } 
============================================================ --- pch.hh e561236387f43aa08076cda7b35e4af7be163be9 +++ pch.hh 86ad5ca677a37ec6382332441b926c3458b77b4e @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include ============================================================ --- uri.cc 72edc51b298f51cac2c9b431ac72e5987f6d53f9 +++ uri.cc 81b7b978e2ed0e02f6edee4088ae324940e9b52d @@ -9,67 +9,62 @@ #include -#include - +#include "pcrewrap.hh" #include "sanity.hh" #include "uri.hh" using std::string; +static pcre::regex uri_rx( + "^" + "(?:([^:/?#]+):)?" // scheme + "(?://([^/?#]*))?" // authority + "([^?#]*)" // path + "(?:\\?([^#]*))?" // query + "(?:#(.*))?" // fragment + "$"); + +static pcre::regex auth_rx( + "(?:([^@]+)@)?" // user + "(?:" + "\\[([^\\]]+)]\\]" // ipv6 host + "|" + "([^:/]+)" // normal host + ")" + "(?::([[:digit:]]+))?"); // port + bool parse_uri(string const & in, uri & out) { uri u; - - // This is a simplified URI grammar. It does the basics. - - string scheme_part = "(?:([^:/?#]+):)?"; - string authority_part = "(?://([^/?#]*))?"; - string path_part = "([^?#]*)"; - string query_part = "(?:\\?([^#]*))?"; - string fragment_part = "(?:#(.*))?"; - - string uri_rx = (string("^") - + scheme_part - + authority_part - + path_part - + query_part - + fragment_part - + "$"); - - boost::match_results uri_matches; - if (boost::regex_match(in, uri_matches, boost::regex(uri_rx))) + pcre::matches uri_matches; + + if (uri_rx.match(in, uri_matches)) { + u.scheme = uri_matches[1].str(); - u.scheme = uri_matches.str(1); - // The "authority" fragment gets a bit more post-processing. 
L(FL("matched URI scheme: '%s'") % u.scheme); - if (uri_matches[2].matched) + if (uri_matches[2].matched()) { - string authority = uri_matches.str(2); + string authority = uri_matches[2].str(); L(FL("matched URI authority: '%s'") % authority); - string user_part = "(?:([^@]+)@)?"; - string ipv6_host_part = "\\[([^\\]]+)]\\]"; - string normal_host_part = "([^:/]+)"; - string host_part = "(?:" + ipv6_host_part + "|" + normal_host_part + ")"; - string port_part = "(?::([[:digit:]]+))?"; - string auth_rx = user_part + host_part + port_part; - boost::match_results auth_matches; + pcre::matches auth_matches; - N(boost::regex_match(authority, auth_matches, boost::regex(auth_rx)), - F("The URI syntax is invalid. Maybe you used an URI in scp-style?")); + N(auth_rx.match(authority, auth_matches), + F("Invalid URI authority '%s'.\n" + "Maybe you used an URI in scp-style?") % authority); - u.user = auth_matches.str(1); - u.port = auth_matches.str(4); - if (auth_matches[2].matched) - u.host = auth_matches.str(2); + u.user = auth_matches[1].str(); + u.port = auth_matches[4].str(); + if (auth_matches[2].matched()) + u.host = auth_matches[2].str(); else { - I(auth_matches[3].matched); - u.host = auth_matches.str(3); + I(auth_matches[3].matched()); + u.host = auth_matches[3].str(); } L(FL("matched URI user: '%s'") % u.user); L(FL("matched URI host: '%s'") % u.host); @@ -77,9 +72,9 @@ parse_uri(string const & in, uri & out) } - u.path = uri_matches.str(3); - u.query = uri_matches.str(4); - u.fragment = uri_matches.str(5); + u.path = uri_matches[3].str(); + u.query = uri_matches[4].str(); + u.fragment = uri_matches[5].str(); L(FL("matched URI path: '%s'") % u.path); L(FL("matched URI query: '%s'") % u.query); L(FL("matched URI fragment: '%s'") % u.fragment);