# # # patch "ChangeLog" # from [49c9aa7b2d19c2e0d8a55800e9f9082df054ec9d] # to [59ff63b9f2b1f9cfbe8c39483127487fbef17381] # # patch "pcrewrap.cc" # from [cf2674222370e2902e15901902961e4e603c12c0] # to [ed186faf44253b9bf2d169bb900a31bd75c479f2] # # patch "pcrewrap.hh" # from [206922378c80ba9fd283ccd510af1e7857328ef6] # to [7cb55a52af7a1085561a5ace6451893fce56d804] # ============================================================ --- ChangeLog 49c9aa7b2d19c2e0d8a55800e9f9082df054ec9d +++ ChangeLog 59ff63b9f2b1f9cfbe8c39483127487fbef17381 @@ -1,3 +1,12 @@ +2006-12-27 Zack Weinberg + + * pcrewrap.cc, pcrewrap.hh: Remove redundant includes. + Add a couple of "using"s. Improve commentary. Fix formatting. + Add editor control comments to bottoms of files, and de-tabify. + Refactor construction of regex objects to facilitate precompilation. + Always call pcre_study at construction time, and remove separate + study() method. + 2006-12-18 Zack Weinberg * pcre: New directory, contains trimmed-down version of PCRE library. ============================================================ --- pcrewrap.cc cf2674222370e2902e15901902961e4e603c12c0 +++ pcrewrap.cc ed186faf44253b9bf2d169bb900a31bd75c479f2 @@ -1,15 +1,13 @@ -#include -#include -#include -#include - #include "pcrewrap.hh" #define pcre pcre_t #include "pcre.h" #undef pcre -static unsigned int +using std::string; +using std::runtime_error; + +inline unsigned int flags_to_internal(pcre::flags f) { using namespace pcre; @@ -17,7 +15,7 @@ flags_to_internal(pcre::flags f) unsigned int i = 0; i |= C(f, NEWLINE_CR); i |= C(f, NEWLINE_LF); - // NEWLINE_CRLF is handled above + // NEWLINE_CRLF == NEWLINE_CR|NEWLINE_LF and so is handled above i |= C(f, ANCHORED); i |= C(f, NOTBOL); i |= C(f, NOTEOL); @@ -30,110 +28,117 @@ flags_to_internal(pcre::flags f) i |= C(f, FIRSTLINE); i |= C(f, MULTILINE); i |= C(f, UNGREEDY); +#undef C return i; } -namespace pcre +inline std::pair +compile(const char * pattern, pcre::flags options) { - void regex::init(const char *pattern, pcre::flags options) - { - int erroff; - const char *err; - basedat = static_cast - (pcre_compile(pattern, flags_to_internal(options), &err, &erroff, 0)); - if (!basedat) - throw compile_error(err, erroff, pattern); + int erroff; + const char * err; + const pcre_t * basedat = pcre_compile(pattern, flags_to_internal(options), + &err, &erroff, 0); + if (!basedat) + throw pcre::compile_error(err, erroff, pattern); - int errcode = pcre_fullinfo(static_cast(basedat), 0, - PCRE_INFO_CAPTURECOUNT, - static_cast(&capturecount)); - if (errcode < 0) - throw compile_error((F("pcre_fullinfo error %d") % errcode).str().c_str(), - 0, pattern); - } + const pcre_extra * extradat = pcre_study(basedat, 0, &err); + if (err) + throw pcre::study_error(err); - regex::regex(const char *pattern, pcre::flags options) - : basedat(0), extradat(0), capturecount(0) - { - this->init(pattern, options); - } + return std::make_pair(static_cast(basedat), + static_cast(extradat)); +} - regex::regex(const std::string &pattern, pcre::flags options) - : basedat(0), extradat(0), capturecount(0) - { - this->init(pattern.c_str(), options); - } +inline unsigned int +get_capturecount(const void * bd) +{ + unsigned int cc; + int err = pcre_fullinfo(static_cast(bd), 0, + PCRE_INFO_CAPTURECOUNT, + static_cast(&cc)); + if (err < 0) + throw pcre::fullinfo_error(err); + return cc; +} +namespace pcre +{ + regex::regex(const char * pattern, flags options) + : basic_regex(compile(pattern, options)) + {} + + regex::regex(const string & pattern, flags options) + : basic_regex(compile(pattern.c_str(), options)) + {} + regex::~regex() { if (basedat) - pcre_free(const_cast(basedat)); + pcre_free(const_cast(basedat)); if (extradat) - pcre_free(const_cast(extradat)); + pcre_free(const_cast(extradat)); } - void regex::study() + bool + basic_regex::match(const string & subject, matches & result, + string::const_iterator startptr, + flags options) const { - const char *err; - extradat = static_cast - (pcre_study(static_cast(basedat), 0, &err)); - if (err) - throw study_error(err); - } + // pcre_exec wants its caller to provide three integer slots per + // capturing paren, plus three more for the whole-pattern match. + // On exit from pcre_exec, the first two-thirds of the vector will be + // pairs of integers representing [start, end) offsets within the + // string. pcre_exec uses the remaining third of the vector for a + // scratchpad. (Why can't it allocate its own damn scratchpad?) + unsigned int capturecount = get_capturecount(basedat); + std::vector ovec((capturecount + 1) * 3); - bool - regex::match(const std::string &subject, matches &result, - std::string::const_iterator startptr, - pcre::flags options) const - { + // convert the start pointer to an offset within the string (the &* + // converts each iterator to a bare pointer, which can be subtracted -- + // you should be able to subtract random-access iterators directly, + // grumble) int startoffset = 0; - if (startptr != std::string::const_iterator(0)) + if (startptr != string::const_iterator(0)) startoffset = &*startptr - &*subject.data(); - - // pcre_exec has a bizarro calling convention. It wants ovec to - // provide three integer slots per capturing paren, plus three - // more (for the whole-pattern match). The first two-thirds of - // the vector will contain useful pairs of integers on exit from - // pcre_exec; the last third will be used as scribble space by - // pcre_exec. (Why can't it allocate its own damn scribble space?) - std::vector ovec((capturecount + 1) * 3); + int rc = pcre_exec(static_cast(basedat), - static_cast(extradat), - subject.data(), subject.size(), - startoffset, - flags_to_internal(options), - &ovec.front(), ovec.size()); // ??? ovec.data() + static_cast(extradat), + subject.data(), subject.size(), + startoffset, + flags_to_internal(options), + &ovec.front(), ovec.size()); // ??? ovec.data() if (rc >= 0) { - // If the return value is nonnegative, the pattern matched, - // and rc is one more than the number of pairs of integers in - // ovec that are meaningful. - result.clear(); - result.reserve(capturecount + 1); - for (int i = 0; i < rc * 2; i += 2) - { - if (ovec[i] == -1 && ovec[i+1] == -1) - result.push_back(capture(std::string::const_iterator(0), - std::string::const_iterator(0))); - else if (ovec[i] == -1 || ovec[i+1] == -1) - throw match_error(PCRE_ERROR_INTERNAL); // should never happen - else - result.push_back(capture(subject.begin() + ovec[i], - subject.begin() + ovec[i+1])); - } - for (int i = rc; i < capturecount + 1; i++) - result.push_back(capture(std::string::const_iterator(0), - std::string::const_iterator(0))); - I(result.size() == capturecount + 1); - return true; + // If the return value is nonnegative, the pattern matched, + // and rc is one more than the number of pairs of integers in + // ovec that are meaningful. + result.clear(); + result.reserve(capturecount + 1); + for (int i = 0; i < rc * 2; i += 2) + { + if (ovec[i] == -1 && ovec[i+1] == -1) + result.push_back(capture(string::const_iterator(0), + string::const_iterator(0))); + else if (ovec[i] == -1 || ovec[i+1] == -1) + throw match_error(PCRE_ERROR_INTERNAL); // should never happen + else + result.push_back(capture(subject.begin() + ovec[i], + subject.begin() + ovec[i+1])); + } + for (unsigned int i = rc; i < capturecount + 1; i++) + result.push_back(capture(string::const_iterator(0), + string::const_iterator(0))); + I(result.size() == capturecount + 1); + return true; } else if (rc == PCRE_ERROR_NOMATCH) { - result = matches(capturecount + 1, - capture(std::string::const_iterator(0), - std::string::const_iterator(0))); - I(result.size() == capturecount + 1); - return false; + result = matches(capturecount + 1, + capture(string::const_iterator(0), + string::const_iterator(0))); + I(result.size() == capturecount + 1); + return false; } else throw match_error(rc); @@ -142,18 +147,18 @@ namespace pcre // This overload is for when you don't care about captures, only // whether or not it matched. bool - regex::match(const std::string &subject, - std::string::const_iterator startptr, - pcre::flags options) const + basic_regex::match(const string & subject, + string::const_iterator startptr, + flags options) const { int startoffset = 0; - if (startptr != std::string::const_iterator(0)) + if (startptr != string::const_iterator(0)) startoffset = &*startptr - &*subject.data(); int rc = pcre_exec(static_cast(basedat), - static_cast(extradat), - subject.data(), subject.size(), - startoffset, flags_to_internal(options), 0, 0); + static_cast(extradat), + subject.data(), subject.size(), + startoffset, flags_to_internal(options), 0, 0); if (rc == 0) return true; else if (rc == PCRE_ERROR_NOMATCH) @@ -164,14 +169,15 @@ namespace pcre // error handling. - static std::string - compile_error_message(const char *err, int offset, const char *pattern) + static string + compile_error_message(const char * err, int offset, const char * pattern) { return (F("parse error at char %d in pattern '%s': %s") - % offset % pattern % err).str(); + % offset % pattern % err).str(); } - compile_error::compile_error(const char *err, int offset, const char *pattern) + compile_error::compile_error(const char * err, int offset, + const char * pattern) : std::runtime_error(compile_error_message(err, offset, pattern)) {} @@ -179,4 +185,17 @@ namespace pcre : std::runtime_error((F("Error during matching, code %d") % code).str()) {} + fullinfo_error::fullinfo_error(int code) + : std::runtime_error((F("Error getting capture count, code %d") % code) + .str()) + {} + } // namespace pcre + +// Local Variables: +// mode: C++ +// fill-column: 76 +// c-file-style: "gnu" +// indent-tabs-mode: nil +// End: +// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s: ============================================================ --- pcrewrap.hh 206922378c80ba9fd283ccd510af1e7857328ef6 +++ pcrewrap.hh 7cb55a52af7a1085561a5ace6451893fce56d804 @@ -19,12 +19,12 @@ namespace pcre NEWLINE_CR = 0x0001, // newline is \r NEWLINE_LF = 0x0002, // newline is \n NEWLINE_CRLF = (NEWLINE_CR|NEWLINE_LF), // newline is \r\n - ANCHORED = 0x0004, // match only at beginning + ANCHORED = 0x0004, // match only at beginning // of string (\A in pat) // flags usable only with pcre_exec - NOTBOL = 0x0008, // beginning of string isn't beginning of line - NOTEOL = 0x0010, // end of string isn't end of line - NOTEMPTY = 0x0020, // an empty match is a match failure + NOTBOL = 0x0008, // beginning of string isn't beginning of line + NOTEOL = 0x0010, // end of string isn't end of line + NOTEMPTY = 0x0020, // an empty match is a match failure // flags usable only with pcre_compile CASELESS = 0x0040, // case insensitive match (?i) @@ -47,16 +47,16 @@ namespace pcre // object provides a couple of helper operations, matched() and str(), // for common use cases. struct capture : public std::pair + std::string::const_iterator> { capture(std::string::const_iterator a, - std::string::const_iterator b) + std::string::const_iterator b) : std::pair - (a, b) + (a, b) { I((a == std::string::const_iterator(0) - && b == std::string::const_iterator(0)) - || (a != std::string::const_iterator(0) - && b != std::string::const_iterator(0))); } + && b == std::string::const_iterator(0)) + || (a != std::string::const_iterator(0) + && b != std::string::const_iterator(0))); } bool matched() { return (this->first != std::string::const_iterator(0)); } std::string str() { return std::string(this->first, this->second); } @@ -69,73 +69,92 @@ namespace pcre // expression. typedef std::vector matches; - // A regex object is the compiled form of a PCRE regular expression. - class regex + // A basic_regex object is the compiled form of a PCRE regular expression. + // You never construct this directly. + struct basic_regex { - const void *basedat; - const void *extradat; - int capturecount; + private: + // disable the default and copy constructors + basic_regex(); + basic_regex(const basic_regex &); + basic_regex & operator=(const basic_regex &); - // default and copy constructors are restricted - regex(); - regex(const regex &); - regex &operator=(const regex &); + protected: + const void * const basedat; + const void * const extradat; - // thanks to silly C++ we have to have an internal "initialize" method - void init(const char *, pcre::flags); + // for use only by subclass constructors + basic_regex(const void * b, const void * e) : basedat(b), extradat(e) {} + basic_regex(std::pair p) + : basedat(p.first), extradat(p.second) {} public: - regex(const char *pattern, pcre::flags options = DEFAULT); - regex(const std::string &pattern, pcre::flags options = DEFAULT); - ~regex(); + ~basic_regex() {} - void study(); // do extra upfront work to speed up subsequent matches + bool match(const std::string & subject, matches & result, + std::string::const_iterator startoffset + = std::string::const_iterator(), + pcre::flags options = DEFAULT) const; - bool match(const std::string &subject, matches &result, - std::string::const_iterator startoffset - = std::string::const_iterator(), - pcre::flags options = DEFAULT) const; + bool match(const std::string & subject, + std::string::const_iterator startoffset + = std::string::const_iterator(), + pcre::flags options = DEFAULT) const; - bool match(const std::string &subject, - std::string::const_iterator startoffset - = std::string::const_iterator(), - pcre::flags options = DEFAULT) const; - // helper function which starts successive matches at the position // where the last match left off. - bool nextmatch(const std::string &subject, matches &result, - pcre::flags options = DEFAULT) const + bool nextmatch(const std::string & subject, matches & result, + pcre::flags options = DEFAULT) const { std::string::const_iterator startoffset(0); if (result.size() > 0 && result[0].matched()) - startoffset = result[0].second; + startoffset = result[0].second; return match(subject, result, startoffset, options); } }; - // For later: regex variant that takes monotone's "utf8" pseudostrings and - // sets PCRE_UTF8; named capture support. + // A regex is the class you are intended to use directly, in normal usage. + struct regex : public basic_regex + { + regex(const char * pattern, pcre::flags options = DEFAULT); + regex(const std::string & pattern, pcre::flags options = DEFAULT); + ~regex(); + }; // exceptions thrown for errors from PCRE APIs struct compile_error : public std::runtime_error { explicit compile_error(char const * error, int offset, - char const * pattern); - virtual ~compile_error() throw() {}; + char const * pattern); + virtual ~compile_error() throw() {} }; struct study_error : public std::runtime_error { explicit study_error(char const * error) : runtime_error(error) {}; - virtual ~study_error() throw() {}; + virtual ~study_error() throw() {} }; + struct fullinfo_error : public std::runtime_error + { + explicit fullinfo_error(int code); + virtual ~fullinfo_error() throw() {} + }; + struct match_error : public std::runtime_error { explicit match_error(int code); - virtual ~match_error() throw() {}; + virtual ~match_error() throw() {} }; } // namespace pcre #endif + +// Local Variables: +// mode: C++ +// fill-column: 76 +// c-file-style: "gnu" +// indent-tabs-mode: nil +// End: +// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s: