[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[lmi-commits] [lmi] master b87c2d5 2/7: Add C++ wrapper for PCRE2
From: |
Greg Chicares |
Subject: |
[lmi-commits] [lmi] master b87c2d5 2/7: Add C++ wrapper for PCRE2 |
Date: |
Sat, 2 Oct 2021 17:56:48 -0400 (EDT) |
branch: master
commit b87c2d5ffb8a837abc9f8d261930439a650f7406
Author: Gregory W. Chicares <gchicares@sbcglobal.net>
Commit: Gregory W. Chicares <gchicares@sbcglobal.net>
Add C++ wrapper for PCRE2
---
pcre_regex.hpp | 580 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 580 insertions(+)
diff --git a/pcre_regex.hpp b/pcre_regex.hpp
new file mode 100644
index 0000000..260e377
--- /dev/null
+++ b/pcre_regex.hpp
@@ -0,0 +1,580 @@
+// C++ wrapper for PCRE2 C API.
+//
+// Copyright (C) 2021 Gregory W. Chicares.
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// https://savannah.nongnu.org/projects/lmi
+// email: <gchicares@sbcglobal.net>
+// snail: Chicares, 186 Belle Woods Drive, Glastonbury CT 06033, USA
+
+#ifndef pcre_regex_hpp
+#define pcre_regex_hpp
+
+#include "config.hpp"
+
+// The PCRE library is readily available for any realistic POSIX
+// system, but not necessarily otherwise.
+#if defined LMI_POSIX
+
+// UTF-8 everywhere; UTF-[^8] nowhere.
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string_view>
+#include <vector>
+
+namespace pcre
+{
+
+namespace detail
+{
+
+/// Private RAII wrapper for pcre2_match_data.
+///
+/// A default-constructed or moved-from object holds a null pointer.
+/// The class is movable but not copyable, so at most one object ever
+/// passes a given pointer to pcre2_match_data_free().
+class match_data
+{
+  public:
+    /// Construct an object that owns nothing.
+    match_data() = default;
+
+    /// Create match data sized from the given compiled pattern.
+    ///
+    /// Throws std::runtime_error if PCRE2 returns a null pointer.
+    explicit match_data(pcre2_code* code)
+        :data_{pcre2_match_data_create_from_pattern(code, nullptr)}
+    {
+        if(!data_)
+        {
+            throw std::runtime_error
+                ("pcre2_match_data_create_from_pattern failed");
+        }
+    }
+
+    match_data(match_data const&) = delete;
+    match_data& operator=(match_data const&) = delete;
+
+    /// Take over the pointer held by 'm', leaving 'm' empty.
+    match_data(match_data&& m) noexcept
+        :data_{m.data_}
+    {
+        m.data_ = nullptr;
+    }
+
+    /// Release the current pointer, then take over the one held by 'm'.
+    match_data& operator=(match_data&& m) noexcept
+    {
+        // Self-assignment must not free the pointer we are keeping.
+        if(this != &m)
+        {
+            pcre2_match_data_free(data_); // Guaranteed NOP if data_ is null.
+
+            data_ = m.data_;
+            m.data_ = nullptr;
+        }
+
+        return *this;
+    }
+
+    ~match_data()
+    {
+        pcre2_match_data_free(data_);
+    }
+
+    /// Raw pointer for passing to PCRE2 functions; may be null.
+    [[nodiscard]] pcre2_match_data* as_pcre() const noexcept { return data_; }
+
+  private:
+    pcre2_match_data* data_ {nullptr};
+};
+
+} // namespace detail
+
+/// One captured sub-expression: the [start, end) slice of a subject view.
+///
+/// Only match_results can build a non-empty object; a default-constructed
+/// one has both offsets equal to zero and therefore reports empty().
+class sub_match
+{
+  public:
+    sub_match() = default;
+
+    /// Offset of the first character of the slice.
+    [[nodiscard]] std::size_t start_pos() const noexcept { return start_; }
+
+    /// Offset one past the last character of the slice.
+    [[nodiscard]] std::size_t end_pos() const noexcept { return end_; }
+
+    /// True when the slice has zero length.
+    [[nodiscard]] bool empty() const noexcept { return end_ == start_; }
+
+    /// The slice itself, as a view into the stored subject view.
+    [[nodiscard]] std::string_view as_view() const noexcept
+    {
+        char const* const first = view_.data() + start_;
+        std::size_t const length = end_ - start_;
+        return std::string_view(first, length);
+    }
+
+    /// Implicit conversion that copies the slice into a std::string.
+    [[nodiscard]] operator std::string() const
+    {
+        return std::string(as_view());
+    }
+
+    /// Compare the slice with arbitrary text.
+    [[nodiscard]] bool operator==(std::string_view const& view) const noexcept
+    {
+        return view == as_view();
+    }
+
+    /// Compare the slice with a single character: equal only if the
+    /// slice is exactly one character long and that character is 'c'.
+    [[nodiscard]] bool operator==(char c) const noexcept
+    {
+        std::string_view const slice = as_view();
+        return slice.size() == 1 && slice[0] == c;
+    }
+
+  private:
+    friend class match_results;
+
+    sub_match(std::string_view const& view, PCRE2_SIZE start, PCRE2_SIZE end)
+        :view_  {view}
+        ,start_ {start}
+        ,end_   {end}
+    {
+    }
+
+    std::string_view view_;
+    std::size_t      start_ {0};
+    std::size_t      end_   {0};
+};
+
+/// Write the text of a sub-match, i.e. its as_view(), to a stream.
+inline std::ostream& operator<<(std::ostream& os, sub_match const& sm)
+{
+    os << sm.as_view();
+    return os;
+}
+
+/// Represents the results of matching a regular expression.
+///
+/// Privately inherits the match data so that the offsets vector stays
+/// alive as long as this object does. Holds a view of the subject, so
+/// the subject's characters must outlive any sub_match obtained here.
+/// Movable, not copyable.
+class match_results : private detail::match_data
+{
+  public:
+    match_results() = default;
+    match_results(match_results const&) = delete;
+    match_results& operator=(match_results const&) = delete;
+    match_results(match_results&& m) = default;
+    match_results& operator=(match_results&& m) = default;
+    ~match_results() = default;
+
+    /// True if nothing was matched (including default construction).
+    [[nodiscard]] bool empty() const noexcept
+    {
+        return num_ == 0;
+    }
+
+    /// True if something was matched.
+    [[nodiscard]] explicit operator bool() const noexcept
+    {
+        return !empty();
+    }
+
+    /// The count passed at construction; valid indices are 0..size()-1.
+    [[nodiscard]] int size() const noexcept
+    {
+        return num_;
+    }
+
+    /// Return the n-th captured expression (0 is the whole match).
+    ///
+    /// Returns an empty sub_match if nothing was matched at all or if
+    /// the requested group did not participate in the match. Throws
+    /// std::runtime_error if 'n' is outside 0..size()-1, or if the
+    /// recorded start offset is greater than the end offset.
+    [[nodiscard]] sub_match operator[](int n) const
+    {
+        // Using this operator after failing to match succeeds by returning
+        // nothing for compatibility with Boost.Regex.
+        if(empty())
+            return {};
+
+        if(n < 0 || size() <= n)
+        {
+            // size() is at least 1 here, so the upper bound is >= 0.
+            std::ostringstream oss;
+            oss
+                << "Invalid captured expression index " << n
+                << " out of range 0.." << size() - 1
+                ;
+
+            throw std::runtime_error(oss.str());
+        }
+
+        // 'ovector' is a vector of pairs of offsets.
+        auto const ovector = pcre2_get_ovector_pointer(as_pcre());
+
+        auto const start = ovector[2*n    ];
+        auto const end   = ovector[2*n + 1];
+
+        // It is possible for the start to be greater than end when using \K,
+        // but we don't need to support it for now.
+        if(start > end)
+            throw std::runtime_error(R"(captures with \K not supported)");
+
+        // Sub-pattern could have been unused and unset.
+        if(start == PCRE2_UNSET)
+            return {};
+
+        // Note that this view could still be empty, but at least it is
+        // guaranteed to be valid in this case.
+        return { view_, start, end };
+    }
+
+  private:
+    friend class regex;
+
+    // 'num' is supplied by the code creating match_results, which already
+    // has it at hand, rather than being recomputed here.
+    //
+    // NOTE(review): regex::search() passes the return value of
+    // pcre2_match(); confirm against the PCRE2 documentation whether
+    // that always equals pcre2_get_ovector_count(), as trailing unset
+    // groups may make it smaller.
+    match_results
+        (detail::match_data&&    data
+        ,int                     num
+        ,std::string_view const& view
+        )
+        :detail::match_data{std::move(data)}
+        ,num_{num}
+        ,view_{view}
+    {
+    }
+
+    // Number of subexpressions.
+    int num_ {0};
+
+    // The subject that was searched; offsets are relative to it.
+    std::string_view view_;
+};
+
+/// Provides a way to define and use regular expressions using vaguely
+/// Boost.Regex-compatible API.
+///
+/// Owns the pcre2_code produced by pcre2_compile() and releases it with
+/// pcre2_code_free(). Movable, not copyable.
+class regex
+{
+  public:
+    enum
+        {ECMAScript = 0 // No effect; for compatibility with std::regex only.
+        ,icase = 1      // Compile the pattern with PCRE2_CASELESS.
+        };
+
+    // Define a Boost.Regex-compatible type.
+    using flag_type = int;
+
+    /// Compile 'pattern', honouring 'icase' if present in 'flags'.
+    ///
+    /// Throws std::runtime_error, quoting the pattern, the error offset
+    /// and the PCRE2 error message, if compilation fails.
+    explicit regex(std::string_view const& pattern, int flags = 0)
+    {
+        // We always use PCRE2_DOTALL for compatibility with Boost.Regex.
+        uint32_t options = PCRE2_DOTALL;
+        if(flags & icase)
+        {
+            options |= PCRE2_CASELESS;
+        }
+
+        int errorcode = 0;
+        PCRE2_SIZE erroroffset = 0;
+
+        code_ = pcre2_compile
+            (reinterpret_cast<PCRE2_SPTR>(pattern.data())
+            ,pattern.size()
+            ,options
+            ,&errorcode
+            ,&erroroffset
+            ,nullptr // default compile context
+            );
+
+        if(!code_)
+        {
+            std::ostringstream oss;
+            oss
+                << "Compiling regular expression \"" << pattern << "\" failed "
+                << "at offset " << erroroffset << ": "
+                << get_error_message(errorcode)
+                ;
+
+            throw std::runtime_error(oss.str());
+        }
+    }
+
+    regex(regex const&) = delete;
+    regex& operator=(regex const&) = delete;
+
+    /// Take over the compiled code held by 'rx', leaving 'rx' empty.
+    regex(regex&& rx) noexcept
+        :code_{rx.code_}
+    {
+        rx.code_ = nullptr;
+    }
+
+    /// Release the current code, then take over the one held by 'rx'.
+    regex& operator=(regex&& rx) noexcept
+    {
+        // Self-assignment must not free the code we are keeping.
+        if(this != &rx)
+        {
+            pcre2_code_free(code_); // Guaranteed NOP if code_ is null.
+
+            code_ = rx.code_;
+            rx.code_ = nullptr;
+        }
+
+        return *this;
+    }
+
+    ~regex()
+    {
+        pcre2_code_free(code_);
+    }
+
+    /// Search 'subject' from its beginning for the first match.
+    ///
+    /// Returns empty match_results if there is no match. Throws
+    /// std::runtime_error for any other PCRE2 matching error. The
+    /// result keeps a view of 'subject', which must outlive it.
+    [[nodiscard]] match_results search(std::string_view const& subject) const
+    {
+        detail::match_data md{code_};
+
+        int const rc = pcre2_match
+            (code_
+            ,reinterpret_cast<PCRE2_SPTR>(subject.data())
+            ,subject.size()
+            ,0 // starting offset
+            ,0 // no options
+            ,md.as_pcre()
+            ,nullptr // default match context
+            );
+
+        if(rc == PCRE2_ERROR_NOMATCH)
+        {
+            return match_results{};
+        }
+
+        if(rc < 0)
+        {
+            std::ostringstream oss;
+            oss
+                << "Matching regular expression failed: "
+                << get_error_message(rc)
+                ;
+            throw std::runtime_error(oss.str());
+        }
+
+        return { std::move(md), rc, subject };
+    }
+
+    /// Return a copy of 'subject' with 'replacement' substituted for
+    /// the matched text.
+    ///
+    /// PCRE2_SUBSTITUTE_GLOBAL is not passed, so (per the PCRE2
+    /// documentation) only the first match is replaced. Throws
+    /// std::runtime_error if the substitution fails.
+    [[nodiscard]] std::string replace
+        (std::string_view const& subject
+        ,std::string_view const& replacement
+        ) const
+    {
+        // Assume that the replacement won't be more than twice as long as the
+        // original string.
+        //
+        // This is almost certainly not optimal, but it's not clear how could
+        // we do better in the general case.
+        //
+        // The extra code unit leaves room for the terminating NUL and,
+        // more importantly, keeps the buffer non-empty even for an empty
+        // subject, so that a valid pointer is always passed to PCRE2.
+        PCRE2_SIZE outlen = 2*subject.size() + 1;
+
+        std::vector<PCRE2_UCHAR> buffer(outlen);
+
+        detail::match_data md{code_};
+
+        // We may need to try twice, first with the estimated buffer size and
+        // then with the actually needed one, so define the lambda performing
+        // the actual PCRE2 function call in order to retry it, if necessary.
+        auto const call_substitute =
+            [this, subject, replacement, &outlen, &buffer, &md]() -> int
+            {
+            return pcre2_substitute
+                (code_
+                ,reinterpret_cast<PCRE2_SPTR>(subject.data())
+                ,subject.size()
+                ,0 // starting offset
+                ,PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
+                ,md.as_pcre()
+                ,nullptr // default match context
+                ,reinterpret_cast<PCRE2_SPTR>(replacement.data())
+                ,replacement.size()
+                ,buffer.data()
+                ,&outlen
+                );
+            };
+
+        int rc = call_substitute();
+        if(rc == PCRE2_ERROR_NOMEMORY)
+        {
+            // The length must have been updated to be big enough now.
+            buffer.resize(outlen);
+
+            rc = call_substitute();
+        }
+
+        if(rc < 0)
+        {
+            std::ostringstream oss;
+            oss
+                << "Substituting regular expression failed: "
+                << get_error_message(rc)
+                ;
+            throw std::runtime_error(oss.str());
+        }
+
+        return std::string{reinterpret_cast<char*>(buffer.data()), outlen};
+    }
+
+  private:
+    // Return the error message corresponding to the given error code.
+    [[nodiscard]] static std::string get_error_message(int errorcode)
+    {
+        std::ostringstream oss;
+
+        // PCRE documentation states that 128 characters should be enough for
+        // all error messages, but we can afford being even more generous here.
+        char buffer[256];
+        const int rc = pcre2_get_error_message
+            (errorcode
+            ,reinterpret_cast<PCRE2_UCHAR*>(buffer)
+            ,sizeof(buffer)
+            );
+
+        if(rc > 0)
+        {
+            // On success 'rc' is the length of the message.
+            oss << std::string_view{buffer, static_cast<std::size_t>(rc)};
+        }
+        else
+        {
+            switch(rc)
+            {
+                case PCRE2_ERROR_BADDATA:
+                    oss << "unknown PCRE error " << errorcode;
+                    break;
+
+                case PCRE2_ERROR_NOMEMORY:
+                    // Still show the part of the error message we could
+                    // retrieve (which is guaranteed to be NUL-terminated).
+                    oss
+                        << std::string(buffer)
+                        << " (error message was unexpectedly truncated)"
+                        ;
+                    break;
+
+                default:
+                    // We can do nothing but give up in this case.
+                    oss
+                        << "obtaining error message for PCRE error "
+                        << errorcode
+                        << " failed with error "
+                        << rc
+                        ;
+            }
+        }
+
+        return oss.str();
+    }
+
+    pcre2_code* code_ {nullptr};
+};
+
+/// Container for match results returned by search_all().
+///
+/// Stores a view of the subject and a reference to the regex, so both
+/// the subject's characters and the regex object must outlive this
+/// container and any iterator obtained from it. Iterating consumes the
+/// stored subject view: each match shortens it from the front.
+class all_match_results
+{
+  public:
+    all_match_results(std::string_view const& subject, regex const& rx)
+        :subject_{subject}
+        ,rx_{rx}
+    {
+    }
+
+    /// Single-pass iterator over successive matches; not copyable.
+    class iterator
+    {
+      public:
+        /// Construct the end iterator.
+        iterator() = default;
+        iterator(iterator const&) = delete;
+        iterator& operator=(iterator const&) = delete;
+
+        /// The current match; only meaningful for a non-end iterator.
+        [[nodiscard]] match_results const& operator*() noexcept
+        {
+            return match_;
+        }
+
+        /// Advance to the next match. May throw whatever
+        /// regex::search() throws.
+        iterator& operator++()
+        {
+            next();
+
+            return *this;
+        }
+
+        [[nodiscard]] bool operator==(iterator const& other) const noexcept
+        {
+            // Only end iterators compare equally.
+            return !results_ && !other.results_;
+        }
+
+        // Spelled out explicitly so that range-based for loops also
+        // compile before C++20, which synthesizes it from operator==.
+        [[nodiscard]] bool operator!=(iterator const& other) const noexcept
+        {
+            return !(*this == other);
+        }
+
+      private:
+        friend class all_match_results;
+
+        // Find the first match immediately, so that a freshly created
+        // iterator is either dereferenceable or equal to the end one.
+        explicit iterator(all_match_results* results)
+            :results_{results}
+        {
+            next();
+        }
+
+        // Not noexcept: searching can throw std::runtime_error.
+        void next()
+        {
+            // Incrementing end iterator is not allowed and will crash.
+            if(!results_->get_next_match(match_))
+                results_ = nullptr;
+        }
+
+        // This pointer is null for the end iterator and valid otherwise.
+        all_match_results* results_ {nullptr};
+
+        // Always valid if results_ is non-null.
+        match_results match_;
+    };
+
+    /// Iterator positioned on the first match, or end() if none.
+    [[nodiscard]] iterator begin() { return iterator{this}; }
+
+    /// The end iterator.
+    [[nodiscard]] iterator end() noexcept { return {}; }
+
+  private:
+    friend class iterator;
+
+    // Search the remaining subject, store the result in 'm' and drop
+    // the consumed prefix. Returns false, leaving the subject empty,
+    // once there is nothing more to find.
+    [[nodiscard]] bool get_next_match(match_results& m)
+    {
+        if(subject_.empty())
+            return false;
+
+        m = rx_.search(subject_);
+        if(!m)
+        {
+            subject_ = {};
+            return false;
+        }
+
+        // Advance after this match.
+        sub_match const sm = m[0];
+        std::size_t advance = sm.end_pos();
+
+        // Generally speaking, to handle finding all occurrences of a
+        // pattern that can match an empty string, we would need to retry
+        // matching at the same offset with PCRE2_NOTEMPTY_ATSTART and
+        // PCRE2_ANCHORED flags as explained in PCRE2 documentation.
+        //
+        // But for now keep things simple, as we don't use any such
+        // patterns, and just step one character past an empty match
+        // (note that because we never use PCRE2_NEWLINE_CRLF and similar,
+        // we also don't need to worry about possibly advancing by 2
+        // characters when there is a newline at the current offset, as
+        // PCRE2 documentation also recommends and pcre2demo program
+        // shows).
+        //
+        // The step is taken from the position of the empty match, not
+        // from the start of the subject, as otherwise an empty match
+        // found at a nonzero offset would be reported again. It is
+        // skipped at the very end of the subject, where there is
+        // nothing left to step over.
+        if(sm.empty() && advance < subject_.size())
+        {
+            ++advance;
+        }
+
+        subject_.remove_prefix(advance);
+
+        return true;
+    }
+
+    std::string_view subject_;
+    regex const& rx_;
+};
+
+/// Free-function spelling of regex::search(): look for 'rx' in 's'.
+[[nodiscard]] inline match_results
+search(std::string_view const& s, regex const& rx)
+{
+    match_results found = rx.search(s);
+    return found;
+}
+
+/// Free-function spelling of regex::replace(): substitute 'r' for the
+/// text matched by 'rx' in 's' and return the resulting string.
+[[nodiscard]] inline std::string
+replace(std::string_view const& s, regex const& rx, std::string_view const& r)
+{
+    std::string substituted = rx.replace(s, r);
+    return substituted;
+}
+
+/// Build a container for iterating over every match of 'rx' in 's'.
+/// The result refers to both arguments rather than copying them.
+[[nodiscard]] inline all_match_results
+search_all(std::string_view const& s, regex const& rx)
+{
+    all_match_results matches{s, rx};
+    return matches;
+}
+
+} // namespace pcre
+
+#endif // defined LMI_POSIX
+
+#endif // pcre_regex_hpp
- [lmi-commits] [lmi] master updated (d2dd934 -> 368ee55), Greg Chicares, 2021/10/02
- [lmi-commits] [lmi] master da6b758 1/7: Make PCRE available, for pc-linux-gnu only, Greg Chicares, 2021/10/02
- [lmi-commits] [lmi] master c40b1c2 4/7: Unit-test PCRE (pc-linux-gnu only) as well as std::regex, Greg Chicares, 2021/10/02
- [lmi-commits] [lmi] master 11464e0 6/7: Use PCRE for 'test_coding_rules', Greg Chicares, 2021/10/02
- [lmi-commits] [lmi] master bcfffdd 5/7: Use an ELF 'test_coding_rules' regardless of $LMI_TRIPLET, Greg Chicares, 2021/10/02
- [lmi-commits] [lmi] master b87c2d5 2/7: Add C++ wrapper for PCRE2,
Greg Chicares <=
- [lmi-commits] [lmi] master a167c11 3/7: Unit-test std::regex instead of boost::regex, Greg Chicares, 2021/10/02
- [lmi-commits] [lmi] master 368ee55 7/7: Eradicate the boost regex library, Greg Chicares, 2021/10/02