// ChangeLog entry carried by this change (2005-04-17, Richard Levitte):
//
//   * transforms.cc (glob_to_regexp): New function that takes a glob
//   expression and transforms it into a regexp.  This will be useful
//   for globbing branch expressions when collections are exchanged to
//   branch globs and regexps.
//   (glob_to_regexp_test): A unit test for glob_to_regexp().

// Appends c to re.  Every character other than an ASCII-style alphanumeric
// or '_' is preceded by a backslash so that the regexp engine treats it as
// a literal.
static void
glob_append_quoted(char c, std::string & re)
{
  // The <cctype> classification functions have undefined behaviour for
  // negative arguments, so characters with the high bit set must be
  // converted through unsigned char first.
  if (!(isalnum(static_cast<unsigned char>(c)) || c == '_'))
    re += '\\';
  re += c;
}

// glob_to_regexp converts a sh file glob to a regexp.  The regexp should
// be usable by the Boost regexp library.
//
// Pattern transformation:
//
// - Alphanumeric characters and _ are copied as they are.  Any other
//   character not described below is copied with a backslash in front of
//   it, so that it is taken literally by the regexp engine.
// - The backslash (\) escapes the following character.  The escaping
//   backslash is copied to the regexp along with the following character.
//   A backslash with nothing after it is a syntax error.
// - * is transformed to .* in the regexp.
// - ? is transformed to . in the regexp.
// - { is transformed to ( in the regexp, unless within [ and ].
// - } is transformed to ) in the regexp, unless within [ and ].
// - , is transformed to | in the regexp, if within { and } and not
//   within [ and ].
// - ^ is escaped unless it comes directly after an unescaped [.
// - ! is transformed to ^ in the regexp if it comes directly after an
//   unescaped [.
// - ] directly following an unescaped [ (or the negation marker after it)
//   is escaped.
// - - within [ and ] is copied as it is, so that ranges such as [a-z]
//   keep their meaning.
//
// On a syntax error (unbalanced braces, unterminated [, stray }, trailing
// backslash) an object of the function-local type bad_glob is thrown.
// Since that type is not visible outside this function, callers can only
// intercept it with catch (...).
//
// When BUILD_UNIT_TESTS is defined, the input and the result are traced
// to cerr.
std::string glob_to_regexp(const std::string & glob)
{
  struct bad_glob {
    bad_glob() : what("Bad glob syntax") {}
    std::string what;
  };

  int in_braces = 0;            // counter for levels of {}
  bool in_brackets = false;     // flags if we're inside a [], which
                                // has higher precedence than {}.
                                // Also, [ is accepted inside [] unescaped.
  bool this_was_opening_bracket = false;
  std::string tmp;

  // Worst case every input character is quoted or expands to two
  // characters.
  tmp.reserve(glob.size() * 2);

#ifdef BUILD_UNIT_TESTS
  std::cerr << "DEBUG[glob_to_regexp]: input = \""
            << glob << "\"" << std::endl;
#endif

  for (std::string::const_iterator i = glob.begin(); i != glob.end(); ++i)
    {
      char c = *i;
      bool last_was_opening_bracket = this_was_opening_bracket;
      this_was_opening_bracket = false;

      // Special case ^ and ! at the beginning of a [] expression: both
      // mean negation.  last_was_opening_bracket deliberately stays set,
      // so that a ] right after the negation marker is still literal.
      if (in_brackets && last_was_opening_bracket
          && (c == '!' || c == '^'))
        {
          tmp += '^';
          if (++i == glob.end())
            break;              // unterminated [, rejected after the loop
          c = *i;
        }

      if (c == '\\')
        {
          // A backslash must have something to escape; emitting it alone
          // would produce a malformed regexp.
          if (++i == glob.end())
            throw bad_glob();
          tmp += c;
          tmp += *i;
        }
      else if (in_brackets)
        {
          if (c == ']' && !last_was_opening_bracket)
            {
              in_brackets = false;
              tmp += c;
            }
          else if (c == '-')
            {
              // Quoting the dash would turn the range [a-z] into the
              // three literals a, - and z.
              tmp += c;
            }
          else
            {
              // Everything else, including ] as the first character of
              // a set, is an ordinary member of the set.
              glob_append_quoted(c, tmp);
            }
        }
      else
        {
          switch (c)
            {
            case '*':
              tmp += ".*";
              break;
            case '?':
              tmp += '.';
              break;
            case '{':
              in_braces++;
              tmp += '(';
              break;
            case '}':
              if (in_braces == 0)
                throw bad_glob();
              tmp += ')';
              in_braces--;
              break;
            case '[':
              in_brackets = true;
              this_was_opening_bracket = true;
              tmp += c;
              break;
            case ',':
              if (in_braces > 0)
                {
                  tmp += '|';
                  break;
                }
              // Trickling through to default: here, since a comma outside
              // of brace notation is just a normal character.
            default:
              glob_append_quoted(c, tmp);
              break;
            }
        }
    }

  if (in_braces != 0 || in_brackets)
    throw bad_glob();

#ifdef BUILD_UNIT_TESTS
  std::cerr << "DEBUG[glob_to_regexp]: output = \""
            << tmp << "\"" << std::endl;
#endif

  return tmp;
}

#ifdef BUILD_UNIT_TESTS
#include "unit_tests.hh"

// Unit test for glob_to_regexp().  The change registers it in
// add_transform_tests() with
//   suite->add(BOOST_TEST_CASE(&glob_to_regexp_test));
static void glob_to_regexp_test()
{
  BOOST_CHECK(glob_to_regexp("abc,v") == "abc\\,v");
  BOOST_CHECK(glob_to_regexp("foo[12m,]") == "foo[12m\\,]");
  // A full-fledged test that uses every feature at once.
  BOOST_CHECK(glob_to_regexp("foo.{bar*,cookie?{haha,hehe[^\\123!,]}}[!]a^b]")
              == "foo\\.(bar.*|cookie.(haha|hehe[^\\123\\!\\,]))[^\\]a\\^b]");
  // Ranges inside [] must survive; a dash elsewhere is a plain character.
  BOOST_CHECK(glob_to_regexp("[a-z]*") == "[a-z].*");
  BOOST_CHECK(glob_to_regexp("a-b") == "a\\-b");
}
#endif // BUILD_UNIT_TESTS