/* regexprops.c -- document the properties of the regular expressions understood by gnulib. Copyright 2005 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /* Written by James Youngman, . */ #include #include #include #include #include "regex.h" #include "regextype.h" /* Name this program was run with. */ char *program_name; static void output(const char *s, int escape) { (void) escape; fputs(s, stdout); } static void newline(void) { output("\n", 0); } static void content(const char *s) { output(s, 1); } static void literal(const char *s) { output(s, 0); } static void directive(const char *s) { output(s, 0); } static void enum_item(const char *s) { newline(); directive("@item "); literal(s); newline(); } static void begin_subsection(const char *name, const char *next, const char *prev, const char *up) { (void) next; (void) prev; (void) up; newline(); directive("@node "); content(name); content(" regular expression syntax"); newline(); directive("@subsection "); output("@samp{", 0); content(name); output("}", 0); content(" regular expression syntax"); newline(); } static void begintable_markup(char const *markup) { newline(); directive("@table "); literal(markup); newline(); } static void endtable() { newline(); directive("@end table"); newline(); } static void beginenum() { newline(); directive("@enumerate"); newline(); } static void endenum() { newline(); directive("@end enumerate"); newline(); } static void newpara() { content("\n\n"); } static void describe_regex_syntax(int options) { newpara(); content("The character @samp{.} matches any single character"); if ( (options & RE_DOT_NEWLINE) == 0 ) { content(" except newline"); } if (options & RE_DOT_NOT_NULL) { if ( (options & RE_DOT_NEWLINE) == 0 ) content(" and"); else content(" except"); content(" the null character"); } content(". "); newpara(); if (!(options & RE_LIMITED_OPS)) { begintable_markup("@samp"); if (options & RE_BK_PLUS_QM) { enum_item("\\+"); content("indicates that the regular expression should match one" " or more occurrences of the previous atom or regexp. "); enum_item("\\?"); content("indicates that the regular expression should match zero" " or one occurrence of the previous atom or regexp. "); enum_item("+ and ? "); content("match themselves. "); } else { enum_item("+"); content("indicates that the regular expression should match one" " or more occurrences of the previous atom or regexp. "); enum_item("?"); content("indicates that the regular expression should match zero" " or one occurrence of the previous atom or regexp. "); enum_item("\\+"); literal("matches a @samp{+}"); enum_item("\\?"); literal("matches a @samp{?}. "); } endtable(); } newpara(); content("Bracket expressions are used to match ranges of characters. "); literal("Bracket expressions where the range is backward, for example @samp{[z-a]}, are "); if (options & RE_NO_EMPTY_RANGES) content("invalid"); else content("ignored"); content(". "); if (options & RE_BACKSLASH_ESCAPE_IN_LISTS) literal("Within square brackets, @samp{\\} can be used to quote " "the following character. "); else literal("Within square brackets, @samp{\\} is taken literally. "); if (options & RE_CHAR_CLASSES) content("Character classes are supported; for example " "@samp{[[:digit:]]} will match a single decimal digit. "); else literal("Character classes are not supported, so for example " "you would need to use @samp{[0-9]} " "instead of @samp{[[:digit:]]}. "); if (options & RE_HAT_LISTS_NOT_NEWLINE) { literal("Non-matching lists @address@hidden do not ever match newline. "); } newpara(); if (options & RE_NO_GNU_OPS) { content("GNU extensions are not supported and so " "@samp{\\w}, @samp{\\W}, @samp{\\<}, @samp{\\>}, @samp{\\b}, @samp{\\B}, @samp{\\`}, and @samp{\\'} " "match " "@samp{w}, @samp{W}, @samp{<}, @samp{>}, @samp{b}, @samp{B}, @samp{`}, and @samp{'} respectively. "); } else { content("GNU extensions are supported:"); beginenum(); enum_item("@samp{\\w} matches a character within a word"); enum_item("@samp{\\W} matches a character which is not within a word"); enum_item("@samp{\\<} matches the beginning of a word"); enum_item("@samp{\\>} matches the end of a word"); enum_item("@samp{\\b} matches a word boundary"); enum_item("@samp{\\B} matches characters which are not a word boundary"); enum_item("@samp{\\`} matches the beginning of the whole input"); enum_item("@samp{\\'} matches the end of the whole input"); endenum(); } newpara(); if (options & RE_NO_BK_PARENS) { literal("Grouping is performed with parentheses @samp{()}. "); if (options & RE_UNMATCHED_RIGHT_PAREN_ORD) literal("An unmatched @samp{)} matches just itself. "); } else { literal("Grouping is performed with backslashes followed by parentheses @samp{\\(}, @samp{\\)}. "); } if (options & RE_NO_BK_REFS) { content("A backslash followed by a digit matches that digit. "); } else { literal("A backslash followed by a digit acts as a back-reference and matches the same thing as the previous grouped expression indicated by that number. For example @samp{\\2} matches the second group expression. The order of group expressions is determined by the position of their opening parenthesis "); if (options & RE_NO_BK_PARENS) literal("@samp{(}"); else literal("@samp{\\(}"); content(". "); } newpara(); if (!(options & RE_LIMITED_OPS)) { if (options & RE_NO_BK_VBAR) literal("The alternation operator is @samp{|}. "); else literal("The alternation operator is @samp{\\|}. "); } newpara(); if (options & RE_CONTEXT_INDEP_ANCHORS) { literal("The characters @samp{^} and @samp{$} always represent the beginning and end of a string respectively, except within square brackets. Within brackets, @samp{^} can be used to invert the membership of the character class being specified. "); } else { literal("The character @samp{^} only represents the beginning of a string when it appears:"); beginenum(); enum_item("\nAt the beginning of a regular expression"); enum_item("After an open-group, signified by "); if (options & RE_NO_BK_PARENS) { literal("@samp{(}"); } else { literal("@samp{\\(}"); } newline(); if (!(options & RE_LIMITED_OPS)) { if (options & RE_NEWLINE_ALT) enum_item("After a newline"); if (options & RE_NO_BK_VBAR ) enum_item("After the alternation operator @samp{|}"); else enum_item("After the alternation operator @samp{\\|}"); } endenum(); newpara(); literal("The character @samp{$} only represents the end of a string when it appears:"); beginenum(); enum_item("At the end of a regular expression"); enum_item("Before a close-group, signified by "); if (options & RE_NO_BK_PARENS) { literal("@samp{)}"); } else { literal("@samp{\\)}"); } if (!(options & RE_LIMITED_OPS)) { if (options & RE_NEWLINE_ALT) enum_item("Before a newline"); if (options & RE_NO_BK_VBAR) enum_item("Before the alternation operator @samp{|}"); else enum_item("Before the alternation operator @samp{\\|}"); } endenum(); } newpara(); if (!(options & RE_LIMITED_OPS) ) { if ((options & RE_CONTEXT_INDEP_OPS) && !(options & RE_CONTEXT_INVALID_OPS)) { literal("The characters @samp{*}, @samp{+} and @samp{?} are special anywhere in a regular expression. "); } else { if (options & RE_BK_PLUS_QM) literal("@samp{\\*}, @samp{\\+} and @samp{\\?} "); else literal("@samp{*}, @samp{+} and @samp{?} "); if (options & RE_CONTEXT_INVALID_OPS) { content("are special at any point in a regular expression except the following places, where they are not allowed:"); } else { content("are special at any point in a regular expression except:"); } beginenum(); enum_item("At the beginning of a regular expression"); enum_item("After an open-group, signified by "); if (options & RE_NO_BK_PARENS) { literal("@samp{(}"); } else { literal("@samp{\\(}"); } if (!(options & RE_LIMITED_OPS)) { if (options & RE_NEWLINE_ALT) enum_item("After a newline"); if (options & RE_NO_BK_VBAR) enum_item("After the alternation operator @samp{|}"); else enum_item("After the alternation operator @samp{\\|}"); } endenum(); } } newpara(); if (options & RE_INTERVALS) { if (options & RE_NO_BK_BRACES) { literal("Intervals are specified by @address@hidden and @address@hidden "); if (options & RE_INVALID_INTERVAL_ORD) { literal("Invalid intervals are treated as literals, for example @address@hidden is treated as @address@hidden"); } else { literal("Invalid intervals such as @address@hidden are not accepted. "); } } else { literal("Intervals are specified by @address@hidden and @address@hidden "); if (options & RE_INVALID_INTERVAL_ORD) { literal("Invalid intervals are treated as literals, for example @address@hidden is treated as @address@hidden"); } else { literal("Invalid intervals such as @address@hidden are not accepted. "); } } } newpara(); if (options & RE_NO_POSIX_BACKTRACKING) { content("Matching succeeds as soon as the whole pattern is matched, meaning that the result may not be the longest possible match. "); } else { content("The longest possible match is returned; this applies to the regular expression as a whole and (subject to this constraint) to subexpressions within groups. "); } newpara(); } static void menu(void) { int i, options; const char *name; output("@menu\n", 0); for (i=0; options = get_regex_type_flags(i), name=get_regex_type_name(i); ++i) { output("* ", 0); output(name, 0); content(" regular expression syntax"); output("::", 0); newline(); } output("@end menu\n", 0); } static void describe_all(const char *up) { const char *name, *next, *previous; int options; int i, parent; menu(); previous = ""; for (i=0; options = get_regex_type_flags(i), name=get_regex_type_name(i); ++i) { next = get_regex_type_name(i+1); if (NULL == next) next = ""; begin_subsection(name, next, previous, up); parent = get_regex_type_synonym(i); if (parent >= 0) { content("This is a synonym for "); content(get_regex_type_name(parent)); content("."); } else { describe_regex_syntax(options); } previous = name; } } int main (int argc, char *argv[]) { const char *up = ""; program_name = argv[0]; if (argc > 1) up = argv[1]; describe_all(up); return 0; }