[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5875-g44185361
From: |
Arnold Robbins |
Subject: |
[SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5875-g44185361 |
Date: |
Wed, 25 Dec 2024 02:58:18 -0500 (EST) |
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".
The branch, feature/minrx has been updated
via 44185361fcedbc956120f8cde11dc98de476a80b (commit)
from b4dd924566ad45d9db42467c66b3b723257f29f1 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=44185361fcedbc956120f8cde11dc98de476a80b
commit 44185361fcedbc956120f8cde11dc98de476a80b
Author: Arnold D. Robbins <arnold@skeeve.com>
Date: Wed Dec 25 09:58:01 2024 +0200
Actually add charset.[ch] files.
diff --git a/support/charset.c b/support/charset.c
new file mode 100644
index 00000000..41a4ebc6
--- /dev/null
+++ b/support/charset.c
@@ -0,0 +1,549 @@
+/*
+ * Copyright (C) 2023, 2024, Arnold David Robbins.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * âAS ISâ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wctype.h>
+#include <wchar.h>
+#include <locale.h>
+#include "charset.h" // for the charset_t typedef
+
+#define INITIAL_ALLOCATION 10
+#define MAX_CODE_POINT 0x10ffff // max Unicode code point
+
+typedef struct set_item {
+ enum set_item_type {
+ CTYPE_ITEM,
+ RANGE_ITEM,
+ } item_type;
+ union {
+ struct _ctype {
+ wctype_t wtype;
+ const char *type_name;
+ } c;
+ struct _range {
+ int32_t start, end;
+ } r;
+ } u;
+} set_item;
+#define wtype u.c.wtype
+#define type_name u.c.type_name
+#define start u.r.start
+#define end u.r.end
+struct _charset {
+ bool complemented; // For [^...] sets
+ bool no_newlines; // If \n can't be in the set
+ bool finalized; // No more changes possible
+ size_t nchars_inuse; // Number of characters used
+ size_t nchars_allocated; // Number of characters allocated
+ int32_t *chars; // Characters added to the set
+ size_t nelems; // Number of elements (items) in use
+ size_t allocated; // Number allocated
+ set_item *items; // Array of items
+};
+/* item_compare_for_searching --- compare two set_items */
+
+static int
+item_compare_for_searching(const void *k, const void *e)
+{
+ set_item *thekey = (set_item *) k;
+ set_item *elem = (set_item *) e;
+
+ assert(thekey->item_type == RANGE_ITEM && elem->item_type ==
RANGE_ITEM);
+
+ if (elem->start <= thekey->start && thekey->start <= elem->end)
+ return 0; // found it
+ else if (thekey->end < elem->start)
+ return -1;
+ else {
+ assert(thekey->start > elem->end);
+ return 1;
+ }
+}
+/* wint_compare --- compare two wint values for qsort */
+
+static int
+wint_compare(const void *l, const void *r)
+{
+ wint_t *left = (wint_t *) l;
+ wint_t *right = (wint_t *) r;
+
+ return *left - *right;
+}
+/* item_compare_for_sorting --- compare two set_items */
+
+static int
+item_compare_for_sorting(const void *l, const void *r)
+{
+ set_item *left = (set_item *) l;
+ set_item *right = (set_item *) r;
+
+ if (left->item_type == CTYPE_ITEM && right->item_type == CTYPE_ITEM) {
+ return left->wtype - right->wtype;
+ } else if (left->item_type == CTYPE_ITEM && right->item_type ==
RANGE_ITEM) {
+ return -1;
+ } else if (left->item_type == RANGE_ITEM && right->item_type ==
CTYPE_ITEM) {
+ return +1;
+ } else {
+ assert(left->item_type == RANGE_ITEM && right->item_type ==
RANGE_ITEM);
+ return left->start - right->start;
+ }
+}
+
+/* is_found --- return true if the character is found */
+
+static bool
+is_found(const charset_t *set, int32_t the_char)
+{
+ set_item *items = set->items;
+ int i;
+
+ if (set->items == NULL) // empty set, can't match
+ return false;
+ if (set->nelems == 1 && set->items[0].item_type == RANGE_ITEM) {
+ return (set->items[0].start <= the_char && the_char <=
set->items[0].end);
+ }
+ for (i = 0; i < set->nelems; i++) {
+ // linear search of ctype items
+ if (items[i].item_type == RANGE_ITEM)
+ break;
+
+ assert(items[i].item_type == CTYPE_ITEM);
+ if (iswctype(the_char, items[i].wtype))
+ return true;
+ }
+
+ if (i >= set->nelems)
+ return false;
+ assert(items[i].item_type == RANGE_ITEM);
+
+ // binary search to see if we have it
+ set_item *found;
+ set_item key;
+ key.item_type = RANGE_ITEM;
+ key.start = key.end = the_char;
+
+ found = bsearch(& key, set->items + i, set->nelems - i,
+ sizeof(set_item),
item_compare_for_searching);
+
+ return found != NULL;
+}
+/* finalize --- condense all the info into the final data structure */
+
+static void
+finalize(charset_t *set)
+{
+ assert(set != NULL);
+ int result = 0;
+
+ qsort(set->chars, set->nchars_inuse, sizeof(wint_t), wint_compare);
+ size_t i, j;
+ for (i = 0, j = 1; j < set->nchars_inuse; i++, j++) {
+ if (set->chars[i] == set->chars[j]) {
+ for (int k = j + 1; k < set->nchars_inuse; j++, k++) {
+ set->chars[j] = set->chars[k];
+ }
+ set->chars[j] = L'\0';
+
+ set->nchars_inuse--;
+ j = i;
+ i--; // keep searching from same spot
+ }
+ }
+ if (set->chars != NULL)
+ set->chars[set->nchars_inuse] = L'\0'; // not strictly
necessary, but doesn't hurt
+ size_t range_start, total;
+ range_start = total = 0;
+ for (i = 0, j = 1; j < set->nchars_inuse; i++, j++) {
+ if (set->chars[j] == set->chars[i] + 1) { // ab...
+ continue;
+ } else if (set->chars[j] > set->chars[i] + 1) {
+ // acd...
+ // push a and start next range at c
+ result = charset_add_range(set,
set->chars[range_start], set->chars[i]);
+ if (result != CSET_SUCCESS)
+ return;
+ total++;
+ range_start = j;
+ }
+ }
+ // Get any final range or character
+ if (set->nchars_inuse > 0 && range_start <= set->nchars_inuse - 1) {
+ result = charset_add_range(set, set->chars[range_start],
+ set->chars[set->nchars_inuse-1]);
+ if (result != CSET_SUCCESS)
+ return;
+ total++;
+ }
+ set->nchars_inuse = total;
+ // sort it
+ qsort(set->items, set->nelems,
+ sizeof(set_item), item_compare_for_sorting);
+
+ // condense it
+ set_item *items = set->items;
+ for (i = 0, j = 1; j < set->nelems; i++, j++) {
+ if ( items[i].item_type == CTYPE_ITEM
+ && items[j].item_type == CTYPE_ITEM
+ && items[i].wtype == items[j].wtype) {
+ free((void *) items[j].type_name);
+ for (int k = j + 1; k < set->nelems; j++, k++)
+ items[j] = items[k];
+
+ set->nelems--;
+ i--; // compensate for loop, continue checking at
current position
+ j = i + 1;
+ } else if (items[i].item_type != items[j].item_type) {
+ continue;
+ } else if (items[i].item_type == RANGE_ITEM) {
+ bool need_shift = false;
+ if (items[i].start == items[j].start && items[i].end ==
items[j].end) {
+ need_shift = true;
+ } else if (items[i].end + 1 == items[j].start) {
+ items[i].end = items[j].end;
+ need_shift = true;
+ } else if (items[i].start < items[j].start &&
items[i].end > items[j].end) {
+ need_shift = true;
+ } else if ( items[i].start <= items[j].start
+ && items[i].end > items[j].start
+ && items[j].end >= items[i].end) {
+ items[i].end = items[j].end;
+ need_shift = true;
+ }
+ if (need_shift) {
+ for (int k = j + 1; k < set->nelems; j++, k++)
+ items[j] = items[k];
+
+ set->nelems--;
+ i--; // compensate for loop, continue
checking at current position
+ j = i + 1;
+ }
+ // otherwise, just continue around the loop
+ }
+ }
+ set->finalized = true;
+}
+/* charset_create --- make a new charset_t and initialize it */
+
+charset_t *
+charset_create(int *errcode)
+{
+ if (errcode == NULL)
+ return NULL;
+
+ charset_t *set = (charset_t *) malloc(sizeof(charset_t));
+ if (set == NULL) {
+ *errcode = CSET_ESPACE;
+ return NULL;
+ }
+
+ memset(set, 0, sizeof(charset_t));
+
+ *errcode = CSET_SUCCESS;
+ return set;
+}
+/* charset_add_char --- add a single wide character to the set */
+
+int
+charset_add_char(charset_t *set, int32_t wc)
+{
+ if (set == NULL)
+ return CSET_EBADPTR;
+ if (set->finalized)
+ return CSET_EFROZEN;
+
+ if (wc < 0)
+ return CSET_ERANGE;
+
+ if (set->chars == NULL) {
+ set->chars = (int32_t *) malloc(sizeof(int32_t) *
INITIAL_ALLOCATION);
+ if (set->chars == NULL)
+ return CSET_ESPACE;
+
+ set->nchars_allocated = INITIAL_ALLOCATION;
+ set->nchars_inuse = 0;
+ } else if (set->nchars_inuse + 1 >= set->nchars_allocated) {
+ int new_amount = set->nchars_allocated * 2;
+ int32_t *new_data = (int32_t *) realloc(set->chars, new_amount
* sizeof(int32_t));
+
+ if (new_data == NULL)
+ return CSET_ESPACE;
+
+ memset(new_data + set->nchars_allocated, 0,
set->nchars_allocated * sizeof(int32_t));
+ set->nchars_allocated = new_amount;
+ set->chars = new_data;
+ }
+
+ set->chars[set->nchars_inuse++] = wc;
+ set->chars[set->nchars_inuse] = L'\0'; // make it into a string
+
+ return CSET_SUCCESS;
+}
+/* charset_add_range --- add a range item */
+
+int
+charset_add_range(charset_t *set, int32_t first, int32_t last)
+{
+ if (set == NULL)
+ return CSET_EBADPTR;
+ if (set->finalized)
+ return CSET_EFROZEN;
+
+ if (first < 0 || last < 0 || first > last)
+ return CSET_ERANGE;
+
+ if (set->items == NULL) {
+ set->items = (set_item *) malloc(sizeof(set_item) *
INITIAL_ALLOCATION);
+ if (set->items == NULL)
+ return CSET_ESPACE;
+
+ set->allocated = INITIAL_ALLOCATION;
+ set->nelems = 0;
+ } else if (set->nelems + 1 >= set->allocated) {
+ int new_amount = set->allocated * 2;
+ set_item *new_data = (set_item *) realloc(set->items,
new_amount * sizeof(set_item));
+
+ if (new_data == NULL)
+ return CSET_ESPACE;
+
+ memset(new_data + set->allocated, 0, set->allocated *
sizeof(set_item));
+ set->allocated = new_amount;
+ set->items = new_data;
+ }
+
+ set_item new_item;
+ new_item.item_type = RANGE_ITEM;
+ new_item.start = first;
+ new_item.end = last;
+ set->items[set->nelems++] = new_item;
+
+ return CSET_SUCCESS;
+}
+/* charset_invert --- mark charset to return success if requested character
not found */
+
+int
+charset_invert(charset_t *set)
+{
+ if (set == NULL)
+ return CSET_EBADPTR;
+ if (set->finalized)
+ return CSET_EFROZEN;
+
+ set->complemented = true;
+ return CSET_SUCCESS;
+}
+/* charset_set_no_newline --- set the value of the "no newlines" flag */
+
+int charset_set_no_newlines(charset_t *set, bool no_newlines)
+{
+ if (set == NULL)
+ return CSET_EBADPTR;
+ if (set->finalized)
+ return CSET_EFROZEN;
+
+ set->no_newlines = no_newlines;
+ return CSET_SUCCESS;
+}
+/* charset_add_cclass --- add a character class, like "alnum" */
+
+int
+charset_add_cclass(charset_t *set, const char *cclass)
+{
+ if (set == NULL)
+ return CSET_EBADPTR;
+ if (set->finalized)
+ return CSET_EFROZEN;
+
+ if (set->items == NULL) {
+ set->items = (set_item *) malloc(sizeof(set_item) *
INITIAL_ALLOCATION);
+ if (set->items == NULL)
+ return CSET_ESPACE;
+
+ set->allocated = INITIAL_ALLOCATION;
+ set->nelems = 0;
+ } else if (set->nelems + 1 >= set->allocated) {
+ int new_amount = set->allocated * 2;
+ set_item *new_data = (set_item *) realloc(set->items,
new_amount * sizeof(set_item));
+
+ if (new_data == NULL)
+ return CSET_ESPACE;
+
+ memset(new_data + set->allocated, 0, set->allocated *
sizeof(set_item));
+ set->allocated = new_amount;
+ set->items = new_data;
+ }
+
+ wctype_t the_type = wctype(cclass);
+ if (the_type == 0) // not a known class name
+ return CSET_ECTYPE;
+
+ const char *class_name = strdup(cclass);
+ if (class_name == NULL)
+ return CSET_ESPACE;
+
+ set_item new_item;
+ new_item.item_type = CTYPE_ITEM;
+
+ new_item.wtype = the_type;
+ new_item.type_name = class_name;
+ set->items[set->nelems++] = new_item;
+
+ return CSET_SUCCESS;
+}
+/* charset_add_equiv --- add an equivalence class */
+
+int
+charset_add_equiv(charset_t *set, int32_t equiv)
+{
+ if (set == NULL)
+ return CSET_EBADPTR;
+ if (set->finalized)
+ return CSET_EFROZEN;
+
+ if (equiv < 0)
+ return CSET_ERANGE;
+
+ wchar_t wcs_in[2];
+ wchar_t wcs[2];
+ wchar_t abuf[100], wbuf[100];
+ int result;
+
+ wcs_in[0] = equiv;
+ wcs_in[1] = 0;
+ wcsxfrm(abuf, wcs_in, 99);
+ wcs[1] = 0;
+ for (wchar_t u = 1; u <= MAX_CODE_POINT; ++u) {
+ wcs[0] = u;
+ wcsxfrm(wbuf, wcs, 99);
+ if (abuf[0] == wbuf[0])
+ if ((result = charset_add_char(set, u)) != CSET_SUCCESS)
+ return result;
+ }
+
+ return CSET_SUCCESS;
+}
+/* charset_add_collate --- add a collating sequence */
+
+int
+charset_add_collate(charset_t *set, const int32_t *collate)
+{
+ if (set == NULL)
+ return CSET_EBADPTR;
+ if (set->finalized)
+ return CSET_EFROZEN;
+
+ // only single character collating sequences allowed,
+ // at least right now
+ if (collate[1] != L'\0')
+ return CSET_ECOLLATE;
+
+ return charset_add_char(set, collate[0]);
+}
+/* charset_in_set --- see if a character is in the set */
+
+bool
+charset_in_set(const charset_t *set, int32_t the_char)
+{
+ if (set == NULL || the_char < 0)
+ return false;
+
+ if (! set->finalized) {
+ finalize((charset_t *) set);
+
+ if (! set->finalized) // finalize() failed
+ return false;
+ }
+
+ if (the_char == L'\n' && set->no_newlines && set->complemented)
+ return false;
+
+ bool found = is_found(set, the_char);
+ if (set->complemented)
+ found = ! found; // reverse sense of the match
+
+ return found;
+}
+/* charset_free --- free all storage */
+
+int
+charset_free(const charset_t *set)
+{
+ if (set == NULL)
+ return CSET_EBADPTR;
+ // no need to check for finalized
+
+ if (set->items != NULL) {
+ for (int i = 0; i < set->nelems; i++) {
+ if (set->items[i].item_type == CTYPE_ITEM)
+ free((void *) set->items[i].type_name);
+ else
+ break;
+ }
+ free((void *) set->items);
+ }
+
+ if (set->chars != NULL)
+ free((void *) set->chars);
+
+ free((void *) set);
+
+ return CSET_SUCCESS;
+}
+/* charset_dump --- dump out the data structures */
+
+void
+charset_dump(const charset_t *set, FILE *fp)
+{
+ static const char *boolval[] = {
+ "false",
+ "true",
+ };
+
+ if (set == NULL || fp == NULL)
+ return;
+
+ fprintf(fp, "complemented = %s\n", boolval[!! set->complemented]);
+ fprintf(fp, "no_newlines = %s\n", boolval[!! set->no_newlines]);
+ fprintf(fp, "finalized = %s\n", boolval[!! set->finalized]);
+
+ set_item *items = set->items;
+ for (int i = 0; i < set->nelems; i++) {
+ if (items[i].item_type == CTYPE_ITEM) {
+ fprintf(fp, "%3d. CTYPE: [:%s:]\n", i,
items[i].type_name);
+ continue;
+ }
+ assert(items[i].item_type == RANGE_ITEM);
+ fprintf(fp, "%3d. RANGE: start = L'%lc', end = L'%lc'\n",
+ i, items[i].start, items[i].end);
+ }
+ fflush(fp);
+}
diff --git a/support/charset.h b/support/charset.h
new file mode 100644
index 00000000..34d78d47
--- /dev/null
+++ b/support/charset.h
@@ -0,0 +1,66 @@
+#ifndef CHARSET_H
+#define CHARSET_H 1
+
+/*
+ * Copyright (C) 2023, 2024, Arnold David Robbins.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * âAS ISâ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _charset charset_t;
+
+// error code values:
+enum {
+ CSET_SUCCESS = 0, // No problems
+ CSET_EBADPTR, // NULL pointer received
+ CSET_EFROZEN, // Cannot add more characters to the set
+ CSET_ECOLLATE, // Corresponds to REG_ECOLLATE
+ CSET_ECTYPE, // Corresponds to REG_ECTYPE
+ CSET_ESPACE, // Corresponds to REG_ESPACE
+ CSET_ERANGE, // Corresponds to REG_ERANGE
+};
+charset_t *charset_create(int *errcode);
+int charset_add_char(charset_t *set, int32_t wc);
+int charset_add_range(charset_t *set, int32_t first, int32_t last);
+int charset_invert(charset_t *set);
+int charset_set_no_newlines(charset_t *set, bool no_newlines);
+int charset_add_cclass(charset_t *set, const char *cclass);
+int charset_add_equiv(charset_t *set, int32_t equiv);
+int charset_add_collate(charset_t *set, const int32_t *collate);
+bool charset_in_set(const charset_t *set, int32_t the_char);
+int charset_free(const charset_t *set);
+void charset_dump(const charset_t *set, FILE *fp);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* CHARSET_H */
-----------------------------------------------------------------------
Summary of changes:
support/charset.c | 549 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
support/charset.h | 66 +++++++
2 files changed, 615 insertions(+)
create mode 100644 support/charset.c
create mode 100644 support/charset.h
hooks/post-receive
--
gawk
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5875-g44185361,
Arnold Robbins <=