[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: UTF-8 flag on strings for XS parser
From: Gavin D. Smith
Subject: branch master updated: UTF-8 flag on strings for XS parser
Date: Fri, 25 Feb 2022 12:39:33 -0500
This is an automated email from the git hooks/post-receive script.
gavin pushed a commit to branch master
in repository texinfo.
The following commit(s) were added to refs/heads/master by this push:
new eac8797369 UTF-8 flag on strings for XS parser
eac8797369 is described below
commit eac879736912372fba273b560a332e33c73bb2c8
Author: Gavin Smith <gavinsmith0123@gmail.com>
AuthorDate: Fri Feb 25 17:39:24 2022 +0000
UTF-8 flag on strings for XS parser
* tp/Texinfo/XS/parsetexi/api.c (newSVpv_utf8): New function.
(element_to_perl_hash, build_single_index_data, build_line_nr_hash)
(convert_error): Use it in many more places where the string being
created should be "Perl-internal". Suggestion from Patrice.
---
ChangeLog | 9 ++++
tp/Texinfo/XS/parsetexi/api.c | 60 +++++++++++-----------
tp/t/results/include/cpp_lines.pl | 2 +-
.../non_ascii_command_line/Chapteur.html | 3 +-
.../non_ascii_command_line/os\303\251.2" | 3 +-
5 files changed, 44 insertions(+), 33 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 68261ceb60..d6cbab2945 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2022-02-24 Gavin Smith <gavinsmith0123@gmail.com>
+
+ UTF-8 flag on strings for XS parser
+
+ * tp/Texinfo/XS/parsetexi/api.c (newSVpv_utf8): New function.
+ (element_to_perl_hash, build_single_index_data, build_line_nr_hash)
+ (convert_error): Use it in many more places where the string being
+ created should be "Perl-internal". Suggestion from Patrice.
+
2022-02-24 Patrice Dumas <pertusus@free.fr>
One function in Texinfo::Common to handle file name encoding
diff --git a/tp/Texinfo/XS/parsetexi/api.c b/tp/Texinfo/XS/parsetexi/api.c
index 1b06962e3e..3ed7314cbb 100644
--- a/tp/Texinfo/XS/parsetexi/api.c
+++ b/tp/Texinfo/XS/parsetexi/api.c
@@ -293,6 +293,19 @@ build_node_spec (NODE_SPEC_EXTRA *value)
return newRV_inc ((SV *)hv);
}
+/* Used to create a "Perl-internal" string that represents a sequence
+ of Unicode codepoints with no specific encoding. */
+static SV *
+newSVpv_utf8 (char *str, STRLEN len)
+{
+ SV *sv;
+ dTHX;
+
+ sv = newSVpv (str, len);
+ SvUTF8_on (sv);
+ return sv;
+}
+
/* Set E->hv and 'hv' on E's descendants. e->parent->hv is assumed
to already exist. */
static void
@@ -402,22 +415,11 @@ element_to_perl_hash (ELEMENT *e)
if (e->text.space > 0)
{
- sv = newSVpv (e->text.text, e->text.end);
+ sv = newSVpv_utf8 (e->text.text, e->text.end);
if (e->cmd != CM_value)
hv_store (e->hv, "text", strlen ("text"), sv, 0);
else
hv_store (e->hv, "type", strlen ("type"), sv, 0);
-
- SvUTF8_on (sv);
- /* The strings here have to be in UTF-8 to start with.
- This leads to an unnecessary round trip with "@documentencoding
- ISO-8859-1" for Info and plain text output, when we first convert the
- characters in the input file to UTF-8, and convert them back again for
- the output.
-
- The alternative is to leave the UTF-8 flag off, and hope that Perl
- interprets 8-bit encodings like ISO-8859-1 correctly. See
- "How does Perl store UTF-8 strings?" in "man perlguts". */
}
if (e->extra_number > 0)
@@ -483,7 +485,7 @@ element_to_perl_hash (ELEMENT *e)
case extra_string:
{ /* A simple string. */
char *value = (char *) f;
- STORE(newSVpv (value, 0));
+ STORE(newSVpv_utf8 (value, 0));
break;
}
case extra_integer:
@@ -505,15 +507,14 @@ element_to_perl_hash (ELEMENT *e)
{
if (f->contents.list[j]->text.end > 0)
{
- av_push (av,
- newSVpv (f->contents.list[j]->text.text,
- f->contents.list[j]->text.end));
+ SV *sv = newSVpv_utf8 (f->contents.list[j]->text.text,
+ f->contents.list[j]->text.end);
+ av_push (av, sv);
}
else
{
/* Empty strings permitted. */
- av_push (av,
- newSVpv ("", 0));
+ av_push (av, newSVpv ("", 0));
}
}
break;
@@ -577,8 +578,10 @@ element_to_perl_hash (ELEMENT *e)
hv_store (type, "content", strlen ("content"),
build_perl_array (&eft->content->contents), 0);
if (eft->normalized)
- hv_store (type, "normalized", strlen ("normalized"),
- newSVpv (eft->normalized, 0), 0);
+ {
+ SV *sv = newSVpv_utf8 (eft->normalized, 0);
+ hv_store (type, "normalized", strlen ("normalized"), sv, 0);
+ }
STORE(newRV_inc ((SV *)type));
break;
}
@@ -617,7 +620,7 @@ element_to_perl_hash (ELEMENT *e)
if (line_nr->macro)
{
- STORE("macro", newSVpv (line_nr->macro, 0));
+ STORE("macro", newSVpv_utf8 (line_nr->macro, 0));
}
else
STORE("macro", newSVpv ("", 0));
@@ -745,7 +748,7 @@ build_single_index_data (INDEX *i)
hv = (HV *) i->hv;
}
- STORE("name", newSVpv (i->name, 0));
+ STORE("name", newSVpv_utf8 (i->name, 0));
STORE("in_code", i->in_code ? newSViv(1) : newSViv(0));
if (i->merged_in)
@@ -767,7 +770,7 @@ build_single_index_data (INDEX *i)
hv_store (ultimate->contained_hv, i->name, strlen (i->name),
newSViv (1), 0);
- STORE("merged_in", newSVpv (ultimate->name, 0));
+ STORE("merged_in", newSVpv_utf8 (ultimate->name, 0));
if (i->contained_hv)
{
@@ -809,7 +812,7 @@ build_single_index_data (INDEX *i)
e = &i->index_entries[j];
entry = newHV ();
- STORE2("index_name", newSVpv (i->name, 0));
+ STORE2("index_name", newSVpv_utf8 (i->name, 0));
STORE2("index_at_command",
newSVpv (command_name(e->index_at_command), 0));
STORE2("index_type_command",
@@ -860,7 +863,7 @@ build_single_index_data (INDEX *i)
if (e->node)
STORE2("node", newRV_inc ((SV *)e->node->hv));
if (e->sortas)
- STORE2("sortas", newSVpv (e->sortas, 0));
+ STORE2("sortas", newSVpv_utf8 (e->sortas, 0));
/* Create ignored_chars hash. */
{
@@ -1124,12 +1127,12 @@ build_line_nr_hash (LINE_NR line_nr)
if (line_nr.macro)
{
hv_store (hv, "macro", strlen ("macro"),
- newSVpv (line_nr.macro, 0), 0);
+ newSVpv_utf8 (line_nr.macro, 0), 0);
}
else
{
hv_store (hv, "macro", strlen ("macro"),
- newSVpv ("", 0), 0);
+ newSVpv_utf8 ("", 0), 0);
}
return newRV_inc ((SV *) hv);
@@ -1147,8 +1150,7 @@ convert_error (int i)
e = error_list[i];
hv = newHV ();
- msg = newSVpv (e.message, 0);
- SvUTF8_on (msg);
+ msg = newSVpv_utf8 (e.message, 0);
hv_store (hv, "message", strlen ("message"), msg, 0);
hv_store (hv, "type", strlen ("type"),
diff --git a/tp/t/results/include/cpp_lines.pl b/tp/t/results/include/cpp_lines.pl
index 3b942f5488..2acca375d6 100644
--- a/tp/t/results/include/cpp_lines.pl
+++ b/tp/t/results/include/cpp_lines.pl
@@ -704,7 +704,7 @@ $result_trees{'cpp_lines'} = {
'cmdname' => 'documentlanguage',
'extra' => {
'spaces_before_argument' => ' ',
- 'text_arg' => 'là ng'
+ 'text_arg' => "l\x{e0}ng"
},
'line_nr' => {
'file_name' => 'accentêd',
diff --git a/tp/tests/formatting/res_parser/non_ascii_command_line/Chapteur.html b/tp/tests/formatting/res_parser/non_ascii_command_line/Chapteur.html
index 71f800ef1a..e7ee8b9acd 100644
--- a/tp/tests/formatting/res_parser/non_ascii_command_line/Chapteur.html
+++ b/tp/tests/formatting/res_parser/non_ascii_command_line/Chapteur.html
@@ -69,7 +69,8 @@ ul.mark-néni {list-style-type: "vàça"}
<img class="image" src="dîrectory/imàge.êxt" alt="âlt">
-
+<pre class="verbatim">In included téxt.
+</pre>
</div>
<hr>
<p>
diff --git "a/tp/tests/formatting/res_parser/non_ascii_command_line/os\303\251.2" "b/tp/tests/formatting/res_parser/non_ascii_command_line/os\303\251.2"
index 054aa9681a..eeddb28fa3 100644
--- "a/tp/tests/formatting/res_parser/non_ascii_command_line/os\303\251.2"
+++ "b/tp/tests/formatting/res_parser/non_ascii_command_line/os\303\251.2"
@@ -3,5 +3,4 @@ texi2any: warning: Destruktïw is not a valid language code
texi2any: warning: unknown variable from command line: Kommandöh
osé.texi:23: @include: could not find not_existïng.téxi
osé.texi:21: warning: @image file `dîrectory/imàge' (for HTML) not found, using `dîrectory/imàge.êxt'
-osé.texi:25: @verbatiminclude: could not find included_akçentêd.texi
-osé.texi:27: @verbatiminclude: could not find vi_not_existïng.téxi
+osé.texi:27: @verbatiminclude: could not find vi_not_existïng.téxi
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- branch master updated: UTF-8 flag on strings for XS parser,
Gavin D. Smith <=