[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
texinfo/tp Texinfo/Convert/Text.pm Texinfo/Conv...
From: |
Patrice Dumas |
Subject: |
texinfo/tp Texinfo/Convert/Text.pm Texinfo/Conv... |
Date: |
Sat, 06 Nov 2010 12:26:39 +0000 |
CVSROOT: /sources/texinfo
Module name: texinfo
Changes by: Patrice Dumas <pertusus> 10/11/06 12:26:38
Modified files:
tp/Texinfo/Convert: Text.pm Unicode.pm
Added files:
tp/t : accents.t
Log message:
Unit tests and fixes for accents with enable encoding set.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Text.pm?cvsroot=texinfo&r1=1.13&r2=1.14
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Unicode.pm?cvsroot=texinfo&r1=1.2&r2=1.3
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/t/accents.t?cvsroot=texinfo&rev=1.1
Patches:
Index: Texinfo/Convert/Text.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Text.pm,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -b -r1.13 -r1.14
--- Texinfo/Convert/Text.pm 6 Nov 2010 00:41:28 -0000 1.13
+++ Texinfo/Convert/Text.pm 6 Nov 2010 12:26:38 -0000 1.14
@@ -555,9 +555,9 @@
$ignored_types{$type} = 1;
}
-sub _accent_stack($)
+# find the innermost accent and the correspponding text
+sub _find_innermost_accent($)
{
-#unicode_to_eight_bit
my $current = shift;
my @accent_commands = ();
my $text = '';
@@ -574,8 +574,9 @@
}
push @accent_commands, $current->{'cmdname'};
my $arg = $current->{'args'}->[0];
+ # a construct like @'e without content
if (defined($arg->{'text'})) {
- return ($arg->{'text'}, address@hidden, $current);
+ return ($arg->{'text'}, $current, address@hidden);
}
if (!$arg->{'contents'}) {
print STDERR "BUG: No content in accent command\n";
@@ -583,6 +584,7 @@
print STDERR Texinfo::Convert::Texinfo::convert($current)."\n";
last;
}
+ # inside the braces of an accent
foreach my $content (@{$arg->{'contents'}}) {
if (!($content->{'extra'} and $content->{'extra'}->{'invalid_nesting'})
and !($content->{'cmdname'} and ($content->{'cmdname'} eq 'c'
@@ -605,7 +607,27 @@
}
last;
}
- return ($text, address@hidden, $current);
+ return ($text, $current, address@hidden);
+}
+
+# return the 8 bit, if it exists, and the unicode codepoint
+sub _eight_bit_and_unicode_point($$)
+{
+ my $char = shift;
+ my $encoding_map_name = shift;
+ my ($eight_bit, $codepoint);
+ if (ord($char) <= 128) {
+ # 7bit ascii characters, the same in every 8bit encodings
+ $eight_bit = uc(sprintf("%02x",ord($char)));
+ $codepoint = uc(sprintf("%04x",ord($char)));
+ } elsif (ord($char) <= hex(0xFFFF)) {
+ $codepoint = uc(sprintf("%04x",ord($char)));
+ if (exists($unicode_to_eight_bit{$encoding_map_name}->{$codepoint})) {
+ $eight_bit
+ = $unicode_to_eight_bit{$encoding_map_name}->{$codepoint};
+ }
+ }
+ return ($eight_bit, $codepoint);
}
sub eight_bit_accents($$$)
@@ -614,115 +636,96 @@
my $encoding = shift;
my $convert_accent = shift;
- my $debug = 0;
+ my $debug;
+ #$debug = 1;
+
+ my ($text, $innermost_accent, $stack) = _find_innermost_accent($current);
- my ($text, $stack, $innermost_accent) = _accent_stack($current);
+ print STDERR "INNERMOST: $innermost_accent->{'cmdname'}($text)\n"
+ if ($debug);
# accents are formatted and the intermediate results are kept, such
# that we can return the maximum of multiaccented letters that can be
# rendered with a given eight bit formatting.
my $accent = $innermost_accent;
- if ($debug) {
- print STDERR "INNERMOST: $innermost_accent->{'cmdname'}($text)\n";
- }
- my $current_result = Texinfo::Convert::Unicode::unicode_accent($text,
$accent);
- print STDERR 'PARTIAL_RESULTS: '.Encode::encode('utf8', $current_result) if
($debug);
+ my $current_result = $text;
+ my @results_stack;
- my @results_stack = ([$current_result, $accent]);
- if ($accent ne $current) {
- while ($accent->{'parent'}->{'parent'}) {
- $accent = $accent->{'parent'}->{'parent'};
- $current_result =
- Texinfo::Convert::Unicode::unicode_accent($current_result, $accent);
- print STDERR '|'.Encode::encode('utf8', $current_result) if ($debug);
+ while (1) {
+ $current_result
+ = Texinfo::Convert::Unicode::unicode_accent($current_result, $accent);
push @results_stack, [$current_result, $accent];
last if ($accent eq $current);
+ $accent = $accent->{'parent'}->{'parent'};
}
- }
- print STDERR "\n" if ($debug);
if ($debug) {
print STDERR "stack: ".join('|',@$stack)."\nPARTIAL_RESULATS_STACK:\n";
foreach my $partial_result (@results_stack) {
- print STDERR " -> ".Encode::encode('utf8',
$partial_result->[0])."|$partial_result->[1]->{'cmdname'}\n";
+ print STDERR " -> ".Encode::encode('utf8', $partial_result->[0])
+ ."|$partial_result->[1]->{'cmdname'}\n";
}
}
my $encoding_map_name
= $Texinfo::Commands::eight_bit_encoding_aliases{$encoding};
- my $eight_bit;
- my $result;
- my $accent_done;
# At this point we have the utf8 encoded results for the accent
# commands stack, with all the intermediate results.
# For each one we'll check if it is possible to encode it in the
# current eight bit output encoding table
+ my ($eight_bit, $dummy)
+ = _eight_bit_and_unicode_point($text, $encoding_map_name);
+ my $eight_bit_command_index = -1;
foreach my $partial_result (@results_stack) {
my $char = $partial_result->[0];
- my $new_eight_bit = '';
- my $new_codepoint;
-
- if (ord($char) <= 128) {
- # 7bit ascii characters, the same in every 8bit encodings
- $new_eight_bit = uc(sprintf("%02x",ord($char)));
- $new_codepoint = uc(sprintf("%04x",ord($char)));
- } elsif (ord($char) <= hex(0xFFFF)) {
- $new_codepoint = uc(sprintf("%04x",ord($char)));
- if (exists($unicode_to_eight_bit{$encoding_map_name}->{$new_codepoint}))
{
- $new_eight_bit
- = $unicode_to_eight_bit{$encoding_map_name}->{$new_codepoint};
- }
- }
+ my ($new_eight_bit, $new_codepoint) = _eight_bit_and_unicode_point($char,
+ $encoding_map_name);
if ($debug) {
my $eight_bit_txt = 'undef';
$eight_bit_txt = $eight_bit if (defined($eight_bit));
print STDERR "" . Encode::encode('utf8', $char) . "
($partial_result->[1]->{'cmdname'}), new_codepoint: $new_codepoint 8bit:
$new_eight_bit old:$eight_bit_txt\n";
}
- # no corresponding eight bit character found
- last if ($new_eight_bit eq '');
+ # no corresponding eight bit character found for a composed character
+ last if (!$new_eight_bit);
# in that case, the new eight bit character is the same than the one
- # found with one less character (and it isnt a @dotless{i}). It may
- # mean 2 things
+ # found with one less character (and it isn't a @dotless{i}). It may
+ # hapen in 2 case, both meaning that there is no corresponding 8bit char:
+ #
# -> there are 2 characters in accent. This could happen, for example
# if an accent that cannot be rendered is found and it leads to
# appending or prepending a character. For example this happens for
# @={@,address@hidden, where @,address@hidden is expanded to a 2
character:
# n with a tilde, followed by a ,
- # In nthat case, the additional utf8 accent is prepended, which
+ # In that case, the additional utf8 diacritic is appended, which
# means that it is composed with the , and leaves n with a tilde
# untouched.
- # -> ord(char) leads to the same for the more inner character.
- # this, for example, happens for @ubaraccent{a}, where ord(a) is
- # the same than ord(a with underbar).
- last if (defined($eight_bit) and (($new_eight_bit eq $eight_bit)
- and !($partial_result->[1]->{'cmdname'} eq 'dotless' and $char eq
'i')));
- $result = $partial_result->[0];
- $accent_done = $partial_result->[1];
+ # -> the diacritic is appended but the normal form doesn't lead
+ # to a composed character, such that the first character
+ # of the string is unchanged. This, for example, happens for
+ # @ubaraccent{a} since there is no composed accent with a and an
+ # underbar.
+ last if (($new_eight_bit eq $eight_bit)
+ and !($partial_result->[1]->{'cmdname'} eq 'dotless' and $char eq 'i'));
$eight_bit = $new_eight_bit;
+ $eight_bit_command_index++;
}
- my $accent_remaining = '';
- if (defined($accent_done)) {
- if ($accent_done ne $current) {
- $accent_remaining = $accent_done->{'parent'}->{'parent'};
- }
- } else {
- $accent_remaining = $innermost_accent;
- $result = $text;
- }
- print STDERR "initial: $current, remaining: $accent_remaining, result: "
+ # handle the remaining accents, that have not been converted to 8bit
+ # compatible unicode
+ my $result = $text;
+ $result = $results_stack[$eight_bit_command_index]->[0]
+ if ($eight_bit_command_index > -1);
+ for (my $remaining_accents = $eight_bit_command_index+1;
+ $remaining_accents <= $#results_stack; $remaining_accents++) {
+ $result = &$convert_accent($result,
$results_stack[$remaining_accents]->[1]);
+ print STDERR "REMAINING($remaining_accents) "
.Encode::encode('utf8', $result)."\n" if ($debug);
- if ($accent_remaining) {
- while (1) {
- $result = &$convert_accent($result, $accent_remaining);
- if ($accent_remaining eq $current) {
- return $result;
- }
- $accent_remaining = $accent_remaining->{'parent'}->{'parent'};
- }
}
+
+ # An important remark is that the final conversion to 8bit is left to
+ # perl.
return $result;
}
Index: Texinfo/Convert/Unicode.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Unicode.pm,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- Texinfo/Convert/Unicode.pm 6 Nov 2010 00:41:28 -0000 1.2
+++ Texinfo/Convert/Unicode.pm 6 Nov 2010 12:26:38 -0000 1.3
@@ -19,7 +19,7 @@
package Texinfo::Convert::Unicode;
-use 5.00405;
+use 5.006;
use strict;
use Encode;
Index: t/accents.t
===================================================================
RCS file: t/accents.t
diff -N t/accents.t
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ t/accents.t 6 Nov 2010 12:26:38 -0000 1.1
@@ -0,0 +1,83 @@
+use strict;
+
+use Test::More;
+BEGIN { plan tests => 18 };
+
+use Texinfo::Convert::Text;
+use Texinfo::Parser;
+
+ok(1, "modules loading");
+
+
+sub test_accent_stack ($)
+{
+ my $test = shift;
+ my $texi = $test->[0];
+ my $name = $test->[1];
+ my $reference = $test->[2];
+ my $parser = Texinfo::Parser::parser({'context' => 'preformatted'});
+ my $tree = $parser->parse_texi_text($texi);
+ my ($text, $innermost_accent, $stack) =
+ Texinfo::Convert::Text::_find_innermost_accent($tree->{'contents'}->[0]);
+ if (defined($reference)) {
+ ok ($reference eq join('|',($text, @$stack)), 'innermost '.$name);
+ } else {
+ print STDERR join('|',($text, @$stack))."\n";
+ }
+}
+
+foreach my $test (['@~e', 'simple', 'e|~'],
+ ['@address@hidden','dotless','i|~|dotless'],
+ ['@address@hidden comment
+e}', 'comment', 'e|~'],
+ ['@~{@@}','no_brace_command', '@|~'],
+ ['@address@hidden','no_brace_command', 'TeX|~'],
+ ) {
+ test_accent_stack($test);
+}
+
+sub test_enable_encoding ($)
+{
+ my $test = shift;
+ my $texi = $test->[0];
+ my $name = $test->[1];
+ my $reference = $test->[2];
+ my $parser = Texinfo::Parser::parser({'context' => 'preformatted'});
+ my $tree = $parser->parse_texi_text($texi);
+ my $result =
+ Texinfo::Convert::Text::eight_bit_accents($tree->{'contents'}->[0],
+ 'iso-8859-1', \&Texinfo::Convert::Text::ascii_accents);
+ if (defined($reference)) {
+ #ok (Encode::decode('iso-8859-1', $reference) eq $result, $name);
+ #ok ($reference eq Encode::encode('iso-8859-1', $result), $name);
+ is (Encode::encode('iso-8859-1', $result), $reference, $name);
+ } else {
+ my $ord = '';
+ foreach my $char (split '', $result) {
+ $ord .= ord($char).'-';
+ }
+ $ord =~ s/-$//;
+ print STDERR "$name ($ord)--> utf8: ".Encode::encode('utf8', $result).
+ " latin1: ".Encode::encode('iso-8859-1', $result)."\n";
+ }
+}
+
+# some come from encodings/weird_accents.texi
+foreach my $test (
+ ['@~e', 'no 8bit encoding', "e~"],
+ ['@~n', 'simple encoding', chr(241)],
+ ['@~{n}' , 'brace encoding', chr(241)],
+ ['@address@hidden', 'dotless', chr(238)],
+ ['@address@hidden', 'no 8bit dotless', 'i~'],
+ ['@address@hidden@dotless{i}}}', 'no 8 cplx dotless', 'i~='],
+ ['@address@hidden@dotless{i}}}', 'complex dotless',
chr(238).'='],
+ ['@={@,address@hidden', 'complex encoding', chr(241).',='],
+ ['@udotaccent{r}', 'udotaccent', '.r'],
+ ['@address@hidden', 'complex ubaraccent', 'a_='],
+ ['@address@hidden@`r}}', 'complex udotaccent', '.r`^' ],
+ ['@address@hidden@\'address@hidden', 'command in accent', '=']
+ ) {
+ test_enable_encoding($test);
+}
+
+1;
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- texinfo/tp Texinfo/Convert/Text.pm Texinfo/Conv...,
Patrice Dumas <=