texinfo/tp Texinfo/Convert/Text.pm Texinfo/Conv...

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
texinfo/tp Texinfo/Convert/Text.pm Texinfo/Conv...

From:	Patrice Dumas
Subject:	texinfo/tp Texinfo/Convert/Text.pm Texinfo/Conv...
Date:	Sat, 06 Nov 2010 12:26:39 +0000
CVSROOT:        /sources/texinfo
Module name:    texinfo
Changes by:     Patrice Dumas <pertusus>        10/11/06 12:26:38

Modified files:
        tp/Texinfo/Convert: Text.pm Unicode.pm 
Added files:
        tp/t           : accents.t 

Log message:
        Unit tests and fixes for accents with enable encoding set.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Text.pm?cvsroot=texinfo&r1=1.13&r2=1.14
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Unicode.pm?cvsroot=texinfo&r1=1.2&r2=1.3
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/t/accents.t?cvsroot=texinfo&rev=1.1

Patches:
Index: Texinfo/Convert/Text.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Text.pm,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -b -r1.13 -r1.14
--- Texinfo/Convert/Text.pm     6 Nov 2010 00:41:28 -0000       1.13
+++ Texinfo/Convert/Text.pm     6 Nov 2010 12:26:38 -0000       1.14
@@ -555,9 +555,9 @@
   $ignored_types{$type} = 1;
 }
 
-sub _accent_stack($)
+# find the innermost accent and the correspponding text
+sub _find_innermost_accent($)
 {
-#unicode_to_eight_bit
   my $current = shift;
   my @accent_commands = ();
   my $text = '';
@@ -574,8 +574,9 @@
     }
     push @accent_commands, $current->{'cmdname'};
     my $arg = $current->{'args'}->[0];
+    # a construct like @'e without content
     if (defined($arg->{'text'})) {
-      return ($arg->{'text'}, address@hidden, $current);
+      return ($arg->{'text'}, $current, address@hidden);
     }
     if (!$arg->{'contents'}) {
       print STDERR "BUG: No content in accent command\n";
@@ -583,6 +584,7 @@
       print STDERR Texinfo::Convert::Texinfo::convert($current)."\n";
       last;
     }
+    # inside the braces of an accent
     foreach my $content (@{$arg->{'contents'}}) {
       if (!($content->{'extra'} and $content->{'extra'}->{'invalid_nesting'})
          and !($content->{'cmdname'} and ($content->{'cmdname'} eq 'c'
@@ -605,7 +607,27 @@
     }
     last;
   }
-  return ($text, address@hidden, $current);
+  return ($text, $current, address@hidden);
+}
+
+# return the 8 bit, if it exists, and the unicode codepoint
+sub _eight_bit_and_unicode_point($$)
+{
+  my $char = shift;
+  my $encoding_map_name = shift;
+  my ($eight_bit, $codepoint);
+  if (ord($char) <= 128) { 
+    # 7bit ascii characters, the same in every 8bit encodings
+    $eight_bit = uc(sprintf("%02x",ord($char)));
+    $codepoint = uc(sprintf("%04x",ord($char)));
+  } elsif (ord($char) <= hex(0xFFFF)) {
+    $codepoint = uc(sprintf("%04x",ord($char)));
+    if (exists($unicode_to_eight_bit{$encoding_map_name}->{$codepoint})) {
+     $eight_bit 
+         = $unicode_to_eight_bit{$encoding_map_name}->{$codepoint};
+    }
+  }
+  return ($eight_bit, $codepoint);
 }
 
 sub eight_bit_accents($$$)
@@ -614,115 +636,96 @@
   my $encoding = shift;
   my $convert_accent = shift;
 
-  my $debug = 0;
+  my $debug;
+  #$debug = 1;
+
+  my ($text, $innermost_accent, $stack) = _find_innermost_accent($current);
 
-  my ($text, $stack, $innermost_accent) = _accent_stack($current);
+  print STDERR "INNERMOST: $innermost_accent->{'cmdname'}($text)\n"
+    if ($debug);
 
   # accents are formatted and the intermediate results are kept, such
   # that we can return the maximum of multiaccented letters that can be
   # rendered with a given eight bit formatting.
   my $accent = $innermost_accent;
-  if ($debug) {
-    print STDERR "INNERMOST: $innermost_accent->{'cmdname'}($text)\n";
-  }
-  my $current_result = Texinfo::Convert::Unicode::unicode_accent($text, 
$accent);
-  print STDERR 'PARTIAL_RESULTS: '.Encode::encode('utf8', $current_result) if 
($debug);
+  my $current_result = $text;
+  my @results_stack;
   
-  my @results_stack = ([$current_result, $accent]);
-  if ($accent ne $current) {
-    while ($accent->{'parent'}->{'parent'}) {
-      $accent = $accent->{'parent'}->{'parent'};
-      $current_result = 
-        Texinfo::Convert::Unicode::unicode_accent($current_result, $accent);
-      print STDERR '|'.Encode::encode('utf8', $current_result) if ($debug);
+  while (1) {
+    $current_result 
+      = Texinfo::Convert::Unicode::unicode_accent($current_result, $accent);
       push @results_stack, [$current_result, $accent];
       last if ($accent eq $current);
+    $accent = $accent->{'parent'}->{'parent'};
     }
-  }
-  print STDERR "\n" if ($debug);
 
   if ($debug) {
     print STDERR "stack: ".join('|',@$stack)."\nPARTIAL_RESULATS_STACK:\n";
     foreach my $partial_result (@results_stack) {
-      print STDERR "   -> ".Encode::encode('utf8', 
$partial_result->[0])."|$partial_result->[1]->{'cmdname'}\n";
+      print STDERR "   -> ".Encode::encode('utf8', $partial_result->[0])
+                            ."|$partial_result->[1]->{'cmdname'}\n";
     }
   }
 
   my $encoding_map_name 
        = $Texinfo::Commands::eight_bit_encoding_aliases{$encoding};
-  my $eight_bit;
-  my $result;
-  my $accent_done;
   # At this point we have the utf8 encoded results for the accent
   # commands stack, with all the intermediate results.
   # For each one we'll check if it is possible to encode it in the 
   # current eight bit output encoding table
+  my ($eight_bit, $dummy) 
+     = _eight_bit_and_unicode_point($text, $encoding_map_name);
+  my $eight_bit_command_index = -1;
   foreach my $partial_result (@results_stack) {
     my $char = $partial_result->[0];
-    my $new_eight_bit = '';
-    my $new_codepoint;
-
-    if (ord($char) <= 128) { 
-      # 7bit ascii characters, the same in every 8bit encodings
-      $new_eight_bit = uc(sprintf("%02x",ord($char)));
-      $new_codepoint = uc(sprintf("%04x",ord($char)));
-    } elsif (ord($char) <= hex(0xFFFF)) {
-      $new_codepoint = uc(sprintf("%04x",ord($char)));
-      if (exists($unicode_to_eight_bit{$encoding_map_name}->{$new_codepoint})) 
{
-         $new_eight_bit 
-            = $unicode_to_eight_bit{$encoding_map_name}->{$new_codepoint};
-      }
-    }
 
+    my ($new_eight_bit, $new_codepoint) = _eight_bit_and_unicode_point($char,
+                                                           $encoding_map_name);
     if ($debug) {
       my $eight_bit_txt = 'undef';
       $eight_bit_txt = $eight_bit if (defined($eight_bit));
       print STDERR "" . Encode::encode('utf8', $char) . " 
($partial_result->[1]->{'cmdname'}), new_codepoint: $new_codepoint 8bit: 
$new_eight_bit old:$eight_bit_txt\n";
     }
 
-    # no corresponding eight bit character found
-    last if ($new_eight_bit eq '');
+    # no corresponding eight bit character found for a composed character
+    last if (!$new_eight_bit);
 
     # in that case, the new eight bit character is the same than the one 
-    # found with one less character (and it isnt a @dotless{i}). It may
-    # mean 2 things
+    # found with one less character (and it isn't a @dotless{i}). It may
+    # hapen in 2 case, both meaning that there is no corresponding 8bit char:
+    #
     # -> there are 2 characters in accent. This could happen, for example
     #    if an accent that cannot be rendered is found and it leads to 
     #    appending or prepending a character. For example this happens for
     #    @={@,address@hidden, where @,address@hidden is expanded to a 2 
character:
     #    n with a tilde, followed by a , 
-    #    In nthat case, the additional utf8 accent is prepended, which 
+    #    In that case, the additional utf8 diacritic is appended, which 
     #    means that it is composed with the , and leaves n with a tilde 
     #    untouched. 
-    # -> ord(char) leads to the same for the more inner character.
-    #    this, for example, happens for @ubaraccent{a}, where ord(a) is
-    #    the same than ord(a with underbar).
-    last if (defined($eight_bit) and (($new_eight_bit eq $eight_bit)
-       and !($partial_result->[1]->{'cmdname'} eq 'dotless' and $char eq 
'i')));
-    $result = $partial_result->[0];
-    $accent_done = $partial_result->[1];
+    # -> the diacritic is appended but the normal form doesn't lead
+    #    to a composed character, such that the first character
+    #    of the string is unchanged. This, for example, happens for 
+    #    @ubaraccent{a} since there is no composed accent with a and an 
+    #    underbar.
+    last if (($new_eight_bit eq $eight_bit)
+       and !($partial_result->[1]->{'cmdname'} eq 'dotless' and $char eq 'i'));
     $eight_bit = $new_eight_bit;
+    $eight_bit_command_index++;
   }
-  my $accent_remaining = '';
-  if (defined($accent_done)) {
-    if ($accent_done ne $current) {
-      $accent_remaining = $accent_done->{'parent'}->{'parent'};
-    }
-  } else {
-    $accent_remaining = $innermost_accent;
-    $result = $text;
-  }
-  print STDERR "initial: $current, remaining: $accent_remaining, result: "
+  # handle the remaining accents, that have not been converted to 8bit
+  # compatible unicode
+  my $result = $text;
+  $result = $results_stack[$eight_bit_command_index]->[0] 
+    if ($eight_bit_command_index > -1);
+  for (my $remaining_accents = $eight_bit_command_index+1; 
+          $remaining_accents <= $#results_stack; $remaining_accents++) {
+    $result = &$convert_accent($result, 
$results_stack[$remaining_accents]->[1]);
+    print STDERR "REMAINING($remaining_accents) "
      .Encode::encode('utf8', $result)."\n" if ($debug);
-  if ($accent_remaining) {
-    while (1) {
-      $result = &$convert_accent($result, $accent_remaining);
-      if ($accent_remaining eq $current) {
-        return $result;
-      }
-      $accent_remaining = $accent_remaining->{'parent'}->{'parent'};
-    }
   }
+
+  # An important remark is that the final conversion to 8bit is left to
+  # perl.
   return $result;
 }
 

Index: Texinfo/Convert/Unicode.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Unicode.pm,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- Texinfo/Convert/Unicode.pm  6 Nov 2010 00:41:28 -0000       1.2
+++ Texinfo/Convert/Unicode.pm  6 Nov 2010 12:26:38 -0000       1.3
@@ -19,7 +19,7 @@
 
 package Texinfo::Convert::Unicode;
 
-use 5.00405;
+use 5.006;
 use strict;
 
 use Encode;

Index: t/accents.t
===================================================================
RCS file: t/accents.t
diff -N t/accents.t
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ t/accents.t 6 Nov 2010 12:26:38 -0000       1.1
@@ -0,0 +1,83 @@
+use strict;
+
+use Test::More;
+BEGIN { plan tests => 18 };
+
+use Texinfo::Convert::Text;
+use Texinfo::Parser;
+
+ok(1, "modules loading");
+
+
+sub test_accent_stack ($)
+{
+  my $test = shift;
+  my $texi = $test->[0];
+  my $name = $test->[1]; 
+  my $reference = $test->[2]; 
+  my $parser = Texinfo::Parser::parser({'context' => 'preformatted'});
+  my $tree = $parser->parse_texi_text($texi);
+  my ($text, $innermost_accent, $stack) = 
+    Texinfo::Convert::Text::_find_innermost_accent($tree->{'contents'}->[0]);
+  if (defined($reference)) {
+    ok ($reference eq join('|',($text, @$stack)), 'innermost '.$name);
+  } else {
+    print STDERR join('|',($text, @$stack))."\n";
+  }
+}
+
+foreach my $test (['@~e', 'simple', 'e|~'],
+          ['@address@hidden','dotless','i|~|dotless'],
+          ['@address@hidden comment
+e}', 'comment', 'e|~'],
+          ['@~{@@}','no_brace_command', '@|~'],
+          ['@address@hidden','no_brace_command', 'TeX|~'],
+        ) { 
+  test_accent_stack($test);
+}
+
+sub test_enable_encoding ($)
+{
+  my $test = shift;
+  my $texi = $test->[0];
+  my $name = $test->[1]; 
+  my $reference = $test->[2]; 
+  my $parser = Texinfo::Parser::parser({'context' => 'preformatted'});
+  my $tree = $parser->parse_texi_text($texi);
+  my $result = 
+       Texinfo::Convert::Text::eight_bit_accents($tree->{'contents'}->[0], 
+    'iso-8859-1', \&Texinfo::Convert::Text::ascii_accents);
+  if (defined($reference)) {
+    #ok (Encode::decode('iso-8859-1', $reference) eq $result, $name);
+    #ok ($reference eq Encode::encode('iso-8859-1', $result), $name);
+    is (Encode::encode('iso-8859-1', $result), $reference, $name);
+  } else {
+    my $ord = '';
+    foreach my $char (split '', $result) {
+      $ord .= ord($char).'-';
+    }
+    $ord =~ s/-$//;
+    print STDERR "$name ($ord)--> utf8: ".Encode::encode('utf8', $result).
+        " latin1: ".Encode::encode('iso-8859-1', $result)."\n";
+  }
+}
+
+# some come from encodings/weird_accents.texi
+foreach my $test (
+        ['@~e',                   'no 8bit encoding',    "e~"],
+        ['@~n',                   'simple encoding',     chr(241)],
+        ['@~{n}' ,                'brace encoding',      chr(241)],
+        ['@address@hidden',       'dotless',             chr(238)],
+        ['@address@hidden',       'no 8bit dotless',     'i~'],
+        ['@address@hidden@dotless{i}}}',   'no 8 cplx dotless',   'i~='],
+        ['@address@hidden@dotless{i}}}',   'complex dotless',     
chr(238).'='],
+        ['@={@,address@hidden',         'complex encoding',    chr(241).',='],
+        ['@udotaccent{r}',        'udotaccent',          '.r'],
+        ['@address@hidden',    'complex ubaraccent',  'a_='],
+        ['@address@hidden@`r}}',  'complex udotaccent',  '.r`^' ],
+        ['@address@hidden@\'address@hidden', 'command in accent',   '=']
+                 ) {
+  test_enable_encoding($test);
+}
+
+1;
[Prev in Thread]
Current Thread
[Next in Thread]
texinfo/tp Texinfo/Convert/Text.pm Texinfo/Conv..., Patrice Dumas <=
Prev by Date: texinfo/tp Texinfo/Commands.pm Texinfo/Parser.p...
Next by Date: texinfo/tp Texinfo/Convert/NodeNameNormalizatio...
Previous by thread: texinfo ChangeLog doc/texinfo.txi doc/texi2olda...
Next by thread: texinfo/tp Texinfo/Convert/NodeNameNormalizatio...
Index(es):
- Date
- Thread