[6248] Paragraph.pm add

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[6248] Paragraph.pm add_text use split

From:	Gavin D. Smith
Subject:	[6248] Paragraph.pm add_text use split
Date:	Mon, 04 May 2015 22:52:17 +0000
Revision: 6248
          http://svn.sv.gnu.org/viewvc/?view=rev&root=texinfo&revision=6248
Author:   gavin
Date:     2015-05-04 22:52:16 +0000 (Mon, 04 May 2015)
Log Message:
-----------
Paragraph.pm add_text use split

Modified Paths:
--------------
    trunk/ChangeLog
    trunk/tp/Texinfo/Convert/Paragraph.pm

Modified: trunk/ChangeLog
===================================================================
--- trunk/ChangeLog     2015-05-04 22:06:03 UTC (rev 6247)
+++ trunk/ChangeLog     2015-05-04 22:52:16 UTC (rev 6248)
@@ -3,6 +3,15 @@
        * tp/Texinfo/Convert/Unicode.pm (string_width): Calculate string
        width using "unpack" function.
 
+       * tp/Texinfo/Convert/Paragraph.pm
+
+       (_add_next): Add new argument "$newlines_impossible" to eliminate
+       a regex match.  Don't access arguments we don't need.
+       (add_text): Use "split" function to split up text.
+       Try to eliminate regex matches for a newline character.
+       Save some flags in local variables.  Reorder a condition.  Add 
+       "o" flag to some regexes that used variables.
+
 2015-05-03  Karl Berry  <address@hidden>
 
        * pretest 5.9.91.

Modified: trunk/tp/Texinfo/Convert/Paragraph.pm
===================================================================
--- trunk/tp/Texinfo/Convert/Paragraph.pm       2015-05-04 22:06:03 UTC (rev 6247)
+++ trunk/tp/Texinfo/Convert/Paragraph.pm       2015-05-04 22:52:16 UTC (rev 6248)
@@ -181,19 +181,20 @@
 }
 
 # add a word and/or spaces and end of sentence.
-sub _add_next($;$$$$$)
+sub _add_next($;$$$$$$)
 {
-  my $paragraph = shift;
-  my $word = shift;
-  my $underlying_word = shift;
-  my $space = shift;
-  my $end_sentence = shift;
-  my $transparent = shift;
+  my $paragraph = $_[0];
+  my $word = $_[1];
+  my $space = $_[3];
+  my $end_sentence = $_[4];
+  my $transparent = $_[5];
   my $result = '';
 
-  $underlying_word = $word if (!defined($underlying_word));
+  if (defined($word)) {
+    my $underlying_word = $_[2];
+    my $newlines_impossible = $_[6];
+    $underlying_word = $word if (!defined($underlying_word));
 
-  if (defined($word)) {
     if (!defined($paragraph->{'word'})) {
       $paragraph->{'word'} = '';
       $paragraph->{'underlying_word'} = '';
@@ -212,7 +213,7 @@
     
     $paragraph->{'word'} .= $word;
     $paragraph->{'underlying_word'} .= $underlying_word unless($transparent);
-    if ($word =~ /\n/) {
+    if (!$newlines_impossible and $word =~ /\n/) {
       $result .= $paragraph->{'space'};
       $paragraph->{'space'} = '';
       $result .= $paragraph->{'word'};
@@ -321,21 +322,36 @@
   $paragraph->{'end_line_count'} = 0;
   my $result = '';
 
-  while ($text ne '') {
-    if ($paragraph->{'DEBUG'}) {
+  my $protect_spaces_flag = $paragraph->{'protect_spaces'};
+
+  my @segments = split
+/([^\S\x{202f}\x{00a0}]+)|(\p{InFullwidth})|((?:[^\s\p{InFullwidth}]|[\x{202f}\x{00a0}])+)/,
+    $text;
+
+  # Check now if a newline exists anywhere in the string to
+  # try to eliminate regex checks later.
+  my $newline_possible_flag = ($text =~ /\n/);
+
+  my $debug_flag = $paragraph->{'DEBUG'};
+  while (@segments) {
+    # $empty_segment should be an empty string; the other variables
+    # here were recognized as field separators by split.
+    my ($empty_segment, $spaces, $fullwidth_segment, $added_word)
+     = splice (@segments, 0, 4);
+
+    if ($debug_flag) {
       my $word = 'UNDEF';
       $word = $paragraph->{'word'} if (defined($paragraph->{'word'}));
       print STDERR "p ($paragraph->{'counter'}+$paragraph->{'word_counter'}) s 
`"._print_escaped_spaces($paragraph->{'space'})."', w `$word'\n";
       #print STDERR "TEXT: "._print_escaped_spaces($text)."|\n"
     }
     # \x{202f}\x{00a0} are non breaking spaces
-    if ($text =~ s/^([^\S\x{202f}\x{00a0}]+)//) {
-      my $spaces = $1;
+    if (defined $spaces) {
       $underlying_text =~ s/^([^\S\x{202f}\x{00a0}]+)//
         if defined($underlying_text);
-      print STDERR "SPACES($paragraph->{'counter'}) 
`"._print_escaped_spaces($spaces)."'\n" if ($paragraph->{'DEBUG'});
+      print STDERR "SPACES($paragraph->{'counter'}) 
`"._print_escaped_spaces($spaces)."'\n" if $debug_flag;
       #my $added_word = $paragraph->{'word'};
-      if ($paragraph->{'protect_spaces'}) {
+      if ($protect_spaces_flag) {
         $paragraph->{'word'} .= $spaces;
         $paragraph->{'underlying_word'} .= $spaces;
         $paragraph->{'word_counter'} += length($spaces);
@@ -368,9 +384,9 @@
       } else {
         $result .= _add_pending_word($paragraph);
         if ($paragraph->{'counter'} != 0) {
-          if (!$paragraph->{'frenchspacing'} 
-              and $paragraph->{'end_sentence'} 
-              and $paragraph->{'end_sentence'} > 0) {
+          if ($paragraph->{'end_sentence'} 
+              and $paragraph->{'end_sentence'} > 0
+              and !$paragraph->{'frenchspacing'}) {
             if (length($paragraph->{'space'}) >= 1 or length($spaces) > 1) {
               # more than one space, we can make sure tht there are only 
               # 2 spaces
@@ -386,9 +402,11 @@
               $paragraph->{'space'} = $new_space;
             }
           } else {
-            my $new_space = substr($spaces, 0, 1);
-            $new_space =~ s/^[\n\r]/ /;
-            $paragraph->{'space'} = $new_space;
+            $paragraph->{'space'} = substr($spaces, 0, 1);
+            if ($paragraph->{'space'} eq "\n"
+                or $paragraph->{'space'} eq "\r") {
+              $paragraph->{'space'} = " ";
+            }
           }
         }
       }
@@ -399,37 +417,11 @@
                       > $paragraph->{'max'}) {
         $result .= _cut_line($paragraph);
       }
-      if ($spaces =~ /\n/ and $paragraph->{'keep_end_lines'}) {
+      if ($newline_possible_flag
+          and $paragraph->{'keep_end_lines'} and $spaces =~ /\n/) {
         $result .= _end_line($paragraph);
       }
-    } elsif ($text =~ s/^(\p{InFullwidth})//) {
-      my $added = $1;
-      my $underlying_added;
-      if (defined($underlying_text)) {
-        $underlying_text =~ s/^(\p{InFullwidth})//;
-        $underlying_added = $1;
-      } else {
-        $underlying_added = $added;
-      }
-      
-      print STDERR "EAST_ASIAN\n" if ($paragraph->{'DEBUG'});
-      if (!defined($paragraph->{'word'})) {
-        $paragraph->{'word'} = '';
-        $paragraph->{'underlying_word'} = '';
-      }
-      $paragraph->{'word'} .= $added;
-      $paragraph->{'underlying_word'} .= $underlying_added; 
-      $paragraph->{'word_counter'} += 2;
-      if ($paragraph->{'counter'} != 0 and
-          $paragraph->{'counter'} + $paragraph->{'word_counter'} 
-                               > $paragraph->{'max'}) {
-        $result .= _cut_line($paragraph);
-      }
-      $result .= _add_pending_word($paragraph);
-      delete $paragraph->{'end_sentence'};
-      $paragraph->{'space'} = '';
-    } elsif ($text =~ s/^(([^\s\p{InFullwidth}]|[\x{202f}\x{00a0}])+)//) {
-      my $added_word = $1;
+    } elsif (defined $added_word) {
       my $underlying_added_word;
       if (defined($underlying_text)) {
         $underlying_text =~ s/^(([^\s\p{InFullwidth}]|[\x{202f}\x{00a0}])+)//;
@@ -438,14 +430,15 @@
         $underlying_added_word = $added_word;
       }
 
-      $result .= _add_next($paragraph, $added_word, $underlying_added_word);
+      $result .= _add_next($paragraph, $added_word, $underlying_added_word,
+                           undef, undef, undef, !$newline_possible_flag);
 
       # now check if it is considered as an end of sentence
       if (defined($paragraph->{'end_sentence'})
-          and $underlying_added_word =~ /^[$after_punctuation_characters]*$/) {
+          and $underlying_added_word =~ /^[$after_punctuation_characters]*$/o) 
{
         # do nothing in the case of a continuation of 
after_punctuation_characters
-      } elsif ($paragraph->{'underlying_word'} =~ 
/[$end_sentence_character][$after_punctuation_characters]*$/
-           and $paragraph->{'underlying_word'} !~ 
/[[:upper:]][$end_sentence_character$after_punctuation_characters]*$/) {
+      } elsif ($paragraph->{'underlying_word'} =~ 
/[$end_sentence_character][$after_punctuation_characters]*$/o
+           and $paragraph->{'underlying_word'} !~ 
/[[:upper:]][$end_sentence_character$after_punctuation_characters]*$/o) {
         if ($paragraph->{'frenchspacing'}) {
           $paragraph->{'end_sentence'} = -1;
         } else {
@@ -457,14 +450,31 @@
         print STDERR "delete END_SENTENCE($paragraph->{'end_sentence'}): 
text\n" 
           if (defined($paragraph->{'end_sentence'}) and $paragraph->{'DEBUG'});
       }
-    } else {
-      # Some characters are not handled by the cases above.
-      # For example, it happened for strange caracters that seems to be
-      # some special spaces.  It is a bit strange since the cases above 
-      # include a possibility and the complement.  Maybe a character 
-      # invalid in a given encoding?
-      #die "Unknown caracter leading $text";
-      last;
+    } elsif (defined $fullwidth_segment) {
+      my $underlying_added;
+      if (defined($underlying_text)) {
+        $underlying_text =~ s/^(\p{InFullwidth})//;
+        $underlying_added = $1;
+      } else {
+        $underlying_added = $fullwidth_segment;
+      }
+      
+      print STDERR "EAST_ASIAN\n" if ($paragraph->{'DEBUG'});
+      if (!defined($paragraph->{'word'})) {
+        $paragraph->{'word'} = '';
+        $paragraph->{'underlying_word'} = '';
+      }
+      $paragraph->{'word'} .= $fullwidth_segment;
+      $paragraph->{'underlying_word'} .= $underlying_added; 
+      $paragraph->{'word_counter'} += 2;
+      if ($paragraph->{'counter'} != 0 and
+          $paragraph->{'counter'} + $paragraph->{'word_counter'} 
+                               > $paragraph->{'max'}) {
+        $result .= _cut_line($paragraph);
+      }
+      $result .= _add_pending_word($paragraph);
+      delete $paragraph->{'end_sentence'};
+      $paragraph->{'space'} = '';
     }
   }
   return $result;
[Prev in Thread]
Current Thread
[Next in Thread]
[6248] Paragraph.pm add_text use split, Gavin D. Smith <=
Prev by Date: [6247] unicode string_width implementation
Next by Date: [6249] po
Previous by thread: [6247] unicode string_width implementation
Next by thread: [6249] po
Index(es):
- Date
- Thread