[5999] better support for files produced with CR-LF line endings

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[5999] better support for files produced with CR-LF line endings

From:	Gavin D. Smith
Subject:	[5999] better support for files produced with CR-LF line endings
Date:	Mon, 29 Dec 2014 14:21:25 +0000
Revision: 5999
          http://svn.sv.gnu.org/viewvc/?view=rev&root=texinfo&revision=5999
Author:   gavin
Date:     2014-12-29 14:21:24 +0000 (Mon, 29 Dec 2014)
Log Message:
-----------
better support for files produced with CR-LF line endings

Modified Paths:
--------------
    trunk/ChangeLog
    trunk/info/filesys.c
    trunk/info/nodes.c
    trunk/info/nodes.h

Modified: trunk/ChangeLog
===================================================================
--- trunk/ChangeLog     2014-12-27 22:23:46 UTC (rev 5998)
+++ trunk/ChangeLog     2014-12-29 14:21:24 UTC (rev 5999)
@@ -1,3 +1,16 @@
+2014-12-29  Gavin Smith  <address@hidden>
+
+       * info/filesys.c, info/nodes.c (convert_eols): Function moved 
+       between files.  Arguments changed.
+       * info/nodes.c (adjust_nodestart): Take extra argument giving 
+       the slack to search for a node.
+       (find_node_from_tag): New function, wrapping adjust_nodestart.  
+       Call convert_eols on file if we don't find a node.
+       (info_node_of_tag): Call find_node_from_tag instead of 
+       adjust_nodestart.  Rearrange logic in function so as not to call 
+       itself when handling an anchor.
+       * info/nodes.h (N_EOLs_Converted): New flag.
+
 2014-12-27  Gavin Smith  <address@hidden>
 
        * info/nodes.h (TAGS_TABLE_BEG_LABEL): Define without trailing 

Modified: trunk/info/filesys.c
===================================================================
--- trunk/info/filesys.c        2014-12-27 22:23:46 UTC (rev 5998)
+++ trunk/info/filesys.c        2014-12-29 14:21:24 UTC (rev 5999)
@@ -269,43 +269,7 @@
   free (try_filename);
   return 0;
 }
-
-#if 0
-/* Given a chunk of text and its length, convert all CRLF pairs at every
-   end-of-line into a single Newline character.  Return the length of
-   produced text.
 
-   This is required because the rest of code is too entrenched in having
-   a single newline at each EOL; in particular, searching for various
-   Info headers and cookies can become extremely tricky if that assumption
-   breaks.
-
-   FIXME: this could also support Mac-style text files with a single CR
-   at the EOL, but what about random CR characters in non-Mac files?  Can
-   we afford converting them into newlines as well?  Maybe implement some
-   heuristics here, like in Emacs 20.
-
-   FIXME: is it a good idea to show the EOL type on the modeline? */
-static long
-convert_eols (char *text, long int textlen)
-{
-  register char *s = text;
-  register char *d = text;
-
-  while (textlen--)
-    {
-      if (*s == '\r' && textlen && s[1] == '\n')
-       {
-         s++;
-         textlen--;
-       }
-      *d++ = *s++;
-    }
-
-  return d - text;
-}
-#endif
-
 /* Read the contents of PATHNAME, returning a buffer with the contents of
    that file in it, and returning the size of that buffer in FILESIZE.
    If the file turns out to be compressed, set IS_COMPRESSED to non-zero.
@@ -355,19 +319,6 @@
       close (descriptor);
     }
 
-  /* EOL conversion is disabled because it makes the tag table for a file 
-     incorrect.  See the test in info/t/cr-tag-table.sh. */
-#if 0
-  /* Convert any DOS-style CRLF EOLs into Unix-style NL.
-     Seems like a good idea to have even on Unix, in case the Info
-     files are coming from some Windows system across a network.  */
-  fsize = convert_eols (contents, fsize);
-
-  /* EOL conversion can shrink the text quite a bit.  We don't
-     want to waste storage.  */
-  contents = xrealloc (contents, 1 + fsize);
-  contents[fsize] = '\0';
-#endif
   *filesize = fsize;
 
   return contents;

Modified: trunk/info/nodes.c
===================================================================
--- trunk/info/nodes.c  2014-12-27 22:23:46 UTC (rev 5998)
+++ trunk/info/nodes.c  2014-12-29 14:21:24 UTC (rev 5999)
@@ -866,7 +866,6 @@
                                       char **filename, char **nodename,
                                       char *filename_in, char *nodename_in);
 static void node_set_body_start (NODE *node);
-static int adjust_nodestart (FILE_BUFFER *file_buffer, NODE *tag);
 
 /* Return a pointer to a newly allocated NODE structure, with
    fields filled in. */
@@ -1082,6 +1081,35 @@
   return node;
 }
 
+
+/* Convert any CRLF pairs in the SOURCE file and place the converted buffer in 
+   DESTINATION.  DESTINATION->contents must be allocated on the heap and at 
+   least as big as SOURCE->contents, including a terminating null.  DESTINATION 
+   is allowed to be the same as SOURCE to convert in place. */
+void
+convert_eols (FILE_BUFFER *destination, FILE_BUFFER *source)
+{
+  register char *d = destination->contents;
+  register char *s = source->contents;
+
+  long textlen = source->filesize;
+  while (textlen--)
+    {
+      if (*s == '\r' && textlen && s[1] == '\n')
+        {
+          s++;
+          textlen--;
+        }
+      *d++ = *s++;
+    }
+  *d = '\0';
+
+  /* EOL conversion can shrink the text quite a bit.  We don't
+     want to waste storage.  */
+  destination->contents = xrealloc (destination->contents,
+                                    d - destination->contents + 1);
+}
+
 /* Magic number that RMS used to decide how much a tags table pointer could
    be off by.  I feel that it should be much smaller, like 4.  */
 #define DEFAULT_INFO_FUDGE 1000
@@ -1093,7 +1121,7 @@
    Set NODE->nodestart directly on the separator that precedes this node.
    If the node could not be found, return 0. */
 static int
-adjust_nodestart (FILE_BUFFER *fb, NODE *node)
+adjust_nodestart (FILE_BUFFER *fb, NODE *node, int slack)
 {
   long position = -1;
   SEARCH_BINDING s;
@@ -1112,15 +1140,15 @@
       position = find_node_in_binding (node->nodename, &s);
     }
 
-  if (position == -1) 
+  if (position == -1)
     {
       if (strict_node_location_p)
         return 0;
 
       /* Oh well, I guess we have to try to find it in a larger area. */
 
-      s.start -= DEFAULT_INFO_FUDGE;
-      s.end += DEFAULT_INFO_FUDGE;
+      s.start -= slack;
+      s.end += slack;
 
       if (s.start < 0)
         s.start = 0;
@@ -1132,17 +1160,101 @@
       /* If the node still couldn't be found, we lose big. */
       if (position == -1)
         return 0;
- 
+
       /* Set the flag in NODE->flags to say that the the tags table could
          need updating (if we used a tag to get here, that is). */
       if (node->flags & N_HasTagsTable)
         node->flags |= N_UpdateTags;
     }
 
+  /* Do we want this? */
+  /* TODO: Use TAG again to store the tags, and add an extra field to store
+     the original values. */
   node->nodestart = s.buffer + position - fb->contents;
   return 1;
 }
 
+/* Look in the contents of *FB_PTR for a node referred to with TAG.
+  
+   If we have to update the contents of the file, *PARENT and *FB_PTR can be 
+   changed to a different FILE_BUFFER. */
+static int
+find_node_from_tag (FILE_BUFFER **parent, FILE_BUFFER **fb_ptr, NODE *tag)
+{
+  int success;
+
+  FILE_BUFFER *fb = *fb_ptr;
+  int file_already_used = 1;
+  FILE_BUFFER *dest_fb;
+  int slack;
+
+  /* Start off with a small fudge to reduce chance of finding a node and then
+     later having to convert the EOL's, leaving us with the question of what to
+     do with the existing buffer and the nodes that refer to it. */
+  if (!(fb->flags & N_EOLs_Converted))
+    slack = DEFAULT_INFO_FUDGE;
+  else
+    slack = DEFAULT_INFO_FUDGE;
+
+  success = adjust_nodestart (fb, tag, slack);
+  if (success)
+    return success;
+
+  if (fb->flags & N_EOLs_Converted || strict_node_location_p)
+    return 0;
+
+  /* Convert EOL's.  If the Info file was produced under MS-Windows with
+     some versions of makeinfo, it's possible that it has CR-LF line endings 
+     with the CR bytes not counted in the tag table. */
+
+  /* TODO: Check if there are already nodes in windows from this file.  If
+     not, we can convert the buffer in place. */  
+
+  if (file_already_used)
+    {
+      FILE_BUFFER *new_fb = xmalloc (sizeof (FILE_BUFFER));
+
+      memcpy (new_fb, fb, sizeof (FILE_BUFFER));
+      new_fb->contents = xmalloc (fb->filesize + 1);
+
+      /* TODO: Copy and restore tags table. */
+
+      add_pointer_to_array (new_fb, info_loaded_files_index,
+              info_loaded_files, info_loaded_files_slots, 10);
+
+      dest_fb = new_fb;
+    }
+
+  convert_eols (dest_fb, fb);
+  dest_fb->flags |= N_EOLs_Converted;
+
+  success = adjust_nodestart (dest_fb, tag, DEFAULT_INFO_FUDGE);
+  if (success)
+    {
+      /* Stop the old record being used again. */
+      if (dest_fb != fb)
+        {
+          fb->fullpath = "";
+          fb->filename = "";
+
+          /* TODO: Could we also try to convert nodes referring to the old 
+             buffer, to save space? */
+        }
+
+      /* If file is split, leave PARENT as it is, otherwise update both FB_PTR 
+         and PARENT to the new file. */
+      if (*parent == *fb_ptr)
+        *parent = dest_fb;
+      *fb_ptr = dest_fb;
+      return success;
+    }
+  else
+    /* Throw the converted buffer away?  Or keep it to stop us ever having
+       to do the conversion step again? */
+
+  return 0;
+}
+
 /* Calculate the length of the node. */
 static void
 set_tag_nodelen (FILE_BUFFER *subfile, NODE *tag)
@@ -1164,13 +1276,20 @@
 {
   NODE *tag = *tag_ptr;
   NODE *node;
-  /* If not a split file, subfile == fb */
-  FILE_BUFFER *subfile;
+  int is_anchor;
+  NODE *anchor_tag;
+  int node_pos, anchor_pos;
+
+  FILE_BUFFER *parent; /* File containing tag table. */
+  FILE_BUFFER *subfile; /* File containing node. */
  
   if (!tag->subfile)
-    subfile = fb;
+    parent = subfile = fb;
   else
-    subfile = info_find_subfile (tag->subfile);
+    {
+      parent = fb;
+      subfile = info_find_subfile (tag->subfile);
+    }
 
   if (!subfile)
     return NULL;
@@ -1189,41 +1308,10 @@
 
   node = 0;
 
-  if (tag->nodelen != 0) /* If not an anchor. */
+  is_anchor = tag->nodelen == 0;
+ 
+  if (is_anchor)
     {
-      /* If TAG->nodelen hasn't been calculated yet, then we aren't
-         in a position to trust the entry pointer.  Adjust things so
-         that TAG->nodestart gets the exact address of the start of
-         the node separator which starts this node.  If we cannot
-         do that, the node isn't really here. */
-      if (tag->nodelen == -1)
-        {
-          if (!adjust_nodestart (subfile, tag))
-            return NULL; /* Node not found. */
-
-          set_tag_nodelen (subfile, tag);
-        }
-
-      tag->contents = subfile->contents + tag->nodestart;
-      tag->contents += skip_node_separator (tag->contents);
-      node_set_body_start (tag);
-
-      /* Read locations of references in node and similar.  Strip Info file
-         syntax from node if preprocess_nodes=On.  Adjust the offsets of
-         anchors that occur within the node.*/
-      node = scan_node_contents (fb, tag_ptr);
-      node->nodename = xstrdup (node->nodename);
-
-      /* We can't set this when tag table is built, because
-         if file is split, we don't know which of the sub-files
-         are compressed. */
-      if (subfile->flags & N_IsCompressed)
-        node->flags |= N_IsCompressed;
-    }
-  else /* anchor, return containing node */
-    {
-      int anchor_pos, node_pos;
-
       anchor_pos = tag_ptr - fb->tags;
 
       /* Look backwards in the tag table for the node preceding
@@ -1239,25 +1327,56 @@
       if (node_pos < 0)
         return NULL;
 
-      /* Get the actual node from the tag.  This is a recursive call, but
-         it can't recurse again, because we call it with a real node.  */
-      node = info_node_of_tag (fb, &fb->tags[node_pos]);
+      anchor_tag = tag;
+      tag = fb->tags[node_pos];
+      tag_ptr = &fb->tags[node_pos];
+    }
 
-      if (node)
-        {
-          /* Start displaying the node at the anchor position.  */
-          node->display_pos = tag->nodestart
-            - (node->nodestart
-               + skip_node_separator (subfile->contents
-                                      + fb->tags[node_pos]->nodestart));
+  /* Get the node. */
 
-          /* Otherwise an anchor at the end of a node ends up displaying at
-             the end of the last line of the node (way over on the right of
-             the screen), which looks wrong.  */
-          if (node->display_pos >= (unsigned long) node->nodelen)
-            node->display_pos = node->nodelen - 1;
-        }
+  /* If TAG->nodelen hasn't been calculated yet, then we aren't
+     in a position to trust the entry pointer.  Adjust things so
+     that TAG->nodestart gets the exact address of the start of
+     the node separator which starts this node.  If we cannot
+     do that, the node isn't really here. */
+  if (tag->nodelen == -1)
+    {
+      if (!find_node_from_tag (&parent, &subfile, tag))
+        return NULL; /* Node not found. */
+
+      set_tag_nodelen (subfile, tag);
     }
 
+  tag->contents = subfile->contents + tag->nodestart;
+  tag->contents += skip_node_separator (tag->contents);
+  node_set_body_start (tag);
+
+  /* Read locations of references in node and similar.  Strip Info file
+     syntax from node if preprocess_nodes=On.  Adjust the offsets of
+     anchors that occur within the node.*/
+  node = scan_node_contents (parent, tag_ptr);
+  node->nodename = xstrdup (node->nodename);
+
+  /* We can't set this when tag table is built, because
+     if file is split, we don't know which of the sub-files
+     are compressed. */
+  if (subfile->flags & N_IsCompressed)
+    node->flags |= N_IsCompressed;
+
+  if (is_anchor)
+    {
+      /* Start displaying the node at the anchor position.  */
+
+      node->display_pos = anchor_tag->nodestart
+        - (node->nodestart
+           + skip_node_separator (subfile->contents + tag->nodestart));
+
+      /* Otherwise an anchor at the end of a node ends up displaying at
+         the end of the last line of the node (way over on the right of
+         the screen), which looks wrong.  */
+      if (node->display_pos >= (unsigned long) node->nodelen)
+        node->display_pos = node->nodelen - 1;
+    }
+
   return node;
 }

Modified: trunk/info/nodes.h
===================================================================
--- trunk/info/nodes.h  2014-12-27 22:23:46 UTC (rev 5998)
+++ trunk/info/nodes.h  2014-12-29 14:21:24 UTC (rev 5999)
@@ -70,6 +70,7 @@
 #define N_IsIndex      0x200    /* An index node. */
 #define N_IsDir        0x400    /* A dir node. */
 #define N_Subfile      0x800    /* File buffer is a subfile of a split file. */
+#define N_EOLs_Converted 0x1000 /* CR bytes were stripped before LF. */
 
 /* String constants. */
 #define INFO_FILE_LABEL                 "File:"
[Prev in Thread]
Current Thread
[Next in Thread]
[5999] better support for files produced with CR-LF line endings, Gavin D. Smith <=
Prev by Date: [5998] better support of Info files with CR-LF line endings
Next by Date: [6000] Add misisng reference test files.
Previous by thread: [5998] better support of Info files with CR-LF line endings
Next by thread: [6000] Add misisng reference test files.
Index(es):
- Date
- Thread