[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
lynx-dev optimization Re: internal links (patch11)
From: |
Leonid Pauzner |
Subject: |
lynx-dev optimization Re: internal links (patch11) |
Date: |
Tue, 5 Nov 2002 02:09:42 +0300 (MSK) |
One more patch, which optimizes parsing of large HTML documents with many anchors.
Sorry, I do not know the release schedule - I have sent too many small patches...
They may be picked up together if that is a problem. Looking forward to dev10.
* optimization for parsing HTML with many relative links (href="#fragment"):
HTAnchor_findChildAndLink() and HTML_start_element(), case HTML_A,
now avoid significant overhead when the link type is LINK_INTERNAL (resolving
against the base, lots of reallocations, parent lookup, etc. are all useless there).
Two functions are affected [HTAnchor.c, HTML.c]. The code works both with and
without the DONT_TRACK_INTERNAL_LINKS symbol.
This patch is applied on top of my previous patches;
it also undoes my patch #7 re: HTParseAnchor() in HTParse.[c,h]
diff -u -p -r LYNX2-8-.590/src/html.c LYNX2-8-/src/html.c
--- LYNX2-8-.590/src/html.c Sun Oct 6 17:43:28 2002
+++ LYNX2-8-/src/html.c Mon Nov 4 22:55:26 2002
@@ -661,7 +661,6 @@ PUBLIC void HTML_write ARGS3(HTStructure
* context an internal link makes no sense (e.g., IMG SRC=).
*/
-#ifndef DONT_TRACK_INTERNAL_LINKS
/* A flag is used to keep track of whether an "URL reference" encountered
had a real "URL" or not. In the latter case, it will be marked as
"internal". The flag is set before we start messing around with the
@@ -674,12 +673,6 @@ PUBLIC void HTML_write ARGS3(HTStructure
just an abbreviation. - kw */
#define INTERN_LT (HTLinkType *)(intern_flag ? LINK_INTERNAL : NULL)
-#else /* !DONT_TRACK_INTERNAL_LINKS */
-
-#define CHECK_FOR_INTERN(flag,s) /* do nothing */ ;
-#define INTERN_LT (HTLinkType *)NULL
-
-#endif /* DONT_TRACK_INTERNAL_LINKS */
#ifdef USE_COLOR_STYLE
# if !OPT_SCN
@@ -3020,78 +3013,80 @@ PRIVATE int HTML_start_element ARGS6(
value[HTML_A_NAME] && *value[HTML_A_NAME]) {
StrAllocCopy(id_string, value[HTML_A_NAME]);
}
- if (id_string) {
+ if (id_string)
TRANSLATE_AND_UNESCAPE_TO_STD(&id_string);
- if (*id_string == '\0') {
- FREE(id_string);
- }
- }
/*
* Handle the reference. - FM
*/
if (present && present[HTML_A_HREF]) {
-#ifndef DONT_TRACK_INTERNAL_LINKS
- if (present[HTML_A_ISMAP])
- intern_flag = FALSE;
- else
- CHECK_FOR_INTERN(intern_flag,value[HTML_A_HREF]);
-#endif
/*
- * Prepare to do housekeeping on the reference. - FM
+ * Set to know we are making the content bold.
*/
- if (!value[HTML_A_HREF] || *value[HTML_A_HREF] == '\0') {
- StrAllocCopy(href, me->node_anchor->address);
- } else if (*value[HTML_A_HREF] == '#') {
- StrAllocCopy(href, me->node_anchor->address);
- if (strlen(value[HTML_A_HREF]) > 1) {
- StrAllocCat(href, value[HTML_A_HREF]);
- }
- } else {
+ me->inBoldA = TRUE;
+
+ CHECK_FOR_INTERN(intern_flag,value[HTML_A_HREF]);
+ if (present[HTML_A_ISMAP]) /*???*/
+ intern_flag = FALSE;
+
+ if (intern_flag) {
+ /*** FAST WAY: ***/
StrAllocCopy(href, value[HTML_A_HREF]);
- }
- url_type = LYLegitimizeHREF(me, &href, TRUE, TRUE);
+ if (href && *href)
+ TRANSLATE_AND_UNESCAPE_TO_STD(&href);
- /*
- * Deal with our ftp gateway kludge. - FM
- */
- if (!url_type && !strncmp(href, "/foo/..", 7) &&
- (isFTP_URL(me->node_anchor->address) ||
- isFILE_URL(me->node_anchor->address))) {
- for (i = 0; (href[i] = href[i+7]) != 0; i++)
- ;
- }
+ } else {
+ /*
+ * Prepare to do housekeeping on the reference. - FM
+ */
+ if (!value[HTML_A_HREF] || *value[HTML_A_HREF] == '\0') {
+ StrAllocCopy(href, me->node_anchor->address);
+ } else if (*value[HTML_A_HREF] == '#') {
+ StrAllocCopy(href, me->node_anchor->address);
+ if (strlen(value[HTML_A_HREF]) > 1) {
+ StrAllocCat(href, value[HTML_A_HREF]);
+ }
+ } else {
+ StrAllocCopy(href, value[HTML_A_HREF]);
+ }
+ url_type = LYLegitimizeHREF(me, &href, TRUE, TRUE);
- /*
- * Set to know we are making the content bold.
- */
- me->inBoldA = TRUE;
+ /*
+ * Deal with our ftp gateway kludge. - FM
+ */
+ if (!url_type && !strncmp(href, "/foo/..", 7) &&
+ (isFTP_URL(me->node_anchor->address) ||
+ isFILE_URL(me->node_anchor->address))) {
+ for (i = 0; (href[i] = href[i+7]) != 0; i++)
+ ;
+ }
- /*
- * Check whether a base tag is in effect. - FM
- */
- if ((me->inBASE && *href != '\0' && *href != '#') &&
- (temp = HTParse(href, me->base_href, PARSE_ALL)) &&
- *temp != '\0')
/*
- * Use reference related to the base.
+ * Check whether a base tag is in effect. - FM
*/
- StrAllocCopy(href, temp);
- FREE(temp);
+ if ((me->inBASE && *href != '\0' && *href != '#') &&
+ (temp = HTParse(href, me->base_href, PARSE_ALL)) &&
+ *temp != '\0')
+ /*
+ * Use reference related to the base.
+ */
+ StrAllocCopy(href, temp);
+ FREE(temp);
- /*
- * Check whether to fill in localhost. - FM
- */
- LYFillLocalFileURL(&href,
- ((*href != '\0' && *href != '#' &&
- me->inBASE) ?
- me->base_href : me->node_anchor->address));
+ /*
+ * Check whether to fill in localhost. - FM
+ */
+ LYFillLocalFileURL(&href,
+ ((*href != '\0' && *href != '#' &&
+ me->inBASE) ?
+ me->base_href : me->node_anchor->address));
+ }
} else {
if (bold_name_anchors == TRUE) {
me->inBoldA = TRUE;
}
}
-#ifndef DONT_TRACK_INTERNAL_LINKS
+
if (present && present[HTML_A_TYPE] && value[HTML_A_TYPE]) {
StrAllocCopy(temp, value[HTML_A_TYPE]);
if (!intern_flag && href &&
@@ -3108,7 +3103,6 @@ PRIVATE int HTML_start_element ARGS6(
FREE(temp);
}
}
-#endif /* DONT_TRACK_INTERNAL_LINKS */
me->CurrentA = HTAnchor_findChildAndLink(
me->node_anchor, /* Parent */
diff -u -p -r LYNX2-8-.590/src/lymainlo.c LYNX2-8-/src/lymainlo.c
--- LYNX2-8-.590/src/lymainlo.c Sun Oct 6 17:43:28 2002
+++ LYNX2-8-/src/lymainlo.c Mon Nov 4 22:55:30 2002
@@ -158,6 +158,7 @@ PRIVATE int str_n_cmp(const char *p, con
#include <LYexit.h>
#include <LYLeaks.h>
+PUBLIC HTLinkType * LINK_INTERNAL = 0;
#ifndef DONT_TRACK_INTERNAL_LINKS
#define NO_INTERNAL_OR_DIFFERENT(c,n) TRUE
@@ -5233,6 +5234,12 @@ int mainloop NOARGS
unsigned int len;
int i;
int follow_col = -1, key_count = 0, last_key = 0;
+
+/* "internal" means "within the same document, with certainty".
+ It includes a space so it cannot conflict with any (valid) "TYPE"
+ attributes on A elements. [According to which DTD, anyway??] - kw
+ */
+ LINK_INTERNAL = HTAtom_for("internal link"); /* init */
/*
* curdoc.address contains the name of the file that is currently open.
diff -u -p -r LYNX2-8-.590/www/library/implemen/htanchor.h
LYNX2-8-/www/library/implemen/htanchor.h
--- LYNX2-8-.590/www/library/implemen/htanchor.h Fri Oct 18 03:56:38 2002
+++ LYNX2-8-/www/library/implemen/htanchor.h Mon Nov 4 22:55:04 2002
@@ -131,11 +131,8 @@ typedef struct _DocAddress {
BOOL safe;
} DocAddress;
-/* "internal" means "within the same document, with certainty".
- It includes a space so it cannot conflict with any (valid) "TYPE"
- attributes on A elements. [According to which DTD, anyway??] - kw */
-
-#define LINK_INTERNAL HTAtom_for("internal link")
+/* "internal" means "within the same document, with certainty". */
+extern HTLinkType * LINK_INTERNAL;
/* Create new or find old sub-anchor
** ---------------------------------
diff -u -p -r LYNX2-8-.590/www/library/implemen/htanchor.c
LYNX2-8-/www/library/implemen/htanchor.c
--- LYNX2-8-.590/www/library/implemen/htanchor.c Sun Oct 6 17:43:28 2002
+++ LYNX2-8-/www/library/implemen/htanchor.c Tue Nov 5 01:11:24 2002
@@ -70,34 +70,14 @@ PRIVATE HTParentAnchor * HTParentAnchor_
HTParentAnchor *newAnchor = typecalloc(HTParentAnchor);
if (newAnchor == NULL)
outofmem(__FILE__, "HTParentAnchor_new");
+ /* calloc: all pointers initialized to NULL */
+
newAnchor->parent = newAnchor;
- newAnchor->bookmark = NULL; /* Bookmark filename. - FM */
newAnchor->isISMAPScript = FALSE; /* Lynx appends ?0,0 if TRUE. - FM */
newAnchor->isHEAD = FALSE; /* HEAD request if TRUE. - FM */
newAnchor->safe = FALSE; /* Safe. - FM */
-#ifdef SOURCE_CACHE
- newAnchor->source_cache_file = NULL;
- newAnchor->source_cache_chunk = NULL;
-#endif
- newAnchor->FileCache = NULL; /* Path to a disk-cached copy. - FM */
- newAnchor->SugFname = NULL; /* Suggested filename. - FM */
- newAnchor->RevTitle = NULL; /* TITLE for a LINK with REV. -
FM */
- newAnchor->citehost = NULL; /* LINK REL=citehost - RDC */
- newAnchor->cache_control = NULL; /* Cache-Control. - FM */
newAnchor->no_cache = FALSE; /* no-cache? - FM */
- newAnchor->content_type = NULL; /* Content-Type. - FM */
- newAnchor->content_language = NULL; /* Content-Language. - FM */
- newAnchor->content_encoding = NULL; /* Compression algorithm. - FM */
- newAnchor->content_base = NULL; /* Content-Base. - FM */
- newAnchor->content_disposition = NULL; /* Content-Disposition. - FM */
- newAnchor->content_location = NULL; /* Content-Location. - FM */
- newAnchor->content_md5 = NULL; /* Content-MD5. - FM */
newAnchor->content_length = 0; /* Content-Length. - FM */
- newAnchor->date = NULL; /* Date. - FM */
- newAnchor->expires = NULL; /* Expires. - FM */
- newAnchor->last_modified = NULL; /* Last-Modified. - FM */
- newAnchor->ETag = NULL; /* ETag (HTTP/1.1 cache validator) */
- newAnchor->server = NULL; /* Server. - FM */
return(newAnchor);
}
@@ -255,6 +235,10 @@ PUBLIC HTChildAnchor * HTAnchor_findChil
}
+PRIVATE HTParentAnchor * HTAnchor_findAddress_nofragment PARAMS((
+ CONST DocAddress * newdoc));
+
+
/* Create or find a child anchor with a possible link
** --------------------------------------------------
**
@@ -270,30 +254,46 @@ PUBLIC HTChildAnchor * HTAnchor_findChil
{
HTChildAnchor * child = HTAnchor_findChild(parent, tag);
- CTRACE((tfp,"Entered HTAnchor_findChildAndLink\n"));
+ CTRACE((tfp,"Entered HTAnchor_findChildAndLink: tag=`%s',%s href=`%s'\n",
+ NonNull(tag),
+ (ltype == LINK_INTERNAL) ? " (internal link)" : "",
+ NonNull(href) ));
if (href && *href) {
- char *relative_to = HTAnchor_address((HTAnchor *)parent);
+ CONST char *fragment = NULL;
DocAddress parsed_doc;
- HTAnchor * dest;
+ HTParentAnchor * dest;
- parsed_doc.address = HTParse(href, relative_to, PARSE_ALL);
-#ifndef DONT_TRACK_INTERNAL_LINKS
- if (ltype && parent->post_data && ltype == LINK_INTERNAL) {
- /* for internal links, find a destination with the same
- post data if the source of the link has post data. - kw */
- parsed_doc.post_data = parent->post_data;
- parsed_doc.post_content_type = parent->post_content_type;
- } else
-#endif
- {
+ if (ltype == LINK_INTERNAL) {
+ dest = parent;
+ fragment = href+1;
+ } else {
+ char *relative_to = HTAnchor_address((HTAnchor *)parent);
+ /* hmm, it seems HTML.c always resolve href to absolute url??? */
+ parsed_doc.address = HTParse(href, relative_to,
+ PARSE_ACCESS | PARSE_HOST | PARSE_PATH | PARSE_PUNCTUATION);
parsed_doc.post_data = NULL;
parsed_doc.post_content_type = NULL;
+ parsed_doc.bookmark = NULL;
+ parsed_doc.isHEAD = FALSE;
+ parsed_doc.safe = FALSE;
+ dest = HTAnchor_findAddress_nofragment(&parsed_doc);
+ FREE(relative_to);
+ FREE(parsed_doc.address);
+ fragment = HTParse(href, "", PARSE_ANCHOR);
}
- parsed_doc.bookmark = NULL;
- parsed_doc.isHEAD = FALSE;
- parsed_doc.safe = FALSE;
- dest = HTAnchor_findAddress(&parsed_doc);
+
+ /*
+ ** [comment from HTAnchor_findAddress()]
+ ** If the address represents a sub-anchor, we load its parent,
+ ** then we create a child anchor within that document.
+ */
+ if (*fragment)
+ dest = (HTParentAnchor *)HTAnchor_findChild(dest, fragment);
+
+
+ if (ltype != LINK_INTERNAL)
+ FREE(fragment);
#define DUPLICATE_ANCHOR_NAME_WORKAROUND
@@ -307,7 +307,7 @@ PUBLIC HTChildAnchor * HTAnchor_findChil
CTRACE((tfp,
"*** Duplicate ChildAnchor %p named `%s' with %d links",
child, tag, child_links));
- if (dest == testdest1 && ltype == child->mainLink.type) {
+ if ((HTAnchor *)dest == testdest1 && ltype ==
child->mainLink.type) {
CTRACE((tfp,", same dest %p and type, keeping it\n",
testdest1));
} else {
@@ -318,13 +318,12 @@ PUBLIC HTChildAnchor * HTAnchor_findChil
}
}
#endif
- HTAnchor_link((HTAnchor *)child, dest, ltype);
- FREE(parsed_doc.address);
- FREE(relative_to);
+ HTAnchor_link((HTAnchor *)child, (HTAnchor *)dest, ltype);
}
return(child);
}
+
#ifdef LY_FIND_LEAKS
/*
** Function for freeing the adult hash table. - FM
@@ -376,7 +375,7 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
CONST DocAddress *, newdoc)
{
/* Anchor tag specified ? */
- char *tag = HTParseAnchor(newdoc->address);
+ char *tag = HTParse(newdoc->address, "", PARSE_ANCHOR);
CTRACE((tfp,"Entered HTAnchor_findAddress\n"));
@@ -384,7 +383,7 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
** If the address represents a sub-anchor, we recursively load its
** parent, then we create a child anchor within that document.
*/
- if (tag && *tag) {
+ if (*tag) {
DocAddress parsed_doc;
HTParentAnchor * foundParent;
HTChildAnchor * foundAnchor;
@@ -397,14 +396,22 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
parsed_doc.isHEAD = newdoc->isHEAD;
parsed_doc.safe = newdoc->safe;
- foundParent = (HTParentAnchor *)HTAnchor_findAddress(&parsed_doc);
+ foundParent = HTAnchor_findAddress_nofragment(&parsed_doc);
foundAnchor = HTAnchor_findChild (foundParent, tag);
FREE(parsed_doc.address);
FREE(tag);
return (HTAnchor *)foundAnchor;
- } else {
+ }
+ FREE(tag);
+ return (HTAnchor *)HTAnchor_findAddress_nofragment(newdoc);
+}
+
+/* The address has no anchor tag for sure.
+ */
+PRIVATE HTParentAnchor * HTAnchor_findAddress_nofragment ARGS1(
+ CONST DocAddress *, newdoc)
+{
/*
- ** If the address has no anchor tag,
** check whether we have this node.
*/
int hash;
@@ -412,8 +419,6 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
HTList *grownups;
HTParentAnchor * foundAnchor;
- FREE(tag);
-
/*
** Select list from hash table,
*/
@@ -448,7 +453,7 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
{
CTRACE((tfp, "Anchor %p with address `%s' already exists.\n",
(void *)foundAnchor, newdoc->address));
- return (HTAnchor *)foundAnchor;
+ return foundAnchor;
}
}
@@ -469,13 +474,14 @@ PUBLIC HTAnchor * HTAnchor_findAddress A
foundAnchor->isHEAD = newdoc->isHEAD;
foundAnchor->safe = newdoc->safe;
HTList_addObject (adults, foundAnchor);
- return (HTAnchor *)foundAnchor;
- }
+
+ return foundAnchor;
}
+
/* Create new or find old named anchor - simple form
** -------------------------------------------------
**
-** Like the previous one, but simpler to use for simple cases.
+** Like HTAnchor_findAddress, but simpler to use for simple cases.
** No post data etc. can be supplied. - kw
*/
PUBLIC HTAnchor * HTAnchor_findSimpleAddress ARGS1(
diff -u -p -r LYNX2-8-.590/www/library/implemen/htparse.c
LYNX2-8-/www/library/implemen/htparse.c
--- LYNX2-8-.590/www/library/implemen/htparse.c Sat Nov 2 22:11:02 2002
+++ LYNX2-8-/www/library/implemen/htparse.c Mon Nov 4 22:55:08 2002
@@ -485,29 +485,6 @@ PUBLIC char * HTParse ARGS3(
}
-PUBLIC char * HTParseAnchor ARGS1(
- CONST char *, aName)
-{
- if (!aName)
- return 0;
-
- if (!strncasecomp(aName, "http://", 7) ||
- !strncasecomp(aName, "file://", 7) ||
- !strncasecomp(aName, "https://", 8)) { /* fast way */
- CONST char * p;
- for (p = aName; *p && *p != '#'; p++)
- ;
- if (*p++) {
- char * res = 0;
- StrAllocCopy(res, p);
- return res;
- }
- return 0;
- }
- return HTParse(aName, "", PARSE_ANCHOR); /* may have unescaped hashes */
-}
-
-
/* Simplify a filename. HTSimplify()
** --------------------
**
diff -u -p -r LYNX2-8-.590/www/library/implemen/htparse.h
LYNX2-8-/www/library/implemen/htparse.h
--- LYNX2-8-.590/www/library/implemen/htparse.h Mon Nov 4 21:15:42 2002
+++ LYNX2-8-/www/library/implemen/htparse.h Mon Nov 4 22:55:04 2002
@@ -70,10 +70,6 @@ extern char * HTParse PARAMS((
CONST char * relatedName,
int wanted));
-extern char * HTParseAnchor PARAMS(( /* faster then HTParse() */
- CONST char * aName));
-
-
/* Simplify a filename. HTSimplify()
** --------------------
**
; To UNSUBSCRIBE: Send "unsubscribe lynx-dev" to address@hidden
- lynx-dev more memory optimizations (patch10), Leonid Pauzner, 2002/11/03
- Re: lynx-dev more memory optimizations (patch10), Thomas Dickey, 2002/11/03
- Re: lynx-dev optimization Re: internal links (patch11), Leonid Pauzner, 2002/11/04
- Re: lynx-dev optimization Re: internal links (patch11), Philip Webb, 2002/11/04
- Re: lynx-dev optimization Re: internal links (patch11), Stef Caunter, 2002/11/04
- Re: lynx-dev optimization Re: internal links (patch11), Frédéric L . W . Meunier, 2002/11/05
- Re: lynx-dev optimization Re: internal links (patch11), Philip Webb, 2002/11/05