From 49933c84012536388e1f9d0bc4070e377d824309 Mon Sep 17 00:00:00 2001
From: Maks Orlovich
Date: Tue, 1 Mar 2016 09:43:56 -0500
Subject: Parse <img srcset> attributes, they have image URLs.
* src/convert.h: Add link_noquote_html_p to permit rewriting URLs deep
inside attributes without adding extraneous quoting
* src/convert.c (convert_links): Honor link_noquote_html_p
* src/html-url.c (tag_handle_img): New function. Add srcset parsing.
diff --git a/src/convert.c b/src/convert.c
index df8d58d..509923e 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -308,7 +308,7 @@ convert_links (const char *file, struct urlpos *links)
char *quoted_newname = local_quote_string (newname,
link->link_css_p);
- if (link->link_css_p)
+ if (link->link_css_p || link->link_noquote_html_p)
p = replace_plain (p, link->size, fp, quoted_newname);
else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newname);
@@ -329,7 +329,7 @@ convert_links (const char *file, struct urlpos *links)
char *newname = convert_basename (p, link);
char *quoted_newname = local_quote_string (newname, link->link_css_p);
- if (link->link_css_p)
+ if (link->link_css_p || link->link_noquote_html_p)
p = replace_plain (p, link->size, fp, quoted_newname);
else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newname);
@@ -352,7 +352,7 @@ convert_links (const char *file, struct urlpos *links)
char *newlink = link->url->url;
char *quoted_newlink = html_quote_string (newlink);
- if (link->link_css_p)
+ if (link->link_css_p || link->link_noquote_html_p)
p = replace_plain (p, link->size, fp, newlink);
else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newlink);
diff --git a/src/convert.h b/src/convert.h
index b3cd196..e3ff6f0 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -69,6 +69,7 @@ struct urlpos {
unsigned int link_base_p :1; /* the url came from */
unsigned int link_inline_p :1; /* needed to render the page */
unsigned int link_css_p :1; /* the url came from CSS */
+ unsigned int link_noquote_html_p :1; /* from HTML, but doesn't need " */
unsigned int link_expect_html :1; /* expected to contain HTML */
unsigned int link_expect_css :1; /* expected to contain CSS */
diff --git a/src/html-url.c b/src/html-url.c
index 0743587..ab04204 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -56,6 +56,7 @@ typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
+DECLARE_TAG_HANDLER (tag_handle_img);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
@@ -105,7 +106,7 @@ static struct known_tag {
{ TAG_FORM, "form", tag_handle_form },
{ TAG_FRAME, "frame", tag_find_urls },
{ TAG_IFRAME, "iframe", tag_find_urls },
- { TAG_IMG, "img", tag_find_urls },
+ { TAG_IMG, "img", tag_handle_img },
{ TAG_INPUT, "input", tag_find_urls },
{ TAG_LAYER, "layer", tag_find_urls },
{ TAG_LINK, "link", tag_handle_link },
@@ -183,7 +184,8 @@ static const char *additional_attributes[] = {
"name", /* used by tag_handle_meta */
"content", /* used by tag_handle_meta */
"action", /* used by tag_handle_form */
- "style" /* used by check_style_attr */
+ "style", /* used by check_style_attr */
+ "srcset", /* used by tag_handle_img */
};
static struct hash_table *interesting_tags;
@@ -674,6 +676,88 @@ tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *
}
}
+/* Handle the IMG tag.  src/lowsrc/href are collected generically by
+   tag_find_urls; srcset needs its own parser because its value is a
+   comma-separated list of URLs, each optionally followed by a descriptor.
+   Every URL found is recorded as an inline link with link_noquote_html_p.  */
+
+static void
+tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx)
+{
+  int attrind;
+  char *srcset;
+  /* base_ind is relative to the input text; the rest index into srcset.  */
+  int base_ind, size, offset, url_start, url_end;
+
+  tag_find_urls (tagid, tag, ctx);
+
+  srcset = find_attr (tag, "srcset", &attrind);
+  if (!srcset)
+    return;
+
+  /* Make sure to line up base_ind with srcset[0], not outside quotes. */
+  base_ind = ATTR_POS (tag, attrind, ctx);
+  if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'')
+    ++base_ind;
+
+  size = strlen (srcset);
+  offset = 0;
+  while (offset < size)
+    {
+      bool has_descriptor = true;
+
+      /* Skip over initial whitespace and commas. Note there is no \v
+         in HTML5 whitespace. */
+      url_start = offset + strspn (srcset + offset, " \f\n\r\t,");
+      if (url_start == size)
+        return;
+
+      /* URL is any non-whitespace chars (including commas) - but with
+         trailing commas removed. */
+      url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t");
+      while ((url_end - 1) > url_start && srcset[url_end - 1] == ',')
+        {
+          has_descriptor = false;
+          --url_end;
+        }
+
+      if (url_end > url_start)
+        {
+          char *url_text = strdupdelim (srcset + url_start, srcset + url_end);
+          struct urlpos *up = append_url (url_text, base_ind + url_start,
+                                          url_end - url_start, ctx);
+          /* Guard against a NULL result before setting the flags.  */
+          if (up)
+            {
+              up->link_inline_p = 1;
+              up->link_noquote_html_p = 1;
+            }
+          xfree (url_text);
+        }
+
+      /* If the URL wasn't terminated by a , there may also be a descriptor
+         which we just skip. */
+      if (has_descriptor)
+        {
+          /* This is comma-terminated, except there may be one level of
+             parentheses escaping that. */
+          bool in_paren = false;
+          for (offset = url_end; offset < size; ++offset)
+            {
+              char c = srcset[offset];
+              if (c == '(')
+                in_paren = true;
+              else if (c == ')' && in_paren)
+                in_paren = false;
+              else if (c == ',' && !in_paren)
+                break;
+            }
+        }
+      else
+        offset = url_end;
+    }
+}
+
/* Dispatch the tag handler appropriate for the tag we're mapping
over. See known_tags[] for definition of tag handlers. */