From 49933c84012536388e1f9d0bc4070e377d824309 Mon Sep 17 00:00:00 2001
From: Maks Orlovich
Date: Tue, 1 Mar 2016 09:43:56 -0500
Subject: Parse <img srcset> attributes, they have image URLs.

* src/convert.h: Add link_noquote_html_p to permit rewriting URLs deep
inside attributes without adding extraneous quoting
* src/convert.c (convert_links): Honor link_noquote_html_p
* src/html-url.c (tag_handle_img): New function. Add srcset parsing.

diff --git a/src/convert.c b/src/convert.c
index df8d58d..509923e 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -308,7 +308,7 @@ convert_links (const char *file, struct urlpos *links)
             char *quoted_newname = local_quote_string (newname,
                                                        link->link_css_p);

-            if (link->link_css_p)
+            if (link->link_css_p || link->link_noquote_html_p)
               p = replace_plain (p, link->size, fp, quoted_newname);
             else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newname);
@@ -329,7 +329,7 @@ convert_links (const char *file, struct urlpos *links)
             char *newname = convert_basename (p, link);
             char *quoted_newname = local_quote_string (newname, link->link_css_p);

-            if (link->link_css_p)
+            if (link->link_css_p || link->link_noquote_html_p)
               p = replace_plain (p, link->size, fp, quoted_newname);
             else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newname);
@@ -352,7 +352,7 @@ convert_links (const char *file, struct urlpos *links)
             char *newlink = link->url->url;
             char *quoted_newlink = html_quote_string (newlink);

-            if (link->link_css_p)
+            if (link->link_css_p || link->link_noquote_html_p)
               p = replace_plain (p, link->size, fp, newlink);
             else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newlink);
diff --git a/src/convert.h b/src/convert.h
index b3cd196..e3ff6f0 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -69,6 +69,7 @@ struct urlpos {
   unsigned int link_base_p  :1; /* the url came from <base href=...> */
   unsigned int link_inline_p    :1; /* needed to render the page */
   unsigned int link_css_p   :1; /* the url came from CSS */
+  unsigned int link_noquote_html_p :1; /* from HTML, but doesn't need " */
   unsigned int link_expect_html :1; /* expected to contain HTML */
   unsigned int link_expect_css  :1; /* expected to contain CSS */

diff --git a/src/html-url.c b/src/html-url.c
index 0743587..ab04204 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -56,6 +56,7 @@ typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
 DECLARE_TAG_HANDLER (tag_find_urls);
 DECLARE_TAG_HANDLER (tag_handle_base);
 DECLARE_TAG_HANDLER (tag_handle_form);
+DECLARE_TAG_HANDLER (tag_handle_img);
 DECLARE_TAG_HANDLER (tag_handle_link);
 DECLARE_TAG_HANDLER (tag_handle_meta);

@@ -105,7 +106,7 @@ static struct known_tag {
   { TAG_FORM,    "form",        tag_handle_form },
   { TAG_FRAME,   "frame",       tag_find_urls },
   { TAG_IFRAME,  "iframe",      tag_find_urls },
-  { TAG_IMG,     "img",         tag_find_urls },
+  { TAG_IMG,     "img",         tag_handle_img },
   { TAG_INPUT,   "input",       tag_find_urls },
   { TAG_LAYER,   "layer",       tag_find_urls },
   { TAG_LINK,    "link",        tag_handle_link },
@@ -183,7 +184,8 @@ static const char *additional_attributes[] = {
   "name",                       /* used by tag_handle_meta */
   "content",                    /* used by tag_handle_meta */
   "action",                     /* used by tag_handle_form */
-  "style"                       /* used by check_style_attr */
+  "style",                      /* used by check_style_attr */
+  "srcset",                     /* used by tag_handle_img */
 };

 static struct hash_table *interesting_tags;
@@ -674,6 +676,97 @@ tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *
     }
 }

+/* Handle the IMG tag.  This requires special handling for the srcset attr,
+   while the traditional src/lowsrc/href attributes can be handled generically.
+
+   The srcset value is scanned as a comma-separated list of
+   "URL [descriptor]" candidates.  Each URL found is passed to append_url
+   and marked link_inline_p and link_noquote_html_p; descriptors are skipped.  */
+
+static void
+tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx) {
+  int attrind;
+  char *srcset;
+
+  /* Use the generic approach for the attributes without special syntax. */
+  tag_find_urls(tagid, tag, ctx);
+
+  srcset = find_attr (tag, "srcset", &attrind);
+  if (srcset)
+    {
+      /* These are relative to the input text. */
+      int base_ind = ATTR_POS (tag,attrind,ctx);
+      int size = strlen (srcset);
+
+      /* These are relative to srcset. */
+      int offset, url_start, url_end;
+
+      /* Make sure to line up base_ind with srcset[0], not outside quotes. */
+      if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'')
+        ++base_ind;
+
+      offset = 0;
+      while (offset < size)
+        {
+          bool has_descriptor = true;
+
+          /* Skip over initial whitespace and commas. Note there is no \v
+             in HTML5 whitespace. */
+          url_start = offset + strspn (srcset + offset, " \f\n\r\t,");
+
+          if (url_start == size)
+            return;
+
+          /* URL is any non-whitespace chars (including commas) - but with
+             trailing commas removed. */
+          url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t");
+          while ((url_end - 1) > url_start && srcset[url_end - 1] == ',')
+            {
+              has_descriptor = false;
+              --url_end;
+            }
+
+          if (url_end > url_start)
+            {
+              char *url_text = strdupdelim (srcset + url_start,
+                                            srcset + url_end);
+              struct urlpos *up = append_url (url_text, base_ind + url_start,
+                                              url_end - url_start, ctx);
+
+              /* append_url may return NULL (e.g. for a URL it cannot
+                 parse); only flag entries that were actually created. */
+              if (up)
+                {
+                  up->link_inline_p = 1;
+                  up->link_noquote_html_p = 1;
+                }
+              xfree (url_text);
+            }
+
+          /* If the URL wasn't terminated by a , there may also be a descriptor
+             which we just skip. */
+          if (has_descriptor)
+            {
+              /* This is comma-terminated, except there may be one level of
+                 parentheses escaping that. */
+              bool in_paren = false;
+              for (offset = url_end; offset < size; ++offset)
+                {
+                  char c = srcset[offset];
+                  if (c == '(')
+                    in_paren = true;
+                  else if (c == ')' && in_paren)
+                    in_paren = false;
+                  else if (c == ',' && !in_paren)
+                    break;
+                }
+            }
+          else
+            offset = url_end;
+        }
+    }
+}
+
 /* Dispatch the tag handler appropriate for the tag we're mapping
    over.  See known_tags[] for definition of tag handlers.  */
