[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#73188: [PATCH 3/3] PEG: add large string-peg patch
From: |
Ekaitz Zarraga |
Subject: |
bug#73188: [PATCH 3/3] PEG: add large string-peg patch |
Date: |
Sun, 22 Dec 2024 21:01:08 +0100 |
---
test-suite/tests/peg.test | 117 ++++++++++++++++++++++++++++++++++++--
1 file changed, 113 insertions(+), 4 deletions(-)
diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index d9e3e1b22..d8d047288 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -86,7 +86,7 @@
End <-- '*)'
C <- Begin N* End
N <- C / (!Begin !End Z)
-Z <- [^X-Z]") ;; Forbid some characters to test not-in-range
+Z <- .")
;; A short /etc/passwd file.
(define *etc-passwd*
@@ -126,9 +126,6 @@ SLASH < '/'")
(match-pattern C "(*blah*)")
(make-prec 0 8 "(*blah*)"
'((Begin "(*") "blah" (End "*)")))))
- (pass-if
- "simple comment with forbidden char"
- (not (match-pattern C "(*blYh*)")))
(pass-if
"simple comment padded"
(equal?
@@ -288,3 +285,115 @@ number <-- [0-9]+")
(equal? (eq-parse "1+1/2*3+(1+1)/2")
'(+ (+ 1 (* (/ 1 2) 3)) (/ (+ 1 1) 2)))))
+
+(define html-grammar
+"
+# Based on code from https://github.com/Fantom-Factory/afHtmlParser
+# 2014-2023 Steve Eynon. This code was originally released under the following
+# terms:
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL
+# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+# FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
+# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+# IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+# OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# PEG Rules for parsing well formed HTML 5 documents
+# https://html.spec.whatwg.org/multipage/syntax.html
+
+html <-- bom? blurb* doctype? blurb* xmlProlog? blurb* elem blurb*
+bom <-- \"\\uFEFF\"
+xmlProlog <-- \"<?xml\" (!\"?>\" .)+ \"?>\"
+
+# ---- Doctype ----
+
+doctype <-- \"<!DOCTYPE\" [ \\t\\n\\f\\r]+ [a-zA-Z0-9]+
(doctypePublicId / doctypeSystemId)* [ \\t\\n\\f\\r]* \">\"
+doctypePublicId <-- [ \\t\\n\\f\\r]+ \"PUBLIC\" [ \\t\\n\\f\\r]+
((\"\\\"\" [^\"]* \"\\\"\") / (\"'\" [^']* \"'\"))
+doctypeSystemId <-- [ \\t\\n\\f\\r]+ (\"SYSTEM\" [ \\t\\n\\f\\r]+)?
((\"\\\"\" [^\"]* \"\\\"\") / (\"'\" [^']* \"'\"))
+
+# ---- Elems ----
+
+elem <-- voidElem / rawTextElem / escRawTextElem /
selfClosingElem / normalElem
+voidElem <-- \"<\" voidElemName attributes \">\"
+rawTextElem <-- \"<\" rawTextElemName attributes \">\"
rawTextContent endElem
+escRawTextElem <-- \"<\" escRawTextElemName attributes \">\"
escRawTextContent endElem
+selfClosingElem <-- \"<\" elemName attributes \"/>\"
+normalElem <-- \"<\" elemName attributes \">\"
normalContent? endElem?
+endElem <-- \"</\" elemName \">\"
+
+elemName <-- [a-zA-Z] [^\\t\\n\\f />]*
+voidElemName <-- \"area\" / \"base\" / \"br\" / \"col\" / \"embed\" /
+ \"hr\" / \"img\" / \"input\" / \"keygen\" / \"link\" /
+ \"meta\" / \"param\" / \"source\" / \"track\" / \"wbr\"
+rawTextElemName <-- \"script\" / \"style\"
+escRawTextElemName <-- \"textarea\" / \"title\"
+
+rawTextContent <-- (!(\"</script>\" / \"</style>\") .)+
+escRawTextContent <-- ((!(\"</textarea>\" / \"</title>\" / \"&\") .)+ /
charRef)*
+normalContent <-- !\"</\" (([^<&]+ / charRef) / comment / cdata / elem)*
+
+# ---- Attributes ----
+
+attributes <-- (&[^/>] ([ \\t]+ / doubleQuoteAttr / singleQuoteAttr /
unquotedAttr / emptyAttr))*
+attrName <-- [^ \\t\\n\\r\\f\"'>/=]+
+emptyAttr <-- attrName+
+unquotedAttr <-- attrName [ \\t]* \"=\" [ \\t]* (charRef / [^
\\t\\n\\r\\f\"'=<>`&]+)+
+singleQuoteAttr <-- attrName [ \\t]* \"=\" [ \\t]* \"'\" (charRef /
[^'&]+)* \"'\"
+doubleQuoteAttr <-- attrName [ \\t]* \"=\" [ \\t]* \"\\\"\" (charRef /
[^\"&]+)* \"\\\"\"
+
+# ---- Character References ----
+
+charRef <-- &\"&\" (decNumCharRef / hexNumCharRef / namedCharRef /
borkedRef)
+namedCharRef <-- \"&\" [^;>]+ \";\"
+decNumCharRef <-- \"&#\" [0-9]+ \";\"
+hexNumCharRef <-- \"&#x\" [a-fA-F0-9]+ \";\"
+borkedRef <-- \"&\" &[ \\t]
+
+# ---- Misc ----
+
+cdata <-- \"<![CDATA[\" (!\"]]>\" .)+ \"]]>\"
+comment <-- \"<!--\" (!\"--\" .)+ \"-->\"
+blurb <-- [ \\t\\n\\f\\r]+ / comment")
+
+(define html-example "
+<!DOCTYPE html>
+<html>
+<head>
+ <title>Example Domain</title>
+ <meta charset=\"utf-8\" />
+ <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />
+ <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
+ <style type=\"text/css\">
+ body {
+ background-color: #f0f0f2;
+ margin: 0;
+ padding: 0;
+ }
+ </style>
+</head>
+
+<body>
+<div>
+ <h1>Example Domain</h1>
+ <p>This domain is for use in illustrative examples in documents. You may
+ use this domain in literature without prior coordination or asking for
+ permission.</p> <p><a href=\"https://www.iana.org/domains/example\">More
+ information...</a></p>
+</div>
+</body>
+</html>
+")
+
+(with-test-prefix "Parsing with complex grammars"
+ (eeval `(define-peg-string-patterns ,html-grammar))
+ (pass-if
+ "HTML parsing"
+ (equal?
+ (peg:tree (match-pattern html html-example))
+ '(html (blurb "\n") (doctype "<!DOCTYPE html>") (blurb "\n") (elem
(normalElem "<" (elemName "html") attributes ">" (normalContent "\n" (elem
(normalElem "<" (elemName "head") attributes ">" (normalContent "\n " (elem
(escRawTextElem "<" (escRawTextElemName "title") attributes ">"
(escRawTextContent "Example Domain") (endElem "</" (elemName "title") ">")))
"\n " (elem (selfClosingElem "<" (elemName "meta") (attributes " "
(doubleQuoteAttr (attrName "charset") "=\"utf-8\"") " ") "/>")) "\n " (elem
(selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr
(attrName "http-equiv") "=\"Content-type\"") " " (doubleQuoteAttr (attrName
"content") "=\"text/html; charset=utf-8\"") " ") "/>")) "\n " (elem
(selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr
(attrName "name") "=\"viewport\"") " " (doubleQuoteAttr (attrName "content")
"=\"width=device-width, initial-scale=1\"") " ") "/>")) "\n " (elem
(rawTextElem "<" (rawTextElemName "style") (attributes " " (doubleQuoteAttr
(attrName "type") "=\"text/css\"")) ">" (rawTextContent "\n body {\n
background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n }\n
") (endElem "</" (elemName "style") ">"))) "\n") (endElem "</" (elemName
"head") ">"))) "\n\n" (elem (normalElem "<" (elemName "body") attributes ">"
(normalContent "\n" (elem (normalElem "<" (elemName "div") attributes ">"
(normalContent "\n " (elem (normalElem "<" (elemName "h1") attributes ">"
(normalContent "Example Domain") (endElem "</" (elemName "h1") ">"))) "\n "
(elem (normalElem "<" (elemName "p") attributes ">" (normalContent "This domain
is for use in illustrative examples in documents. You may\n use this domain
in literature without prior coordination or asking for\n permission.")
(endElem "</" (elemName "p") ">"))) " " (elem (normalElem "<" (elemName "p")
attributes ">" (normalContent (elem (normalElem "<" (elemName "a") (attributes
" " (doubleQuoteAttr (attrName "href")
"=\"https://www.iana.org/domains/example\"")) ">" (normalContent "More\n
information...") (endElem "</" (elemName "a") ">")))) (endElem "</" (elemName
"p") ">"))) "\n") (endElem "</" (elemName "div") ">"))) "\n") (endElem "</"
(elemName "body") ">"))) "\n") (endElem "</" (elemName "html") ">"))) (blurb
"\n")))))
--
2.46.0