Hi Ludovic,
Thanks for the review
On 2024-10-20 12:10, Ludovic Courtès wrote:
+(define (Primary->defn lst for-syntax)
+ (let ((value (second lst)))
+ (case (car value)
+ ('DOT #'peg-any)
+ ('Identifier (Identifier->defn value for-syntax))
+ ('Expression (Expression->defn value for-syntax))
+ ('Literal (Literal->defn value for-syntax))
+ ('Class (Class->defn value for-syntax)))))
I get these compiler warnings:
--8<---------------cut here---------------start------------->8---
ice-9/peg/string-peg.scm:258:7: warning: duplicate datum quote in clause ((quote NOT)
(quasisyntax (not-followed-by (unsyntax (Suffix->defn (third lst) for-syntax))))) of
case expression (case (car suffix) ((quote AND) (quasisyntax (followed-by (unsyntax
(Suffix->defn (third lst) for-syntax))))) ((quote NOT) (quasisyntax (not-followed-by
(unsyntax (Suffix->defn (third lst) for-syntax))))) (else (Suffix->defn suffix
for-syntax)))
ice-9/peg/string-peg.scm:277:9: warning: duplicate datum quote in clause
((quote STAR) (quasisyntax (* (unsyntax out)))) of case expression (case (caar
extra) ((quote QUESTION) (quasisyntax (? (unsyntax out)))) ((quote STAR)
(quasisyntax (* (unsyntax out)))) ((quote PLUS) (quasisyntax (+ (unsyntax
out)))))
ice-9/peg/string-peg.scm:278:9: warning: duplicate datum quote in clause
((quote PLUS) (quasisyntax (+ (unsyntax out)))) of case expression (case (caar
extra) ((quote QUESTION) (quasisyntax (? (unsyntax out)))) ((quote STAR)
(quasisyntax (* (unsyntax out)))) ((quote PLUS) (quasisyntax (+ (unsyntax
out)))))
ice-9/peg/string-peg.scm:284:7: warning: duplicate datum quote in clause ((quote Identifier)
(Identifier->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression)
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax))
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value
for-syntax)))
ice-9/peg/string-peg.scm:285:7: warning: duplicate datum quote in clause ((quote Expression)
(Expression->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression)
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax))
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value
for-syntax)))
ice-9/peg/string-peg.scm:286:7: warning: duplicate datum quote in clause ((quote Literal)
(Literal->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression)
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax))
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value
for-syntax)))
ice-9/peg/string-peg.scm:287:7: warning: duplicate datum quote in clause ((quote NotInClass)
(NotInClass->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression)
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax))
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value
for-syntax)))
ice-9/peg/string-peg.scm:288:7: warning: duplicate datum quote in clause ((quote Class)
(Class->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression)
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax))
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value
for-syntax)))
--8<---------------cut here---------------end--------------->8---
And indeed, the correct syntax is:
(case value (DOT …) (Identifier …) …)
or:
(match value ('DOT …) ('Identifier …) …)
The former returns *unspecified* when passed a value not matched by any
clause, whereas the latter throws an error.
So I would recommend ‘match’.
I'm always in doubt with this but the thing worked.
I always check in the internet how to do the case, and I always get the
warning... my bad!
At any rate, that makes me wonder whether this code path is tested at
all. As written, ‘Primary->defn’ would always return *unspecified*.
Is there a test we could add?
I made some tests and it works, maybe unexpectedly:
scheme@(guile-user)> (define a 'b)
scheme@(guile-user)> (case a ('c 3)('b 1)('d 2))
;;; <stdin>:12:15: warning: duplicate datum quote in clause ((quote b)
1) of case expression (case a ((quote c) 3) ((quote b) 1) ((quote d) 2))
;;; <stdin>:12:21: warning: duplicate datum quote in clause ((quote d)
2) of case expression (case a ((quote c) 3) ((quote b) 1) ((quote d) 2))
$8 = 1
While your proposal doesn't:
scheme@(guile-user)> (case a (c 3)(b 1)(d 2))
While compiling expression:
Syntax error:
unknown file:14:8: case: invalid clause in subform (c 3) of (case a (c
3) (b 1) (d 2))
I tested further and this is what it should be:
scheme@(guile-user)> (case a ((b) 1)((d) 2))
$1 = 1
I used match instead as you proposed and made it all work with no
warnings and better error reporting.
+(define (Range->defn lst for-syntax)
(cond
- ((list? el)
- (cond
- ((eq? (car el) 'peg-literal)
- (peg-literal->defn el for-syntax))
- ((eq? (car el) 'peg-charclass)
- (peg-charclass->defn el for-syntax))
- ((eq? (car el) 'peg-nonterminal)
- (datum->syntax for-syntax (string->symbol (cadr el))))))
- ((string? el)
+ ((= 2 (length lst))
+ (second (second lst)))
+ ((= 3 (length lst))
+ #`(range
+ #,(Char->defn (second lst) for-syntax)
+ #,(Char->defn (third lst) for-syntax)))))
Keep in mind that ‘length’ is O(N), and that car/first, second/cadr are
frowned upon. I would write it as:
(match lst
((x y) y)
((x y z) #`(range …)))
Yeah, I was very bad at `match` when I wrote this thing :(
But! Now I spent some hours with it and rewrote the thing for `match`!
(Ideally with identifiers more meaningful than x, y, and z. :-))
;; Transforms the nonterminals defined in the PEG parser written as a PEG to
the nonterminals defined in the PEG parser written with S-expressions.
(define (grammar-transform x)
@@ -69,7 +77,7 @@
(peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg)
peg-as-peg)))
(tree-map
grammar-transform
- (peg:tree (match-pattern grammar (@@ (ice-9 peg) peg-as-peg)))))))
+ (peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg)
peg-as-peg)))))))
What happened to the ‘grammar’ binding? I can’t see where it was coming
from.
This is a very good catch!
I "fixed" a missing identifier by this, but it happened to be skipping a
really important check: Is our PEG written as PEG understood like our
PEG definition in sexps?
The `Grammar` is coming from the processing of `peg-as-peg` as a
peg-string. This is what I was missing.
This was hiding some errors that I fixed, and now I'm pretty confident
of the thing. Wow! This was a very good one! Thanks for pointing it out.
From 64a17be08581465d11185b4a0ca636354d2f944c Mon Sep 17 00:00:00 2001
From: Ekaitz Zarraga <eka...@elenq.tech>
Date: Fri, 11 Oct 2024 14:24:30 +0200
Subject: [PATCH v3 2/2] PEG: Add support for `not-in-range` and [^...]
Modern PEG supports inversed class like `[^a-z]` that would get any
character not in the `a-z` range. This commit adds support for that and
also for a new `not-in-range` PEG pattern for scheme.
* module/ice-9/peg/codegen.scm (cg-not-in-range): New function.
* module/ice-9/peg/string-peg.scm: Add support for `[^...]`
* test-suite/tests/peg.test: Test it.
* doc/ref/api-peg.texi: Document accordingly.
This one LGTM.
In addition to the issues mentioned above, could you add an entry in the
‘NEWS’ file, probably under a new “New interfaces and functionality”
heading?
Yes!
Added it in both commits: In the first commit I added that PEG was
improved, and in the second that `not-in-range` was added.
Thanks,
Ludo’.
Attached new version of both patches.
Thanks for the help, the kind of things you point out are the ones that
I wanted help with! Thanks! Now I'm a better matcher.
This helps me improve,
Ekaitz
From 7941589941fe7a0fb6f71b8681ccb1d976946912 Mon Sep 17 00:00:00 2001
From: Ekaitz Zarraga <eka...@elenq.tech>
Date: Fri, 11 Oct 2024 14:24:30 +0200
Subject: [PATCH v4 2/2] PEG: Add support for `not-in-range` and [^...]
Modern PEG supports inversed class like `[^a-z]` that would get any
character not in the `a-z` range. This commit adds support for that and
also for a new `not-in-range` PEG pattern for scheme.
* module/ice-9/peg/codegen.scm (cg-not-in-range): New function.
* module/ice-9/peg/string-peg.scm: Add support for `[^...]`
* test-suite/tests/peg.test: Test it.
* doc/ref/api-peg.texi: Document accordingly.
---
NEWS | 3 ++-
doc/ref/api-peg.texi | 8 +++++++
module/ice-9/peg/codegen.scm | 22 +++++++++++++++++++
module/ice-9/peg/string-peg.scm | 38 ++++++++++++++++++++++++++++++---
test-suite/tests/peg.test | 6 +++++-
5 files changed, 72 insertions(+), 5 deletions(-)
diff --git a/NEWS b/NEWS
index df43f3754..17ef560b1 100644
--- a/NEWS
+++ b/NEWS
@@ -32,7 +32,8 @@ Changes in 3.0.11 (since 3.0.10)
** PEG parser
PEG parser has been rewritten to cover all the functionality defined in
-<https://bford.info/pub/lang/peg.pdf>.
+<https://bford.info/pub/lang/peg.pdf>. Also added the `not-in-range` pattern
+to `(ice-9 peg)` that is also available from PEG strings via `[^...]`.
Changes in 3.0.10 (since 3.0.9)
diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi
index 84a9e6c6b..edb090b20 100644
--- a/doc/ref/api-peg.texi
+++ b/doc/ref/api-peg.texi
@@ -147,6 +147,14 @@ Parses any character falling between @var{a} and @var{z}.
@code{(range #\a #\z)}
@end deftp
+@deftp {PEG Pattern} {inverse range of characters} a z
+Parses any character not falling between @var{a} and @var{z}.
+
+@code{"[^a-z]"}
+
+@code{(not-in-range #\a #\z)}
+@end deftp
+
Example:
@example
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
index d80c3e849..82367ef55 100644
--- a/module/ice-9/peg/codegen.scm
+++ b/module/ice-9/peg/codegen.scm
@@ -140,6 +140,27 @@ return EXP."
((none) #`(list (1+ pos) '()))
(else (error "bad accum" accum))))))))))
+;; Generates code for matching a range of characters not between start and end.
+;; E.g.: (cg-not-in-range syntax #\a #\z 'body)
+(define (cg-not-in-range pat accum)
+ (syntax-case pat ()
+ ((start end)
+ (if (not (and (char? (syntax->datum #'start))
+ (char? (syntax->datum #'end))))
+ (error "range PEG should have characters after it; instead got"
+ #'start #'end))
+ #`(lambda (str len pos)
+ (and (< pos len)
+ (let ((c (string-ref str pos)))
+ (and (or (char<? c start) (char>? c end))
+ #,(case accum
+ ((all) #`(list (1+ pos)
+ (list 'cg-not-in-range (string c))))
+ ((name) #`(list (1+ pos) 'cg-not-in-range))
+ ((body) #`(list (1+ pos) (string c)))
+ ((none) #`(list (1+ pos) '()))
+ (else (error "bad accum" accum))))))))))
+
;; Generate code to match a pattern and do nothing with the result
(define (cg-ignore pat accum)
(syntax-case pat ()
@@ -304,6 +325,7 @@ return EXP."
(assq-set! peg-compiler-alist symbol function)))
(add-peg-compiler! 'range cg-range)
+(add-peg-compiler! 'not-in-range cg-not-in-range)
(add-peg-compiler! 'ignore cg-ignore)
(add-peg-compiler! 'capture cg-capture)
(add-peg-compiler! 'and cg-and)
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
index 4b923220a..0026f8930 100644
--- a/module/ice-9/peg/string-peg.scm
+++ b/module/ice-9/peg/string-peg.scm
@@ -54,7 +54,7 @@ Prefix <-- (AND / NOT)? Suffix
Suffix <-- Primary (QUESTION / STAR / PLUS)?
Primary <-- Identifier !LEFTARROW
/ OPEN Expression CLOSE
- / Literal / Class / DOT
+ / Literal / Class / NotInClass / DOT
# Lexical syntax
Identifier <-- IdentStart IdentCont* Spacing
@@ -64,6 +64,7 @@ IdentCont <- IdentStart / [0-9]
Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing
/ DQUOTE (!DQUOTE Char)* DQUOTE Spacing
+NotInClass <-- OPENBRACKET NOTIN (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing
Class <-- OPENBRACKET !NOTIN (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing
Range <-- Char DASH Char / Char
Char <-- '\\\\' [nrt'\"\\[\\]\\\\]
@@ -78,6 +79,7 @@ DQUOTE < [\"]
DASH < '-'
OPENBRACKET < '['
CLOSEBRACKET < ']'
+NOTIN < '^'
SLASH < '/' Spacing
AND <-- '&' Spacing
NOT <-- '!' Spacing
@@ -122,6 +124,7 @@ EndOfFile < !.
(and OPEN Expression CLOSE)
Literal
Class
+ NotInClass
DOT))
(define-sexp-parser Identifier all
(and IdentStart (* IdentCont) Spacing))
@@ -133,7 +136,11 @@ EndOfFile < !.
(or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing)
(and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing)))
(define-sexp-parser Class all
- (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
+ (and OPENBRACKET (not-followed-by NOTIN)
+ (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
+(define-sexp-parser NotInClass all
+ (and OPENBRACKET NOTIN
+ (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
(define-sexp-parser Range all
(or (and Char DASH Char) Char))
(define-sexp-parser Char all
@@ -143,6 +150,8 @@ EndOfFile < !.
(and (not-followed-by "\\") peg-any)))
(define-sexp-parser LEFTARROW body
(and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions
+(define-sexp-parser NOTIN none
+ (and "^"))
(define-sexp-parser SLASH none
(and "/" Spacing))
(define-sexp-parser AND all
@@ -279,6 +288,7 @@ EndOfFile < !.
('Identifier (Identifier->defn value for-syntax))
('Expression (Expression->defn value for-syntax))
('Literal (Literal->defn value for-syntax))
+ ('NotInClass (NotInClass->defn value for-syntax))
('Class (Class->defn value for-syntax)))))
;; (Identifier "hello")
@@ -291,13 +301,35 @@ EndOfFile < !.
(define (Literal->defn lst for-syntax)
(apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst))))
-;; TODO: empty Class can happen: `[]`, but what does it represent?
+;; (NotInClass ...)
+;; `-> (and ...)
+(define (NotInClass->defn lst for-syntax)
+ #`(and #,@(map (lambda (x) (NotInRange->defn x for-syntax))
+ (cdr lst))))
+
;; (Class ...)
;; `-> (or ...)
(define (Class->defn lst for-syntax)
#`(or #,@(map (lambda (x) (Range->defn x for-syntax))
(cdr lst))))
+;; NOTE: It's coming from NotInClass.
+;; For one character:
+;; (Range (Char "a"))
+;; `-> (not-in-range #\a #\a)
+;; Or for a range:
+;; (Range (Char "a") (Char "b"))
+;; `-> (not-in-range #\a #\b)
+(define (NotInRange->defn lst for-syntax)
+ (match lst
+ (('Range c)
+ (let ((ch (Char->defn c for-syntax)))
+ #`(not-in-range #,ch #,ch)))
+ (('Range range-beginning range-end)
+ #`(not-in-range
+ #,(Char->defn range-beginning for-syntax)
+ #,(Char->defn range-end for-syntax)))))
+
;; For one character:
;; (Range (Char "a"))
;; `-> "a"
diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index 1136c03f1..d9e3e1b22 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -38,6 +38,7 @@
(Identifier Identifier)
(Literal Literal)
(Class Class)
+ (NotInClass NotInClass)
(Range Range)
(Char Char)
(LEFTARROW LEFTARROW)
@@ -85,7 +86,7 @@
End <-- '*)'
C <- Begin N* End
N <- C / (!Begin !End Z)
-Z <- .")
+Z <- [^X-Z]") ;; Forbid some characters to test not-in-range
;; A short /etc/passwd file.
(define *etc-passwd*
@@ -125,6 +126,9 @@ SLASH < '/'")
(match-pattern C "(*blah*)")
(make-prec 0 8 "(*blah*)"
'((Begin "(*") "blah" (End "*)")))))
+ (pass-if
+ "simple comment with forbidden char"
+ (not (match-pattern C "(*blYh*)")))
(pass-if
"simple comment padded"
(equal?
--
2.46.0
From 459d83ec2710bed5feb3d2a9ca28da8ede9da007 Mon Sep 17 00:00:00 2001
From: Ekaitz Zarraga <eka...@elenq.tech>
Date: Wed, 11 Sep 2024 21:19:26 +0200
Subject: [PATCH v4 1/2] PEG: Add full support for PEG + some extensions
This commit adds support for PEG as described in:
<https://bford.info/pub/lang/peg.pdf>
It adds support for the missing features (comments, underscores in
identifiers and escaping) while keeping the extensions (dashes in
identifiers, < and <--).
The naming system tries to be as close as possible to the one proposed
in the paper.
* module/ice-9/peg/string-peg.scm: Rewrite PEG parser.
* test-suite/tests/peg.test: Fix import
---
NEWS | 7 +
doc/ref/api-peg.texi | 8 +-
module/ice-9/peg/string-peg.scm | 446 ++++++++++++++++++++------------
test-suite/tests/peg.test | 32 ++-
4 files changed, 313 insertions(+), 180 deletions(-)
diff --git a/NEWS b/NEWS
index 9fd14c39d..df43f3754 100644
--- a/NEWS
+++ b/NEWS
@@ -27,6 +27,13 @@ Changes in 3.0.11 (since 3.0.10)
** Guile is compiled with -fexcess-precision=standard for i[3456]86 when possible
(<https://debbugs.gnu.org/43262>)
+* New interfaces and functionality
+
+** PEG parser
+
+PEG parser has been rewritten to cover all the functionality defined in
+<https://bford.info/pub/lang/peg.pdf>.
+
Changes in 3.0.10 (since 3.0.9)
diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi
index d34ddc64c..84a9e6c6b 100644
--- a/doc/ref/api-peg.texi
+++ b/doc/ref/api-peg.texi
@@ -17,6 +17,10 @@ Wikipedia has a clear and concise introduction to PEGs if you want to
familiarize yourself with the syntax:
@url{http://en.wikipedia.org/wiki/Parsing_expression_grammar}.
+The paper that introduced PEG contains a more detailed description of how PEG
+works, and describes its syntax in detail:
+@url{https://bford.info/pub/lang/peg.pdf}
+
The @code{(ice-9 peg)} module works by compiling PEGs down to lambda
expressions. These can either be stored in variables at compile-time by
the define macros (@code{define-peg-pattern} and
@@ -216,8 +220,8 @@ should propagate up the parse tree. The normal @code{<-} propagates the
matched text up the parse tree, @code{<--} propagates the matched text
up the parse tree tagged with the name of the nonterminal, and @code{<}
discards that matched text and propagates nothing up the parse tree.
-Also, nonterminals may consist of any alphanumeric character or a ``-''
-character (in normal PEGs nonterminals can only be alphabetic).
+Also, nonterminals may include ``-'' character, while in normal PEG it is not
+allowed.
For example, if we:
@lisp
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
index 45ed14bb1..4b923220a 100644
--- a/module/ice-9/peg/string-peg.scm
+++ b/module/ice-9/peg/string-peg.scm
@@ -1,6 +1,7 @@
;;;; string-peg.scm --- representing PEG grammars as strings
;;;;
-;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+;;;; Copyright (C) 2010, 2011, Free Software Foundation, Inc.
+;;;; Copyright (C) 2024 Ekaitz Zarraga <eka...@elenq.tech>
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@@ -21,10 +22,15 @@
#:export (peg-as-peg
define-peg-string-patterns
peg-grammar)
+ #:use-module (ice-9 match)
#:use-module (ice-9 peg using-parsers)
+ #:use-module (srfi srfi-1)
#:use-module (ice-9 peg codegen)
#:use-module (ice-9 peg simplify-tree))
+;; This module provides support for PEG as described in:
+;; <https://bford.info/pub/lang/peg.pdf>
+
;; Gets the left-hand depth of a list.
(define (depth lst)
(if (or (not (list? lst)) (null? lst))
@@ -38,22 +44,58 @@
;; Grammar for PEGs in PEG grammar.
(define peg-as-peg
-"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+
-pattern <-- alternative (SLASH sp alternative)*
-alternative <-- ([!&]? sp suffix)+
-suffix <-- primary ([*+?] sp)*
-primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<'
-literal <-- ['] (!['] .)* ['] sp
-charclass <-- LB (!']' (CCrange / CCsingle))* RB sp
-CCrange <-- . '-' .
-CCsingle <-- .
-nonterminal <-- [a-zA-Z0-9-]+ sp
-sp < [ \t\n]*
-SLASH < '/'
-LB < '['
-RB < ']'
+"# Hierarchical syntax
+Grammar <-- Spacing Definition+ EndOfFile
+Definition <-- Identifier LEFTARROW Expression
+
+Expression <-- Sequence (SLASH Sequence)*
+Sequence <-- Prefix*
+Prefix <-- (AND / NOT)? Suffix
+Suffix <-- Primary (QUESTION / STAR / PLUS)?
+Primary <-- Identifier !LEFTARROW
+ / OPEN Expression CLOSE
+ / Literal / Class / DOT
+
+# Lexical syntax
+Identifier <-- IdentStart IdentCont* Spacing
+# NOTE: `-` is an extension
+IdentStart <- [a-zA-Z_] / '-'
+IdentCont <- IdentStart / [0-9]
+
+Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing
+ / DQUOTE (!DQUOTE Char)* DQUOTE Spacing
+Class <-- OPENBRACKET !NOTIN (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing
+Range <-- Char DASH Char / Char
+Char <-- '\\\\' [nrt'\"\\[\\]\\\\]
+ / '\\\\' [0-7][0-7][0-7]
+ / '\\\\' [0-7][0-7]?
+ / !'\\\\' .
+
+# NOTE: `<--` and `<` are extensions
+LEFTARROW <- ('<--' / '<-' / '<') Spacing
+SQUOTE < [']
+DQUOTE < [\"]
+DASH < '-'
+OPENBRACKET < '['
+CLOSEBRACKET < ']'
+SLASH < '/' Spacing
+AND <-- '&' Spacing
+NOT <-- '!' Spacing
+QUESTION <-- '?' Spacing
+STAR <-- '*' Spacing
+PLUS <-- '+' Spacing
+OPEN < '(' Spacing
+CLOSE < ')' Spacing
+DOT <-- '.' Spacing
+
+Spacing < (Space / Comment)*
+Comment < '#' (!EndOfLine .)* EndOfLine
+Space < ' ' / '\t' / EndOfLine
+EndOfLine < '\\r\\n' / '\\n' / '\\r'
+EndOfFile < !.
")
+
(define-syntax define-sexp-parser
(lambda (x)
(syntax-case x ()
@@ -63,35 +105,78 @@ RB < ']'
(syn (wrap-parser-for-users x matchf accumsym #'sym)))
#`(define sym #,syn))))))
-(define-sexp-parser peg-grammar all
- (+ (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern)))
-(define-sexp-parser peg-pattern all
- (and peg-alternative
- (* (and (ignore "/") peg-sp peg-alternative))))
-(define-sexp-parser peg-alternative all
- (+ (and (? (or "!" "&")) peg-sp peg-suffix)))
-(define-sexp-parser peg-suffix all
- (and peg-primary (* (and (or "*" "+" "?") peg-sp))))
-(define-sexp-parser peg-primary all
- (or (and "(" peg-sp peg-pattern ")" peg-sp)
- (and "." peg-sp)
- peg-literal
- peg-charclass
- (and peg-nonterminal (not-followed-by "<"))))
-(define-sexp-parser peg-literal all
- (and "'" (* (and (not-followed-by "'") peg-any)) "'" peg-sp))
-(define-sexp-parser peg-charclass all
- (and (ignore "[")
- (* (and (not-followed-by "]")
- (or charclass-range charclass-single)))
- (ignore "]")
- peg-sp))
-(define-sexp-parser charclass-range all (and peg-any "-" peg-any))
-(define-sexp-parser charclass-single all peg-any)
-(define-sexp-parser peg-nonterminal all
- (and (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-")) peg-sp))
-(define-sexp-parser peg-sp none
- (* (or " " "\t" "\n")))
+(define-sexp-parser Grammar all
+ (and Spacing (+ Definition) EndOfFile))
+(define-sexp-parser Definition all
+ (and Identifier LEFTARROW Expression))
+(define-sexp-parser Expression all
+ (and Sequence (* (and SLASH Sequence))))
+(define-sexp-parser Sequence all
+ (* Prefix))
+(define-sexp-parser Prefix all
+ (and (? (or AND NOT)) Suffix))
+(define-sexp-parser Suffix all
+ (and Primary (? (or QUESTION STAR PLUS))))
+(define-sexp-parser Primary all
+ (or (and Identifier (not-followed-by LEFTARROW))
+ (and OPEN Expression CLOSE)
+ Literal
+ Class
+ DOT))
+(define-sexp-parser Identifier all
+ (and IdentStart (* IdentCont) Spacing))
+(define-sexp-parser IdentStart body
+ (or (or (range #\a #\z) (range #\A #\Z) "_") "-")) ; NOTE: - is an extension
+(define-sexp-parser IdentCont body
+ (or IdentStart (range #\0 #\9)))
+(define-sexp-parser Literal all
+ (or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing)
+ (and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing)))
+(define-sexp-parser Class all
+ (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
+(define-sexp-parser Range all
+ (or (and Char DASH Char) Char))
+(define-sexp-parser Char all
+ (or (and "\\" (or "n" "r" "t" "'" "\"" "[" "]" "\\"))
+ (and "\\" (range #\0 #\7) (range #\0 #\7) (range #\0 #\7))
+ (and "\\" (range #\0 #\7) (? (range #\0 #\7)))
+ (and (not-followed-by "\\") peg-any)))
+(define-sexp-parser LEFTARROW body
+ (and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions
+(define-sexp-parser SLASH none
+ (and "/" Spacing))
+(define-sexp-parser AND all
+ (and "&" Spacing))
+(define-sexp-parser NOT all
+ (and "!" Spacing))
+(define-sexp-parser QUESTION all
+ (and "?" Spacing))
+(define-sexp-parser STAR all
+ (and "*" Spacing))
+(define-sexp-parser PLUS all
+ (and "+" Spacing))
+(define-sexp-parser OPEN none
+ (and "(" Spacing))
+(define-sexp-parser CLOSE none
+ (and ")" Spacing))
+(define-sexp-parser DOT all
+ (and "." Spacing))
+(define-sexp-parser SQUOTE none "'")
+(define-sexp-parser DQUOTE none "\"")
+(define-sexp-parser OPENBRACKET none "[")
+(define-sexp-parser CLOSEBRACKET none "]")
+(define-sexp-parser DASH none "-")
+(define-sexp-parser Spacing none
+ (* (or Space Comment)))
+(define-sexp-parser Comment none
+ (and "#" (* (and (not-followed-by EndOfLine) peg-any)) EndOfLine))
+(define-sexp-parser Space none
+ (or " " "\t" EndOfLine))
+(define-sexp-parser EndOfLine none
+ (or "\r\n" "\n" "\r"))
+(define-sexp-parser EndOfFile none
+ (not-followed-by peg-any))
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; PARSE STRING PEGS
@@ -101,7 +186,7 @@ RB < ']'
;; will define all of the nonterminals in the grammar with equivalent
;; PEG s-expressions.
(define (peg-parser str for-syntax)
- (let ((parsed (match-pattern peg-grammar str)))
+ (let ((parsed (match-pattern Grammar str)))
(if (not parsed)
(begin
;; (display "Invalid PEG grammar!\n")
@@ -110,11 +195,154 @@ RB < ']'
(cond
((or (not (list? lst)) (null? lst))
lst)
- ((eq? (car lst) 'peg-grammar)
- #`(begin
- #,@(map (lambda (x) (peg-nonterm->defn x for-syntax))
- (context-flatten (lambda (lst) (<= (depth lst) 2))
- (cdr lst))))))))))
+ ((eq? (car lst) 'Grammar)
+ (Grammar->defn lst for-syntax)))))))
+
+;; (Grammar (Definition ...) (Definition ...))
+(define (Grammar->defn lst for-syntax)
+ #`(begin
+ #,@(map (lambda (x) (Definition->defn x for-syntax))
+ (context-flatten (lambda (lst) (<= (depth lst) 1))
+ (cdr lst)))))
+
+;; (Definition (Identifier "Something") "<-" (Expression ...))
+;; `-> (define-peg-pattern Something 'all ...)
+(define (Definition->defn lst for-syntax)
+ (match lst
+ (('Definition ('Identifier identifier) grabber expression)
+ #`(define-peg-pattern
+ #,(datum->syntax for-syntax (string->symbol identifier))
+ #,(match grabber
+ ("<--" (datum->syntax for-syntax 'all))
+ ("<-" (datum->syntax for-syntax 'body))
+ ("<" (datum->syntax for-syntax 'none)))
+ #,(compressor
+ (Expression->defn expression for-syntax)
+ for-syntax)))))
+
+;; (Expression X)
+;; `-> (or X)
+;; (Expression X Y)
+;; `-> (or X Y)
+;; (Expression X (Y Z ...))
+;; `-> (or X Y Z ...)
+(define (Expression->defn lst for-syntax)
+ (match lst
+ (('Expression seq ...)
+ #`(or #,@(map (lambda (x) (Sequence->defn x for-syntax))
+ (keyword-flatten '(Sequence) seq))))))
+
+;; (Sequence X)
+;; `-> (and X)
+;; (Sequence X Y)
+;; `-> (and X Y)
+;; (Sequence X (Y Z ...))
+;; `-> (and X Y Z ...)
+(define (Sequence->defn lst for-syntax)
+ (match lst
+ (('Sequence pre ...)
+ #`(and #,@(map (lambda (x) (Prefix->defn x for-syntax))
+ (keyword-flatten '(Prefix) pre))))))
+
+;; (Prefix (Suffix ...))
+;; `-> (...)
+;; (Prefix (NOT "!") (Suffix ...))
+;; `-> (not-followed-by ...)
+;; (Prefix (AND "&") (Suffix ...))
+;; `-> (followed-by ...)
+(define (Prefix->defn lst for-syntax)
+ (match lst
+ (('Prefix ('AND _) su) #`(followed-by #,(Suffix->defn su for-syntax)))
+ (('Prefix ('NOT _) su) #`(not-followed-by #,(Suffix->defn su for-syntax)))
+ (('Prefix suffix) (Suffix->defn suffix for-syntax))))
+
+;; (Suffix (Primary ...))
+;; `-> (...)
+;; (Suffix (Primary ...) (STAR "*"))
+;; `-> (* ...)
+;; (Suffix (Primary ...) (QUESTION "?"))
+;; `-> (? ...)
+;; (Suffix (Primary ...) (PLUS "+"))
+;; `-> (+ ...)
+(define (Suffix->defn lst for-syntax)
+ (match lst
+ (('Suffix prim) (Primary->defn prim for-syntax))
+ (('Suffix prim ('STAR _)) #`(* #,(Primary->defn prim for-syntax)))
+ (('Suffix prim ('QUESTION _)) #`(? #,(Primary->defn prim for-syntax)))
+ (('Suffix prim ('PLUS _)) #`(+ #,(Primary->defn prim for-syntax)))))
+
+
+(define (Primary->defn lst for-syntax)
+ (let ((value (second lst)))
+ (match (car value)
+ ('DOT #'peg-any)
+ ('Identifier (Identifier->defn value for-syntax))
+ ('Expression (Expression->defn value for-syntax))
+ ('Literal (Literal->defn value for-syntax))
+ ('Class (Class->defn value for-syntax)))))
+
+;; (Identifier "hello")
+;; `-> hello
+(define (Identifier->defn lst for-syntax)
+ (datum->syntax for-syntax (string->symbol (second lst))))
+
+;; (Literal (Char "a") (Char "b") (Char "c"))
+;; `-> "abc"
+(define (Literal->defn lst for-syntax)
+ (apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst))))
+
+;; TODO: empty Class can happen: `[]`, but what does it represent?
+;; (Class ...)
+;; `-> (or ...)
+(define (Class->defn lst for-syntax)
+ #`(or #,@(map (lambda (x) (Range->defn x for-syntax))
+ (cdr lst))))
+
+;; For one character:
+;; (Range (Char "a"))
+;; `-> "a"
+;; Or for a range:
+;; (Range (Char "a") (Char "b"))
+;; `-> (range #\a #\b)
+(define (Range->defn lst for-syntax)
+ (match lst
+ (('Range ch)
+ (string (Char->defn ch for-syntax)))
+ (('Range range-beginning range-end)
+ #`(range
+ #,(Char->defn range-beginning for-syntax)
+ #,(Char->defn range-end for-syntax)))))
+
+;; (Char "a")
+;; `-> #\a
+;; (Char "\\n")
+;; `-> #\newline
+;; (Char "\\135")
+;; `-> #\]
+(define (Char->defn lst for-syntax)
+ (let* ((charstr (second lst))
+ (first (string-ref charstr 0)))
+ (cond
+ ((= 1 (string-length charstr)) first)
+ ((char-numeric? (string-ref charstr 1))
+ (integer->char
+ (reduce + 0
+ (map
+ (lambda (x y)
+ (* (- (char->integer x) (char->integer #\0)) y))
+ (reverse (string->list charstr 1))
+ '(1 8 64)))))
+ (else
+ (case (string-ref charstr 1)
+ ((#\n) #\newline)
+ ((#\r) #\return)
+ ((#\t) #\tab)
+ ((#\') #\')
+ ((#\]) #\])
+ ((#\\) #\\)
+ ((#\[) #\[))))))
+
+(define peg-grammar Grammar)
;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and
;; defines all the appropriate nonterminals.
@@ -124,119 +352,6 @@ RB < ']'
((_ str)
(peg-parser (syntax->datum #'str) x)))))
-;; lst has format (nonterm grabber pattern), where
-;; nonterm is a symbol (the name of the nonterminal),
-;; grabber is a string (either "<", "<-" or "<--"), and
-;; pattern is the parse of a PEG pattern expressed as as string.
-(define (peg-nonterm->defn lst for-syntax)
- (let* ((nonterm (car lst))
- (grabber (cadr lst))
- (pattern (caddr lst))
- (nonterm-name (datum->syntax for-syntax
- (string->symbol (cadr nonterm)))))
- #`(define-peg-pattern #,nonterm-name
- #,(cond
- ((string=? grabber "<--") (datum->syntax for-syntax 'all))
- ((string=? grabber "<-") (datum->syntax for-syntax 'body))
- (else (datum->syntax for-syntax 'none)))
- #,(compressor (peg-pattern->defn pattern for-syntax) for-syntax))))
-
-;; lst has format ('peg-pattern ...).
-;; After the context-flatten, (cdr lst) has format
-;; (('peg-alternative ...) ...), where the outer list is a collection
-;; of elements from a '/' alternative.
-(define (peg-pattern->defn lst for-syntax)
- #`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax))
- (context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
- (cdr lst)))))
-
-;; lst has format ('peg-alternative ...).
-;; After the context-flatten, (cdr lst) has the format
-;; (item ...), where each item has format either ("!" ...), ("&" ...),
-;; or ('peg-suffix ...).
-(define (peg-alternative->defn lst for-syntax)
- #`(and #,@(map (lambda (x) (peg-body->defn x for-syntax))
- (context-flatten (lambda (x) (or (string? (car x))
- (eq? (car x) 'peg-suffix)))
- (cdr lst)))))
-
-;; lst has the format either
-;; ("!" ('peg-suffix ...)), ("&" ('peg-suffix ...)), or
-;; ('peg-suffix ...).
-(define (peg-body->defn lst for-syntax)
- (cond
- ((equal? (car lst) "&")
- #`(followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
- ((equal? (car lst) "!")
- #`(not-followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
- ((eq? (car lst) 'peg-suffix)
- (peg-suffix->defn lst for-syntax))
- (else `(peg-parse-body-fail ,lst))))
-
-;; lst has format ('peg-suffix <peg-primary> (? (/ "*" "?" "+")))
-(define (peg-suffix->defn lst for-syntax)
- (let ((inner-defn (peg-primary->defn (cadr lst) for-syntax)))
- (cond
- ((null? (cddr lst))
- inner-defn)
- ((equal? (caddr lst) "*")
- #`(* #,inner-defn))
- ((equal? (caddr lst) "?")
- #`(? #,inner-defn))
- ((equal? (caddr lst) "+")
- #`(+ #,inner-defn)))))
-
-;; Parse a primary.
-(define (peg-primary->defn lst for-syntax)
- (let ((el (cadr lst)))
- (cond
- ((list? el)
- (cond
- ((eq? (car el) 'peg-literal)
- (peg-literal->defn el for-syntax))
- ((eq? (car el) 'peg-charclass)
- (peg-charclass->defn el for-syntax))
- ((eq? (car el) 'peg-nonterminal)
- (datum->syntax for-syntax (string->symbol (cadr el))))))
- ((string? el)
- (cond
- ((equal? el "(")
- (peg-pattern->defn (caddr lst) for-syntax))
- ((equal? el ".")
- (datum->syntax for-syntax 'peg-any))
- (else (datum->syntax for-syntax
- `(peg-parse-any unknown-string ,lst)))))
- (else (datum->syntax for-syntax
- `(peg-parse-any unknown-el ,lst))))))
-
-;; Trims characters off the front and end of STR.
-;; (trim-1chars "'ab'") -> "ab"
-(define (trim-1chars str) (substring str 1 (- (string-length str) 1)))
-
-;; Parses a literal.
-(define (peg-literal->defn lst for-syntax)
- (datum->syntax for-syntax (trim-1chars (cadr lst))))
-
-;; Parses a charclass.
-(define (peg-charclass->defn lst for-syntax)
- #`(or
- #,@(map
- (lambda (cc)
- (cond
- ((eq? (car cc) 'charclass-range)
- #`(range #,(datum->syntax
- for-syntax
- (string-ref (cadr cc) 0))
- #,(datum->syntax
- for-syntax
- (string-ref (cadr cc) 2))))
- ((eq? (car cc) 'charclass-single)
- (datum->syntax for-syntax (cadr cc)))))
- (context-flatten
- (lambda (x) (or (eq? (car x) 'charclass-range)
- (eq? (car x) 'charclass-single)))
- (cdr lst)))))
-
;; Compresses a list to save the optimizer work.
;; e.g. (or (and a)) -> a
(define (compressor-core lst)
@@ -263,11 +378,10 @@ RB < ']'
(let ((string (syntax->datum #'str-stx)))
(compile-peg-pattern
(compressor
- (peg-pattern->defn
- (peg:tree (match-pattern peg-pattern string)) #'str-stx)
+ (Expression->defn
+ (peg:tree (match-pattern Expression string)) #'str-stx)
#'str-stx)
(if (eq? accum 'all) 'body accum))))
(else (error "Bad embedded PEG string" args))))
(add-peg-compiler! 'peg peg-string-compile)
-
diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index f516571e8..1136c03f1 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -28,17 +28,25 @@
;; the nonterminals defined in the PEG parser written with
;; S-expressions.
(define grammar-mapping
- '((grammar peg-grammar)
- (pattern peg-pattern)
- (alternative peg-alternative)
- (suffix peg-suffix)
- (primary peg-primary)
- (literal peg-literal)
- (charclass peg-charclass)
- (CCrange charclass-range)
- (CCsingle charclass-single)
- (nonterminal peg-nonterminal)
- (sp peg-sp)))
+ '((Grammar Grammar)
+ (Definition Definition)
+ (Expression Expression)
+ (Sequence Sequence)
+ (Prefix Prefix)
+ (Suffix Suffix)
+ (Primary Primary)
+ (Identifier Identifier)
+ (Literal Literal)
+ (Class Class)
+ (Range Range)
+ (Char Char)
+ (LEFTARROW LEFTARROW)
+ (AND AND)
+ (NOT NOT)
+ (QUESTION QUESTION)
+ (STAR STAR)
+ (PLUS PLUS)
+ (DOT DOT)))
;; Transforms the nonterminals defined in the PEG parser written as a PEG to the nonterminals defined in the PEG parser written with S-expressions.
(define (grammar-transform x)
@@ -69,7 +77,7 @@
(peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg) peg-as-peg)))
(tree-map
grammar-transform
- (peg:tree (match-pattern grammar (@@ (ice-9 peg) peg-as-peg)))))))
+ (peg:tree (match-pattern Grammar (@@ (ice-9 peg) peg-as-peg)))))))
;; A grammar for pascal-style comments from Wikipedia.
(define comment-grammar
--
2.46.0