Hi Ludovic,

Thanks for the review

On 2024-10-20 12:10, Ludovic Courtès wrote:

+(define (Primary->defn lst for-syntax)
+  (let ((value (second lst)))
+    (case (car value)
+      ('DOT        #'peg-any)
+      ('Identifier (Identifier->defn value for-syntax))
+      ('Expression (Expression->defn value for-syntax))
+      ('Literal    (Literal->defn value for-syntax))
+      ('Class      (Class->defn value for-syntax)))))

I get these compiler warnings:

--8<---------------cut here---------------start------------->8---
ice-9/peg/string-peg.scm:258:7: warning: duplicate datum quote in clause ((quote NOT) 
(quasisyntax (not-followed-by (unsyntax (Suffix->defn (third lst) for-syntax))))) of 
case expression (case (car suffix) ((quote AND) (quasisyntax (followed-by (unsyntax 
(Suffix->defn (third lst) for-syntax))))) ((quote NOT) (quasisyntax (not-followed-by 
(unsyntax (Suffix->defn (third lst) for-syntax))))) (else (Suffix->defn suffix 
for-syntax)))
ice-9/peg/string-peg.scm:277:9: warning: duplicate datum quote in clause 
((quote STAR) (quasisyntax (* (unsyntax out)))) of case expression (case (caar 
extra) ((quote QUESTION) (quasisyntax (? (unsyntax out)))) ((quote STAR) 
(quasisyntax (* (unsyntax out)))) ((quote PLUS) (quasisyntax (+ (unsyntax 
out)))))
ice-9/peg/string-peg.scm:278:9: warning: duplicate datum quote in clause 
((quote PLUS) (quasisyntax (+ (unsyntax out)))) of case expression (case (caar 
extra) ((quote QUESTION) (quasisyntax (? (unsyntax out)))) ((quote STAR) 
(quasisyntax (* (unsyntax out)))) ((quote PLUS) (quasisyntax (+ (unsyntax 
out)))))
ice-9/peg/string-peg.scm:284:7: warning: duplicate datum quote in clause ((quote Identifier) 
(Identifier->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax 
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression) 
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax)) 
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value 
for-syntax)))
ice-9/peg/string-peg.scm:285:7: warning: duplicate datum quote in clause ((quote Expression) 
(Expression->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax 
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression) 
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax)) 
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value 
for-syntax)))
ice-9/peg/string-peg.scm:286:7: warning: duplicate datum quote in clause ((quote Literal) 
(Literal->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax 
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression) 
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax)) 
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value 
for-syntax)))
ice-9/peg/string-peg.scm:287:7: warning: duplicate datum quote in clause ((quote NotInClass) 
(NotInClass->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax 
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression) 
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax)) 
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value 
for-syntax)))
ice-9/peg/string-peg.scm:288:7: warning: duplicate datum quote in clause ((quote Class) 
(Class->defn value for-syntax)) of case expression (case (car value) ((quote DOT) (syntax 
peg-any)) ((quote Identifier) (Identifier->defn value for-syntax)) ((quote Expression) 
(Expression->defn value for-syntax)) ((quote Literal) (Literal->defn value for-syntax)) 
((quote NotInClass) (NotInClass->defn value for-syntax)) ((quote Class) (Class->defn value 
for-syntax)))
--8<---------------cut here---------------end--------------->8---

And indeed, the correct syntax is:

   (case value (DOT …) (Identifier …) …)

or:

   (match value ('DOT …) ('Identifier …) …)

The former returns *unspecified* when passed a value not matched by any
clause, whereas the latter throws an error.

So I would recommend ‘match’.

I'm always in doubt with this but the thing worked.
I always check in the internet how to do the case, and I always get the warning... my bad!

At any rate, that makes me wonder whether this code path is tested at
all.  As written, ‘Primary->defn’ would always return *unspecified*.
Is there a test we could add?

I made some tests and it works, maybe unexpectedly:

        scheme@(guile-user)> (define a 'b)
        scheme@(guile-user)> (case a ('c 3)('b 1)('d 2))
;;; <stdin>:12:15: warning: duplicate datum quote in clause ((quote b) 1) of case expression (case a ((quote c) 3) ((quote b) 1) ((quote d) 2)) ;;; <stdin>:12:21: warning: duplicate datum quote in clause ((quote d) 2) of case expression (case a ((quote c) 3) ((quote b) 1) ((quote d) 2))
        $8 = 1

While your proposal doesn't:
        
        scheme@(guile-user)> (case a (c 3)(b 1)(d 2))
        While compiling expression:
        Syntax error:
unknown file:14:8: case: invalid clause in subform (c 3) of (case a (c 3) (b 1) (d 2))

I tested further and this is what it should be:

        scheme@(guile-user)> (case a ((b) 1)((d) 2))
        $1 = 1

I used match instead as you proposed and made it all work with no warnings and better error reporting.

+(define (Range->defn lst for-syntax)
    (cond
-   ((list? el)
-    (cond
-     ((eq? (car el) 'peg-literal)
-      (peg-literal->defn el for-syntax))
-     ((eq? (car el) 'peg-charclass)
-      (peg-charclass->defn el for-syntax))
-     ((eq? (car el) 'peg-nonterminal)
-      (datum->syntax for-syntax (string->symbol (cadr el))))))
-   ((string? el)
+    ((= 2 (length lst))
+     (second (second lst)))
+    ((= 3 (length lst))
+     #`(range
+         #,(Char->defn (second lst) for-syntax)
+         #,(Char->defn (third lst) for-syntax)))))

Keep in mind that ‘length’ is O(N), and that car/first, second/cadr are
frowned upon.  I would write it as:

   (match lst
     ((x y) y)
     ((x y z) #`(range …)))

Yeah, I was very bad at `match` when I wrote this thing :(
But! Now I spent some hours with it and rewrote the thing for `match`!

(Ideally with identifiers more meaningful than x, y, and z. :-))

  ;; Transforms the nonterminals defined in the PEG parser written as a PEG to 
the nonterminals defined in the PEG parser written with S-expressions.
  (define (grammar-transform x)
@@ -69,7 +77,7 @@
      (peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg) 
peg-as-peg)))
      (tree-map
       grammar-transform
-     (peg:tree (match-pattern grammar (@@ (ice-9 peg) peg-as-peg)))))))
+     (peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg) 
peg-as-peg)))))))

What happened to the ‘grammar’ binding?  I can’t see where it was coming
from.

This is a very good catch!
I "fixed" a missing identifier by this, but it happened to be skipping a really important check: Is our PEG written as PEG understood like our PEG definition in sexps?

The `Grammar` is coming from the processing of `peg-as-peg` as a peg-string. This is what I was missing.

This was hiding some errors that I fixed, and now I'm pretty confident of the thing. Wow! This was a very good one! Thanks for pointing it out.

 From 64a17be08581465d11185b4a0ca636354d2f944c Mon Sep 17 00:00:00 2001
From: Ekaitz Zarraga <eka...@elenq.tech>
Date: Fri, 11 Oct 2024 14:24:30 +0200
Subject: [PATCH v3 2/2] PEG: Add support for `not-in-range` and [^...]

Modern PEG supports inversed class like `[^a-z]` that would get any
character not in the `a-z` range. This commit adds support for that and
also for a new `not-in-range` PEG pattern for scheme.

* module/ice-9/peg/codegen.scm (cg-not-in-range): New function.
* module/ice-9/peg/string-peg.scm: Add support for `[^...]`
* test-suite/tests/peg.test: Test it.
* doc/ref/api-peg.texi: Document accordingly.

This one LGTM.

In addition to the issues mentioned above, could you add an entry in the
‘NEWS’ file, probably under a new “New interfaces and functionality”
heading?

Yes!
Added it in both commits: In the first commit I added that PEG was improved, and in the second that `not-in-range` was added.

Thanks,
Ludo’.

Attached new version of both patches.

Thanks for the help, the kind of things you point out are the ones that I wanted help with! Thanks! Now I'm a better matcher.

This helps me improve,
Ekaitz
From 7941589941fe7a0fb6f71b8681ccb1d976946912 Mon Sep 17 00:00:00 2001
From: Ekaitz Zarraga <eka...@elenq.tech>
Date: Fri, 11 Oct 2024 14:24:30 +0200
Subject: [PATCH v4 2/2] PEG: Add support for `not-in-range` and [^...]

Modern PEG supports inversed class like `[^a-z]` that would get any
character not in the `a-z` range. This commit adds support for that and
also for a new `not-in-range` PEG pattern for scheme.

* module/ice-9/peg/codegen.scm (cg-not-in-range): New function.
* module/ice-9/peg/string-peg.scm: Add support for `[^...]`
* test-suite/tests/peg.test: Test it.
* doc/ref/api-peg.texi: Document accordingly.
---
 NEWS                            |  3 ++-
 doc/ref/api-peg.texi            |  8 +++++++
 module/ice-9/peg/codegen.scm    | 22 +++++++++++++++++++
 module/ice-9/peg/string-peg.scm | 38 ++++++++++++++++++++++++++++++---
 test-suite/tests/peg.test       |  6 +++++-
 5 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/NEWS b/NEWS
index df43f3754..17ef560b1 100644
--- a/NEWS
+++ b/NEWS
@@ -32,7 +32,8 @@ Changes in 3.0.11 (since 3.0.10)
 ** PEG parser
 
 PEG parser has been rewritten to cover all the functionality defined in
-<https://bford.info/pub/lang/peg.pdf>.
+<https://bford.info/pub/lang/peg.pdf>. Also added the `not-in-range` pattern
+to `(ice-9 peg)` that is also available from PEG strings via `[^...]`.
 
 
 Changes in 3.0.10 (since 3.0.9)
diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi
index 84a9e6c6b..edb090b20 100644
--- a/doc/ref/api-peg.texi
+++ b/doc/ref/api-peg.texi
@@ -147,6 +147,14 @@ Parses any character falling between @var{a} and @var{z}.
 @code{(range #\a #\z)}
 @end deftp
 
+@deftp {PEG Pattern} {inverse range of characters} a z
+Parses any character not falling between @var{a} and @var{z}.
+
+@code{"[^a-z]"}
+
+@code{(not-in-range #\a #\z)}
+@end deftp
+
 Example:
 
 @example
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
index d80c3e849..82367ef55 100644
--- a/module/ice-9/peg/codegen.scm
+++ b/module/ice-9/peg/codegen.scm
@@ -140,6 +140,27 @@ return EXP."
                          ((none) #`(list (1+ pos) '()))
                          (else (error "bad accum" accum))))))))))
 
+;; Generates code for matching a range of characters not between start and end.
+;; E.g.: (cg-not-in-range syntax #\a #\z 'body)
+(define (cg-not-in-range pat accum)
+  (syntax-case pat ()
+    ((start end)
+     (if (not (and (char? (syntax->datum #'start))
+                   (char? (syntax->datum #'end))))
+         (error "range PEG should have characters after it; instead got"
+                #'start #'end))
+     #`(lambda (str len pos)
+         (and (< pos len)
+              (let ((c (string-ref str pos)))
+                (and (or (char<? c start) (char>? c end))
+                     #,(case accum
+                         ((all) #`(list (1+ pos)
+                                        (list 'cg-not-in-range (string c))))
+                         ((name) #`(list (1+ pos) 'cg-not-in-range))
+                         ((body) #`(list (1+ pos) (string c)))
+                         ((none) #`(list (1+ pos) '()))
+                         (else (error "bad accum" accum))))))))))
+
 ;; Generate code to match a pattern and do nothing with the result
 (define (cg-ignore pat accum)
   (syntax-case pat ()
@@ -304,6 +325,7 @@ return EXP."
         (assq-set! peg-compiler-alist symbol function)))
 
 (add-peg-compiler! 'range cg-range)
+(add-peg-compiler! 'not-in-range cg-not-in-range)
 (add-peg-compiler! 'ignore cg-ignore)
 (add-peg-compiler! 'capture cg-capture)
 (add-peg-compiler! 'and cg-and)
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
index 4b923220a..0026f8930 100644
--- a/module/ice-9/peg/string-peg.scm
+++ b/module/ice-9/peg/string-peg.scm
@@ -54,7 +54,7 @@ Prefix <-- (AND / NOT)? Suffix
 Suffix <-- Primary (QUESTION / STAR / PLUS)?
 Primary <-- Identifier !LEFTARROW
            / OPEN Expression CLOSE
-           / Literal / Class / DOT
+           / Literal / Class / NotInClass / DOT
 
 # Lexical syntax
 Identifier <-- IdentStart IdentCont* Spacing
@@ -64,6 +64,7 @@ IdentCont <- IdentStart / [0-9]
 
 Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing
         / DQUOTE (!DQUOTE Char)* DQUOTE Spacing
+NotInClass <-- OPENBRACKET NOTIN  (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing
 Class <-- OPENBRACKET !NOTIN  (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing
 Range <-- Char DASH Char / Char
 Char <-- '\\\\' [nrt'\"\\[\\]\\\\]
@@ -78,6 +79,7 @@ DQUOTE < [\"]
 DASH < '-'
 OPENBRACKET < '['
 CLOSEBRACKET < ']'
+NOTIN < '^'
 SLASH < '/' Spacing
 AND <-- '&' Spacing
 NOT <-- '!' Spacing
@@ -122,6 +124,7 @@ EndOfFile < !.
       (and OPEN Expression CLOSE)
       Literal
       Class
+      NotInClass
       DOT))
 (define-sexp-parser Identifier all
   (and IdentStart (* IdentCont) Spacing))
@@ -133,7 +136,11 @@ EndOfFile < !.
   (or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing)
       (and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing)))
 (define-sexp-parser Class all
-  (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
+  (and OPENBRACKET (not-followed-by NOTIN)
+       (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
+(define-sexp-parser NotInClass all
+  (and OPENBRACKET NOTIN
+       (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
 (define-sexp-parser Range all
   (or (and Char DASH Char) Char))
 (define-sexp-parser Char all
@@ -143,6 +150,8 @@ EndOfFile < !.
       (and (not-followed-by "\\") peg-any)))
 (define-sexp-parser LEFTARROW body
   (and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions
+(define-sexp-parser NOTIN none
+  (and "^"))
 (define-sexp-parser SLASH none
   (and "/" Spacing))
 (define-sexp-parser AND all
@@ -279,6 +288,7 @@ EndOfFile < !.
       ('Identifier (Identifier->defn value for-syntax))
       ('Expression (Expression->defn value for-syntax))
       ('Literal    (Literal->defn value for-syntax))
+      ('NotInClass (NotInClass->defn value for-syntax))
       ('Class      (Class->defn value for-syntax)))))
 
 ;; (Identifier "hello")
@@ -291,13 +301,35 @@ EndOfFile < !.
 (define (Literal->defn lst for-syntax)
   (apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst))))
 
-;; TODO: empty Class can happen: `[]`, but what does it represent?
+;; (NotInClass ...)
+;;  `-> (and ...)
+(define (NotInClass->defn lst for-syntax)
+  #`(and #,@(map (lambda (x) (NotInRange->defn x for-syntax))
+                 (cdr lst))))
+
 ;; (Class ...)
 ;;  `-> (or ...)
 (define (Class->defn lst for-syntax)
   #`(or #,@(map (lambda (x) (Range->defn x for-syntax))
                 (cdr lst))))
 
+;; NOTE: It's coming from NotInClass.
+;; For one character:
+;; (Range (Char "a"))
+;;  `-> (not-in-range #\a #\a)
+;; Or for a range:
+;; (Range (Char "a") (Char "b"))
+;;  `-> (not-in-range #\a #\b)
+(define (NotInRange->defn lst for-syntax)
+  (match lst
+    (('Range c)
+     (let ((ch (Char->defn c for-syntax)))
+       #`(not-in-range #,ch #,ch)))
+    (('Range range-beginning range-end)
+     #`(not-in-range
+         #,(Char->defn range-beginning for-syntax)
+         #,(Char->defn range-end       for-syntax)))))
+
 ;; For one character:
 ;; (Range (Char "a"))
 ;;  `-> "a"
diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index 1136c03f1..d9e3e1b22 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -38,6 +38,7 @@
     (Identifier Identifier)
     (Literal Literal)
     (Class Class)
+    (NotInClass NotInClass)
     (Range Range)
     (Char Char)
     (LEFTARROW LEFTARROW)
@@ -85,7 +86,7 @@
 End <-- '*)'
 C <- Begin N* End
 N <- C / (!Begin !End Z)
-Z <- .")
+Z <- [^X-Z]") ;; Forbid some characters to test not-in-range
 
 ;; A short /etc/passwd file.
 (define *etc-passwd*
@@ -125,6 +126,9 @@ SLASH < '/'")
     (match-pattern C "(*blah*)")
     (make-prec 0 8 "(*blah*)"
 	       '((Begin "(*") "blah" (End "*)")))))
+  (pass-if
+   "simple comment with forbidden char"
+   (not (match-pattern C "(*blYh*)")))
   (pass-if
    "simple comment padded"
    (equal?
-- 
2.46.0

From 459d83ec2710bed5feb3d2a9ca28da8ede9da007 Mon Sep 17 00:00:00 2001
From: Ekaitz Zarraga <eka...@elenq.tech>
Date: Wed, 11 Sep 2024 21:19:26 +0200
Subject: [PATCH v4 1/2] PEG: Add full support for PEG + some extensions

This commit adds support for PEG as described in:

    <https://bford.info/pub/lang/peg.pdf>

It adds support for the missing features (comments, underscores in
identifiers and escaping) while keeping the extensions (dashes in
identifiers, < and <--).

The naming system tries to be as close as possible to the one proposed
in the paper.

* module/ice-9/peg/string-peg.scm: Rewrite PEG parser.
* test-suite/tests/peg.test: Fix import
---
 NEWS                            |   7 +
 doc/ref/api-peg.texi            |   8 +-
 module/ice-9/peg/string-peg.scm | 446 ++++++++++++++++++++------------
 test-suite/tests/peg.test       |  32 ++-
 4 files changed, 313 insertions(+), 180 deletions(-)

diff --git a/NEWS b/NEWS
index 9fd14c39d..df43f3754 100644
--- a/NEWS
+++ b/NEWS
@@ -27,6 +27,13 @@ Changes in 3.0.11 (since 3.0.10)
 ** Guile is compiled with -fexcess-precision=standard for i[3456]86 when possible
    (<https://debbugs.gnu.org/43262>)
 
+* New interfaces and functionality
+
+** PEG parser
+
+PEG parser has been rewritten to cover all the functionality defined in
+<https://bford.info/pub/lang/peg.pdf>.
+
 
 Changes in 3.0.10 (since 3.0.9)
 
diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi
index d34ddc64c..84a9e6c6b 100644
--- a/doc/ref/api-peg.texi
+++ b/doc/ref/api-peg.texi
@@ -17,6 +17,10 @@ Wikipedia has a clear and concise introduction to PEGs if you want to
 familiarize yourself with the syntax:
 @url{http://en.wikipedia.org/wiki/Parsing_expression_grammar}.
 
+The paper that introduced PEG contains a more detailed description of how PEG
+works, and describes its syntax in detail:
+@url{https://bford.info/pub/lang/peg.pdf}
+
 The @code{(ice-9 peg)} module works by compiling PEGs down to lambda
 expressions.  These can either be stored in variables at compile-time by
 the define macros (@code{define-peg-pattern} and
@@ -216,8 +220,8 @@ should propagate up the parse tree.  The normal @code{<-} propagates the
 matched text up the parse tree, @code{<--} propagates the matched text
 up the parse tree tagged with the name of the nonterminal, and @code{<}
 discards that matched text and propagates nothing up the parse tree.
-Also, nonterminals may consist of any alphanumeric character or a ``-''
-character (in normal PEGs nonterminals can only be alphabetic).
+Also, nonterminals may include ``-'' character, while in normal PEG it is not
+allowed.
 
 For example, if we:
 @lisp
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
index 45ed14bb1..4b923220a 100644
--- a/module/ice-9/peg/string-peg.scm
+++ b/module/ice-9/peg/string-peg.scm
@@ -1,6 +1,7 @@
 ;;;; string-peg.scm --- representing PEG grammars as strings
 ;;;;
-;;;; 	Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+;;;; 	Copyright (C) 2010, 2011, Free Software Foundation, Inc.
+;;;; 	Copyright (C) 2024 Ekaitz Zarraga <eka...@elenq.tech>
 ;;;;
 ;;;; This library is free software; you can redistribute it and/or
 ;;;; modify it under the terms of the GNU Lesser General Public
@@ -21,10 +22,15 @@
   #:export (peg-as-peg
             define-peg-string-patterns
             peg-grammar)
+  #:use-module (ice-9 match)
   #:use-module (ice-9 peg using-parsers)
+  #:use-module (srfi srfi-1)
   #:use-module (ice-9 peg codegen)
   #:use-module (ice-9 peg simplify-tree))
 
+;; This module provides support for PEG as described in:
+;;   <https://bford.info/pub/lang/peg.pdf>
+
 ;; Gets the left-hand depth of a list.
 (define (depth lst)
   (if (or (not (list? lst)) (null? lst))
@@ -38,22 +44,58 @@
 
 ;; Grammar for PEGs in PEG grammar.
 (define peg-as-peg
-"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+
-pattern <-- alternative (SLASH sp alternative)*
-alternative <-- ([!&]? sp suffix)+
-suffix <-- primary ([*+?] sp)*
-primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<'
-literal <-- ['] (!['] .)* ['] sp
-charclass <-- LB (!']' (CCrange / CCsingle))* RB sp
-CCrange <-- . '-' .
-CCsingle <-- .
-nonterminal <-- [a-zA-Z0-9-]+ sp
-sp < [ \t\n]*
-SLASH < '/'
-LB < '['
-RB < ']'
+"# Hierarchical syntax
+Grammar <-- Spacing Definition+ EndOfFile
+Definition <-- Identifier LEFTARROW Expression
+
+Expression <-- Sequence (SLASH Sequence)*
+Sequence <-- Prefix*
+Prefix <-- (AND / NOT)? Suffix
+Suffix <-- Primary (QUESTION / STAR / PLUS)?
+Primary <-- Identifier !LEFTARROW
+           / OPEN Expression CLOSE
+           / Literal / Class / DOT
+
+# Lexical syntax
+Identifier <-- IdentStart IdentCont* Spacing
+# NOTE: `-` is an extension
+IdentStart <- [a-zA-Z_] / '-'
+IdentCont <- IdentStart / [0-9]
+
+Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing
+        / DQUOTE (!DQUOTE Char)* DQUOTE Spacing
+Class <-- OPENBRACKET !NOTIN  (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing
+Range <-- Char DASH Char / Char
+Char <-- '\\\\' [nrt'\"\\[\\]\\\\]
+       / '\\\\' [0-7][0-7][0-7]
+       / '\\\\' [0-7][0-7]?
+       / !'\\\\' .
+
+# NOTE: `<--` and `<` are extensions
+LEFTARROW <- ('<--' / '<-' / '<') Spacing
+SQUOTE < [']
+DQUOTE < [\"]
+DASH < '-'
+OPENBRACKET < '['
+CLOSEBRACKET < ']'
+SLASH < '/' Spacing
+AND <-- '&' Spacing
+NOT <-- '!' Spacing
+QUESTION <-- '?' Spacing
+STAR <-- '*' Spacing
+PLUS <-- '+' Spacing
+OPEN < '(' Spacing
+CLOSE < ')' Spacing
+DOT <-- '.' Spacing
+
+Spacing < (Space / Comment)*
+Comment < '#' (!EndOfLine .)* EndOfLine
+Space < ' ' / '\t' / EndOfLine
+EndOfLine < '\\r\\n' / '\\n' / '\\r'
+EndOfFile < !.
 ")
 
+
 (define-syntax define-sexp-parser
   (lambda (x)
     (syntax-case x ()
@@ -63,35 +105,78 @@ RB < ']'
               (syn (wrap-parser-for-users x matchf accumsym #'sym)))
            #`(define sym #,syn))))))
 
-(define-sexp-parser peg-grammar all
-  (+ (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern)))
-(define-sexp-parser peg-pattern all
-  (and peg-alternative
-       (* (and (ignore "/") peg-sp peg-alternative))))
-(define-sexp-parser peg-alternative all
-  (+ (and (? (or "!" "&")) peg-sp peg-suffix)))
-(define-sexp-parser peg-suffix all
-  (and peg-primary (* (and (or "*" "+" "?") peg-sp))))
-(define-sexp-parser peg-primary all
-  (or (and "(" peg-sp peg-pattern ")" peg-sp)
-      (and "." peg-sp)
-      peg-literal
-      peg-charclass
-      (and peg-nonterminal (not-followed-by "<"))))
-(define-sexp-parser peg-literal all
-  (and "'" (* (and (not-followed-by "'") peg-any)) "'" peg-sp))
-(define-sexp-parser peg-charclass all
-  (and (ignore "[")
-       (* (and (not-followed-by "]")
-               (or charclass-range charclass-single)))
-       (ignore "]")
-       peg-sp))
-(define-sexp-parser charclass-range all (and peg-any "-" peg-any))
-(define-sexp-parser charclass-single all peg-any)
-(define-sexp-parser peg-nonterminal all
-  (and (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-")) peg-sp))
-(define-sexp-parser peg-sp none
-  (* (or " " "\t" "\n")))
+(define-sexp-parser Grammar all
+  (and Spacing (+ Definition) EndOfFile))
+(define-sexp-parser Definition all
+  (and Identifier LEFTARROW Expression))
+(define-sexp-parser Expression all
+  (and Sequence (* (and SLASH Sequence))))
+(define-sexp-parser Sequence all
+  (* Prefix))
+(define-sexp-parser Prefix all
+  (and (? (or AND NOT)) Suffix))
+(define-sexp-parser Suffix all
+  (and Primary (? (or QUESTION STAR PLUS))))
+(define-sexp-parser Primary all
+  (or (and Identifier (not-followed-by LEFTARROW))
+      (and OPEN Expression CLOSE)
+      Literal
+      Class
+      DOT))
+(define-sexp-parser Identifier all
+  (and IdentStart (* IdentCont) Spacing))
+(define-sexp-parser IdentStart body
+  (or (or (range #\a #\z) (range #\A #\Z) "_") "-")) ; NOTE: - is an extension
+(define-sexp-parser IdentCont body
+  (or IdentStart (range #\0 #\9)))
+(define-sexp-parser Literal all
+  (or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing)
+      (and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing)))
+(define-sexp-parser Class all
+  (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
+(define-sexp-parser Range all
+  (or (and Char DASH Char) Char))
+(define-sexp-parser Char all
+  (or (and "\\" (or "n" "r" "t" "'" "\"" "[" "]" "\\"))
+      (and "\\" (range #\0 #\7) (range #\0 #\7) (range #\0 #\7))
+      (and "\\" (range #\0 #\7) (? (range #\0 #\7)))
+      (and (not-followed-by "\\") peg-any)))
+(define-sexp-parser LEFTARROW body
+  (and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions
+(define-sexp-parser SLASH none
+  (and "/" Spacing))
+(define-sexp-parser AND all
+  (and "&" Spacing))
+(define-sexp-parser NOT all
+  (and "!" Spacing))
+(define-sexp-parser QUESTION all
+  (and "?" Spacing))
+(define-sexp-parser STAR all
+  (and "*" Spacing))
+(define-sexp-parser PLUS all
+  (and "+" Spacing))
+(define-sexp-parser OPEN none
+  (and "(" Spacing))
+(define-sexp-parser CLOSE none
+  (and ")" Spacing))
+(define-sexp-parser DOT all
+  (and "." Spacing))
+(define-sexp-parser SQUOTE none "'")
+(define-sexp-parser DQUOTE none "\"")
+(define-sexp-parser OPENBRACKET none "[")
+(define-sexp-parser CLOSEBRACKET none "]")
+(define-sexp-parser DASH none "-")
+(define-sexp-parser Spacing none
+  (* (or Space Comment)))
+(define-sexp-parser Comment none
+  (and "#" (* (and (not-followed-by EndOfLine) peg-any)) EndOfLine))
+(define-sexp-parser Space none
+  (or " " "\t" EndOfLine))
+(define-sexp-parser EndOfLine none
+  (or "\r\n" "\n" "\r"))
+(define-sexp-parser EndOfFile none
+  (not-followed-by peg-any))
+
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;; PARSE STRING PEGS
@@ -101,7 +186,7 @@ RB < ']'
 ;; will define all of the nonterminals in the grammar with equivalent
 ;; PEG s-expressions.
 (define (peg-parser str for-syntax)
-  (let ((parsed (match-pattern peg-grammar str)))
+  (let ((parsed (match-pattern Grammar str)))
     (if (not parsed)
         (begin
           ;; (display "Invalid PEG grammar!\n")
@@ -110,11 +195,154 @@ RB < ']'
           (cond
            ((or (not (list? lst)) (null? lst))
             lst)
-           ((eq? (car lst) 'peg-grammar)
-            #`(begin
-                #,@(map (lambda (x) (peg-nonterm->defn x for-syntax))
-                        (context-flatten (lambda (lst) (<= (depth lst) 2))
-                                         (cdr lst))))))))))
+           ((eq? (car lst) 'Grammar)
+            (Grammar->defn lst for-syntax)))))))
+
+;; (Grammar (Definition ...) (Definition ...))
+(define (Grammar->defn lst for-syntax)
+  #`(begin
+      #,@(map (lambda (x) (Definition->defn x for-syntax))
+              (context-flatten (lambda (lst) (<= (depth lst) 1))
+                               (cdr lst)))))
+
+;; (Definition (Identifier "Something") "<-" (Expression ...))
+;;  `-> (define-peg-pattern Something 'all ...)
+(define (Definition->defn lst for-syntax)
+  (match lst
+    (('Definition ('Identifier identifier) grabber expression)
+     #`(define-peg-pattern
+         #,(datum->syntax for-syntax (string->symbol identifier))
+         #,(match grabber
+                  ("<--" (datum->syntax for-syntax 'all))
+                  ("<-"  (datum->syntax for-syntax 'body))
+                  ("<"   (datum->syntax for-syntax 'none)))
+         #,(compressor
+             (Expression->defn expression for-syntax)
+             for-syntax)))))
+
+;; (Expression X)
+;;  `-> (or X)
+;; (Expression X Y)
+;;  `-> (or X Y)
+;; (Expression X (Y Z ...))
+;;  `-> (or X Y Z ...)
+(define (Expression->defn lst for-syntax)
+  (match lst
+    (('Expression seq ...)
+     #`(or #,@(map (lambda (x) (Sequence->defn x for-syntax))
+                   (keyword-flatten '(Sequence) seq))))))
+
+;; (Sequence X)
+;;  `-> (and X)
+;; (Sequence X Y)
+;;  `-> (and X Y)
+;; (Sequence X (Y Z ...))
+;;  `-> (and X Y Z ...)
+(define (Sequence->defn lst for-syntax)
+  (match lst
+    (('Sequence pre ...)
+     #`(and #,@(map (lambda (x) (Prefix->defn x for-syntax))
+                    (keyword-flatten '(Prefix) pre))))))
+
+;; (Prefix (Suffix ...))
+;;  `-> (...)
+;; (Prefix (NOT "!") (Suffix ...))
+;;  `-> (not-followed-by ...)
+;; (Prefix (AND "&") (Suffix ...))
+;;  `-> (followed-by ...)
+(define (Prefix->defn lst for-syntax)
+  (match lst
+    (('Prefix ('AND _) su) #`(followed-by     #,(Suffix->defn su for-syntax)))
+    (('Prefix ('NOT _) su) #`(not-followed-by #,(Suffix->defn su for-syntax)))
+    (('Prefix suffix) (Suffix->defn suffix for-syntax))))
+
+;; (Suffix (Primary ...))
+;;  `-> (...)
+;; (Suffix (Primary ...) (STAR "*"))
+;;  `-> (* ...)
+;; (Suffix (Primary ...) (QUESTION "?"))
+;;  `-> (? ...)
+;; (Suffix (Primary ...) (PLUS "+"))
+;;  `-> (+ ...)
+(define (Suffix->defn lst for-syntax)
+  (match lst
+    (('Suffix prim)               (Primary->defn prim for-syntax))
+    (('Suffix prim ('STAR     _)) #`(* #,(Primary->defn prim for-syntax)))
+    (('Suffix prim ('QUESTION _)) #`(? #,(Primary->defn prim for-syntax)))
+    (('Suffix prim ('PLUS     _)) #`(+ #,(Primary->defn prim for-syntax)))))
+
+
+(define (Primary->defn lst for-syntax)
+  (let ((value (second lst)))
+    (match (car value)
+      ('DOT        #'peg-any)
+      ('Identifier (Identifier->defn value for-syntax))
+      ('Expression (Expression->defn value for-syntax))
+      ('Literal    (Literal->defn value for-syntax))
+      ('Class      (Class->defn value for-syntax)))))
+
+;; (Identifier "hello")
+;;  `-> hello
+(define (Identifier->defn lst for-syntax)
+  (datum->syntax for-syntax (string->symbol (second lst))))
+
+;; (Literal (Char "a") (Char "b") (Char "c"))
+;;  `-> "abc"
+(define (Literal->defn lst for-syntax)
+  (apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst))))
+
+;; TODO: empty Class can happen: `[]`, but what does it represent?
+;; (Class ...)
+;;  `-> (or ...)
+(define (Class->defn lst for-syntax)
+  #`(or #,@(map (lambda (x) (Range->defn x for-syntax))
+                (cdr lst))))
+
+;; For one character:
+;; (Range (Char "a"))
+;;  `-> "a"
+;; Or for a range:
+;; (Range (Char "a") (Char "b"))
+;;  `-> (range #\a #\b)
+(define (Range->defn lst for-syntax)
+  (match lst
+    (('Range ch)
+     (string (Char->defn ch for-syntax)))
+    (('Range range-beginning range-end)
+     #`(range
+         #,(Char->defn range-beginning for-syntax)
+         #,(Char->defn range-end       for-syntax)))))
+
+;; (Char "a")
+;;  `-> #\a
+;; (Char "\\n")
+;;  `-> #\newline
+;; (Char "\\135")
+;;  `-> #\]
+(define (Char->defn lst for-syntax)
+  (let* ((charstr (second lst))
+         (first   (string-ref charstr 0)))
+    (cond
+      ((= 1 (string-length charstr)) first)
+      ((char-numeric? (string-ref charstr 1))
+       (integer->char
+         (reduce + 0
+                 (map
+                   (lambda (x y)
+                     (* (- (char->integer x) (char->integer #\0)) y))
+                   (reverse (string->list charstr 1))
+                   '(1 8 64)))))
+      (else
+        (case (string-ref charstr 1)
+          ((#\n) #\newline)
+          ((#\r) #\return)
+          ((#\t) #\tab)
+          ((#\') #\')
+          ((#\]) #\])
+          ((#\\) #\\)
+          ((#\[) #\[))))))
+
+(define peg-grammar Grammar)
 
 ;; Macro wrapper for PEG-PARSER.  Parses PEG grammars expressed as strings and
 ;; defines all the appropriate nonterminals.
@@ -124,119 +352,6 @@ RB < ']'
       ((_ str)
        (peg-parser (syntax->datum #'str) x)))))
 
-;; lst has format (nonterm grabber pattern), where
-;;   nonterm is a symbol (the name of the nonterminal),
-;;   grabber is a string (either "<", "<-" or "<--"), and
-;;   pattern is the parse of a PEG pattern expressed as as string.
-(define (peg-nonterm->defn lst for-syntax)
-  (let* ((nonterm (car lst))
-         (grabber (cadr lst))
-         (pattern (caddr lst))
-         (nonterm-name (datum->syntax for-syntax
-                                      (string->symbol (cadr nonterm)))))
-    #`(define-peg-pattern #,nonterm-name
-       #,(cond
-          ((string=? grabber "<--") (datum->syntax for-syntax 'all))
-          ((string=? grabber "<-") (datum->syntax for-syntax 'body))
-          (else (datum->syntax for-syntax 'none)))
-       #,(compressor (peg-pattern->defn pattern for-syntax) for-syntax))))
-
-;; lst has format ('peg-pattern ...).
-;; After the context-flatten, (cdr lst) has format
-;;   (('peg-alternative ...) ...), where the outer list is a collection
-;;   of elements from a '/' alternative.
-(define (peg-pattern->defn lst for-syntax)
-  #`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax))
-                (context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
-                                 (cdr lst)))))
-
-;; lst has format ('peg-alternative ...).
-;; After the context-flatten, (cdr lst) has the format
-;;   (item ...), where each item has format either ("!" ...), ("&" ...),
-;;   or ('peg-suffix ...).
-(define (peg-alternative->defn lst for-syntax)
-  #`(and #,@(map (lambda (x) (peg-body->defn x for-syntax))
-                 (context-flatten (lambda (x) (or (string? (car x))
-                                             (eq? (car x) 'peg-suffix)))
-                                  (cdr lst)))))
-
-;; lst has the format either
-;;   ("!" ('peg-suffix ...)), ("&" ('peg-suffix ...)), or
-;;     ('peg-suffix ...).
-(define (peg-body->defn lst for-syntax)
-    (cond
-      ((equal? (car lst) "&")
-       #`(followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
-      ((equal? (car lst) "!")
-       #`(not-followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
-      ((eq? (car lst) 'peg-suffix)
-       (peg-suffix->defn lst for-syntax))
-      (else `(peg-parse-body-fail ,lst))))
-
-;; lst has format ('peg-suffix <peg-primary> (? (/ "*" "?" "+")))
-(define (peg-suffix->defn lst for-syntax)
-  (let ((inner-defn (peg-primary->defn (cadr lst) for-syntax)))
-    (cond
-      ((null? (cddr lst))
-       inner-defn)
-      ((equal? (caddr lst) "*")
-       #`(* #,inner-defn))
-      ((equal? (caddr lst) "?")
-       #`(? #,inner-defn))
-      ((equal? (caddr lst) "+")
-       #`(+ #,inner-defn)))))
-
-;; Parse a primary.
-(define (peg-primary->defn lst for-syntax)
-  (let ((el (cadr lst)))
-  (cond
-   ((list? el)
-    (cond
-     ((eq? (car el) 'peg-literal)
-      (peg-literal->defn el for-syntax))
-     ((eq? (car el) 'peg-charclass)
-      (peg-charclass->defn el for-syntax))
-     ((eq? (car el) 'peg-nonterminal)
-      (datum->syntax for-syntax (string->symbol (cadr el))))))
-   ((string? el)
-    (cond
-     ((equal? el "(")
-      (peg-pattern->defn (caddr lst) for-syntax))
-     ((equal? el ".")
-      (datum->syntax for-syntax 'peg-any))
-     (else (datum->syntax for-syntax
-                          `(peg-parse-any unknown-string ,lst)))))
-   (else (datum->syntax for-syntax
-                        `(peg-parse-any unknown-el ,lst))))))
-
-;; Trims characters off the front and end of STR.
-;; (trim-1chars "'ab'") -> "ab"
-(define (trim-1chars str) (substring str 1 (- (string-length str) 1)))
-
-;; Parses a literal.
-(define (peg-literal->defn lst for-syntax)
-  (datum->syntax for-syntax (trim-1chars (cadr lst))))
-
-;; Parses a charclass.
-(define (peg-charclass->defn lst for-syntax)
-  #`(or
-     #,@(map
-         (lambda (cc)
-           (cond
-            ((eq? (car cc) 'charclass-range)
-             #`(range #,(datum->syntax
-                         for-syntax
-                         (string-ref (cadr cc) 0))
-                      #,(datum->syntax
-                         for-syntax
-                         (string-ref (cadr cc) 2))))
-            ((eq? (car cc) 'charclass-single)
-             (datum->syntax for-syntax (cadr cc)))))
-         (context-flatten
-          (lambda (x) (or (eq? (car x) 'charclass-range)
-                          (eq? (car x) 'charclass-single)))
-          (cdr lst)))))
-
 ;; Compresses a list to save the optimizer work.
 ;; e.g. (or (and a)) -> a
 (define (compressor-core lst)
@@ -263,11 +378,10 @@ RB < ']'
      (let ((string (syntax->datum #'str-stx)))
        (compile-peg-pattern
         (compressor
-         (peg-pattern->defn
-          (peg:tree (match-pattern peg-pattern string)) #'str-stx)
+         (Expression->defn
+          (peg:tree (match-pattern Expression string)) #'str-stx)
          #'str-stx)
         (if (eq? accum 'all) 'body accum))))
      (else (error "Bad embedded PEG string" args))))
 
 (add-peg-compiler! 'peg peg-string-compile)
-
diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index f516571e8..1136c03f1 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -28,17 +28,25 @@
 ;; the nonterminals defined in the PEG parser written with
 ;; S-expressions.
 (define grammar-mapping
-  '((grammar peg-grammar)
-    (pattern peg-pattern)
-    (alternative peg-alternative)
-    (suffix peg-suffix)
-    (primary peg-primary)
-    (literal peg-literal)
-    (charclass peg-charclass)
-    (CCrange charclass-range)
-    (CCsingle charclass-single)
-    (nonterminal peg-nonterminal)
-    (sp peg-sp)))
+  '((Grammar Grammar)
+    (Definition Definition)
+    (Expression Expression)
+    (Sequence Sequence)
+    (Prefix Prefix)
+    (Suffix Suffix)
+    (Primary Primary)
+    (Identifier Identifier)
+    (Literal Literal)
+    (Class Class)
+    (Range Range)
+    (Char Char)
+    (LEFTARROW LEFTARROW)
+    (AND AND)
+    (NOT NOT)
+    (QUESTION QUESTION)
+    (STAR STAR)
+    (PLUS PLUS)
+    (DOT DOT)))
 
 ;; Transforms the nonterminals defined in the PEG parser written as a PEG to the nonterminals defined in the PEG parser written with S-expressions.
 (define (grammar-transform x)
@@ -69,7 +77,7 @@
     (peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg) peg-as-peg)))
     (tree-map
      grammar-transform
-     (peg:tree (match-pattern grammar (@@ (ice-9 peg) peg-as-peg)))))))
+     (peg:tree (match-pattern Grammar (@@ (ice-9 peg) peg-as-peg)))))))
 
 ;; A grammar for pascal-style comments from Wikipedia.
 (define comment-grammar
-- 
2.46.0

Reply via email to