I'm having a problem using lxml.etree to make a tree-building parser that validates. In my test code (below), invalid XML is detected and an error is raised only when the `target=ET.TreeBuilder(),` line in the parser setup is commented out.

The validation error looks as expected:
$ python tlxml.py invalid.rml
re.compile('^.*(?:\\W|\\b)(?P<fn>dynamic_rml\\.dtd|rml\\.dtd|rml_0_2\\.dtd|rml_0_3\\.dtd|rml_1_0\\.dtd)$',
 re.MULTILINE)
Resolving url='../rml.dtd' context=<lxml.etree._ParserContext object at 
0x7f66103273c0> dtdPath='rml.dtd'
Traceback (most recent call last):
  File "/home/robin/devel/reportlab/REPOS/rlextra/tmp/tlxml.py", line 78, in 
<module>
    tree = ET.parse(sys.argv[1],parser)
  File "src/lxml/etree.pyx", line 3521, in lxml.etree.parse
  File "src/lxml/parser.pxi", line 1859, in lxml.etree._parseDocument
  File "src/lxml/parser.pxi", line 1885, in lxml.etree._parseDocumentFromURL
  File "src/lxml/parser.pxi", line 1789, in lxml.etree._parseDocFromFile
  File "src/lxml/parser.pxi", line 1177, in 
lxml.etree._BaseParser._parseDocFromFile
  File "src/lxml/parser.pxi", line 615, in 
lxml.etree._ParserContext._handleParseResultDoc
  File "src/lxml/parser.pxi", line 725, in lxml.etree._handleParseResult
  File "src/lxml/parser.pxi", line 654, in lxml.etree._raiseParseError
  File "invalid.rml", line 23
lxml.etree.XMLSyntaxError: No declaration for attribute x of element place1, 
line 23, column 55

When I have target=ET.TreeBuilder() active, the validation does not work: the tree is formed and passed to the primitive tuple-tree builder, so the output looks like
$ python tlxml.py invalid.rml
Resolving url='../rml.dtd' context=<lxml.etree._TargetParserContext object at 0x7f73d7b159c0> dtdPath='rml.dtd'
('document',
 {'filename': 'test_000_simple.pdf', 'invariant': '1'},
 ['\n\n',
  ('stylesheet',
> ........
       None,
       44),
      '\n    \t\t\n    \t\t'],
     40),
    '\n'],
   35),
  '\n\n'],
 2)

If I use the standard example EchoTarget the validation also fails. So I assume that the target argument makes the validation fail. Is there a way to get validation to work with a target?

The code is
######################################################################################################
from pprint import pprint
from lxml import etree as ET
import sys, os, re
from rlextra.rml2pdf.rml2pdf import CompatibleDTDNames as rmlDTDPat
# Rebind rmlDTDPat from the imported collection of DTD file names to a
# compiled pattern: it matches a line ending in one of those names (escaped
# literally) and captures the name as group 'fn'.
rmlDTDPat = re.compile(
    r'^.*(?:\W|\b)(?P<fn>%s)$' % '|'.join(map(re.escape, rmlDTDPat)),
    re.MULTILINE,
)

class TT:
    """Turn an element into a nested ``(tag, attrib, content, sourceline)`` tuple.

    Instances are callable; calling one on an element recurses through the
    element's children via :meth:`content`.
    """

    def __init__(self):
        pass

    def __call__(self, e):
        """Return the tuple form of element *e*.

        An empty attribute mapping is reported as ``None``.
        """
        attrs = e.attrib
        if not attrs:
            attrs = None
        return (e.tag, attrs, self.content(e), e.sourceline)

    def content(self, e):
        """Return the mixed content of *e*, or ``None`` if it has none.

        The result is a list holding, in document order, the element's
        leading text, the tuple form of each child, and each child's tail
        text; ``None`` text/tail values are skipped.
        """
        text = e.text
        # Guard clause: no children and no text means there is no content.
        if text is None and len(e) == 0:
            return None

        items = []
        if text is not None:
            items.append(text)
        for child in e:
            items.append(self(child))
            tail = child.tail
            if tail is not None:
                items.append(tail)
        return items

class RMLDTDResolver(ET.Resolver):
    """Resolver that serves the RML DTDs shipped with ``rlextra.rml2pdf``.

    Any URL matched by the module-level ``rmlDTDPat`` is answered from an
    in-memory copy of ``rml.dtd`` or ``dynamic_rml.dtd``; every other URL is
    declined by returning ``None``.
    """
    # Cache of DTD text keyed by file name; filled on the first matching URL.
    __dtds = None

    def resolve(self, url, id, context):
        """Return the DTD text for *url*, or ``None`` if *url* is not an RML DTD.

        :param url: system URL of the entity being resolved.
        :param id: public id of the entity (unused).
        :param context: opaque parser context handed on to ``resolve_string``.
        :raises OSError: if a DTD file cannot be read on first use.
        """
        m = rmlDTDPat.match(url)
        if not m:
            return None

        if self.__dtds is None:
            from rlextra import rml2pdf
            dtdDir = os.path.dirname(rml2pdf.__file__)
            # Fill a local dict and publish it only once complete, so a
            # failed read does not leave a half-populated cache behind.
            dtds = {}
            for fn in ('rml.dtd','dynamic_rml.dtd'):
                with open(os.path.join(dtdDir,fn),'r') as f:
                    dtds[fn] = f.read()
            self.__dtds = dtds

        fn = m.group('fn')
        # The cache is keyed by file name, so the non-rml fallback has to be
        # 'dynamic_rml.dtd' (the key that is actually loaded above).
        dtdPath = 'rml.dtd' if fn.startswith('rml') else 'dynamic_rml.dtd'
        print(f"Resolving url={url!r} context={context!r} {dtdPath=}")
        return self.resolve_string(
                self.__dtds[dtdPath],
                context,
                )

# Build a DTD-validating parser, register the RML DTD resolver, parse the file
# named on the command line and pretty-print its tuple form.
parser = ET.XMLParser(
                    load_dtd=True,
                    dtd_validation=True,
                    attribute_defaults=True,
                    no_network=True,
                    remove_comments=True,
                    remove_pis=True,
                    strip_cdata=True,
                    resolve_entities=True,
                    target=ET.TreeBuilder(),   #if commented the parser validates
                    )
parser.resolvers.add(RMLDTDResolver())
tree = ET.parse(sys.argv[1],parser)
# With a target, ET.parse returns whatever the target's close() returns (the
# root element here); without one it returns an ElementTree, which has no
# .tag, so unwrap it to the root element before converting.
root = tree.getroot() if hasattr(tree, 'getroot') else tree
pprint(TT()(root))
######################################################################################################
--
https://mail.python.org/mailman/listinfo/python-list

Reply via email to