writerfilter/documentation/ooxml/model.rng | 473 +++++++++++++++++++++++++++++ writerfilter/documentation/ooxml/model.xml | 42 -- 2 files changed, 473 insertions(+), 42 deletions(-)
New commits: commit 59a68fe4ad8ca32fb016e4f1955ef6c18bcd3044 Author: Miklos Vajna <vmik...@collabora.co.uk> Date: Thu Aug 14 10:40:05 2014 +0200 Add rng schema for model.xml Change-Id: I1b75c5c42a131c7994868ea3261120c6a5b7650e diff --git a/writerfilter/documentation/ooxml/model.rng b/writerfilter/documentation/ooxml/model.rng new file mode 100644 index 0000000..d21045b --- /dev/null +++ b/writerfilter/documentation/ooxml/model.rng @@ -0,0 +1,473 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * +--> +<!-- +This file is both a relax-ng schema for writerfilter/source/ooxml/model.xml and +documentation for that file. The schema has two parts: + +- first part: a subset of the relax-ng grammar to define *what* we expect as + the input in a DOCX file +- second part: additional annotation on top of that to define *how* to handle + that expected input +--> +<grammar xmlns="http://relaxng.org/ns/structure/1.0"> + <!-- + First part: a subset of the relax-ng XML markup. + + The order of elements in this part follows a bottom-up approach. + --> + + <!-- Basic building blocks: element, attribute and their contents. --> + + <!-- + Describes an XML element. + + Example: + + <element name="charset"> + <ref name="CT_Charset"/> + </element> + --> + <define name="element-element"> + <element name="element" ns="http://relaxng.org/ns/structure/1.0"> + <optional> + <attribute name="name"/> + </optional> + <oneOrMore> + <choice> + <ref name="attribute-element"/> + <ref name="data-element"/> + <ref name="ref-element"/> + <ref name="text-element"/> + </choice> + </oneOrMore> + </element> + </define> + + <!-- + Describes an attribute. 
+ + Example: + + <attribute name="name"> + <text/> + </attribute> + --> + <define name="attribute-element"> + <element name="attribute" ns="http://relaxng.org/ns/structure/1.0"> + <optional> + <attribute name="name"/> + </optional> + <zeroOrMore> + <choice> + <ref name="data-element"/> + <ref name="ref-element"/> + <ref name="text-element"/> + </choice> + </zeroOrMore> + </element> + </define> + + <!-- + Describes the type of the data contained in an attribute. Possible values: + boolean, integer or string. See also <text>. + --> + <define name="data-element"> + <element name="data" ns="http://relaxng.org/ns/structure/1.0"> + <attribute name="type"/> + </element> + </define> + + <!-- + Describes that the data used inside the parent (element or attribute) is a + string. It is just a short-hand for <data type="string"/>. + --> + <define name="text-element"> + <element name="text" ns="http://relaxng.org/ns/structure/1.0"> + <empty/> + </element> + </define> + + <!-- + Describes an enumeration element: a possible value for an attribute. + --> + <define name="value-element"> + <element name="value" ns="http://relaxng.org/ns/structure/1.0"> + <text/> + </element> + </define> + + <!-- + This element is ignored during parsing, it just helps readability. + + Example: + + <choice> + <value>true</value> + <value>false</value> + </choice> + --> + <define name="choice-element"> + <element name="choice" ns="http://relaxng.org/ns/structure/1.0"> + <oneOrMore> + <choice> + <ref name="data-element"/> + <ref name="element-element"/> + <ref name="ref-element"/> + <ref name="text-element"/> + <ref name="value-element"/> + </choice> + </oneOrMore> + </element> + </define> + + <!-- Grouping elements: define and grammar. --> + + <!-- + A define is a named definition of its contents, so that multiple <ref> elements + can refer to it, to avoid copy&paste. OOXML named (complex and simple) types + are described using defines. 
+ --> + <define name="define-element"> + <element name="define" ns="http://relaxng.org/ns/structure/1.0"> + <attribute name="name"/> + <oneOrMore> + <choice> + <ref name="choice-element"/> + <ref name="attribute-element"/> + <ref name="element-element"/> + <ref name="data-element"/> + <ref name="ref-element"/> + <empty/> + </choice> + </oneOrMore> + </element> + </define> + + <!-- + A reference to a define. + --> + <define name="ref-element"> + <element name="ref" ns="http://relaxng.org/ns/structure/1.0"> + <attribute name="name"/> + </element> + </define> + + <!-- + A grammar is a set of defines, one grammar is equivalent to one .xsd file + from the OOXML spec. + --> + <define name="grammar-element"> + <element name="grammar" ns="http://relaxng.org/ns/structure/1.0"> + <attribute name="ns"/> + <optional> + <attribute name="datatypeLibrary"/> + </optional> + <optional> + <attribute name="attributeFormDefault"/> + </optional> + <zeroOrMore> + <ref name="include-element"/> + </zeroOrMore> + <oneOrMore> + <ref name="define-element"/> + </oneOrMore> + </element> + </define> + + <!-- + Controls the resolution of <ref> elements. The order is: + + - the current grammar + - included grammars, if there are any + - the first define in the whole model + --> + <define name="include-element"> + <element name="include" ns="http://relaxng.org/ns/structure/1.0"> + <attribute name="href"/> + </element> + </define> + + <!-- + Second part: custom markup, building on top of the first one. + + The order of elements in this part follows a top-down approach. + + The output of the code generated from these elements is a token stream. There + are two types of tokens: SPRM tokens and attribute ones. SPRM refers to + Single PRoperty Modifier, in this context it means a token that contains other + tokens. It's used to represent an XML element. 
That means that SPRM tokens + can contain other SPRM tokens, and also attribute tokens, while attribute + tokens only contain simple types (boolean, integer, string). + + More terminology: the types in the OOXML schema have two typical prefixes: + + - CT_something: complex type, used to describe an XML element + - ST_something: simple type, used to describe the contents of an attribute + + For tokens the following abbreviations are used: + + - NS_something: namespace + - LN_something: local name + --> + + <!-- + The model element is the toplevel container for the XML element / + attribute mapping definition. It contains namespace aliases, direct token + definitions and mapping definitions for each namespace. + --> + <define name="model-element"> + <element name="model"> + <oneOrMore> + <ref name="namespace-alias-element"/> + </oneOrMore> + <oneOrMore> + <ref name="token-element"/> + </oneOrMore> + <oneOrMore> + <ref name="namespace-element"/> + </oneOrMore> + </element> + </define> + + <!-- + A namespace-alias element defines an alias for a URI. Multiple URIs + can have the same alias, that's how both strict and transitional OOXML are + supported by the same tokenizer. + --> + <define name="namespace-alias-element"> + <element name="namespace-alias"> + <!-- The URI of the namespace, e.g. http://schemas.openxmlformats.org/wordprocessingml/2006/main --> + <attribute name="name"/> + <!-- The alias of the namespace, e.g. w14 --> + <attribute name="alias"/> + </element> + </define> + + <!-- + A token element can explicitly define a token. This allows generating + such a token in the tokenizers and handling it in the domain mapper. Ideally + tokens are *not* defined this way, they are mapped to an XML element or + attribute from the OOXML specification. + --> + <define name="token-element"> + <element name="token"> + <!-- + The token name must be ooxml:something, then in C++ it'll be the + NS_ooxml::LN_something ("OOXML namespace, something local name") + constant. 
+ --> + <attribute name="tokenid"/> + </element> + </define> + + <!-- + A namespace element is a container for a subset of the relax-ng grammar + of a part of the OOXML specification. It also contains the resource + definitions, which specify how XML elements and attributes are mapped to + tokens. + --> + <define name="namespace-element"> + <element name="namespace"> + <attribute name="name"/> + <optional> + <attribute name="file"/> + </optional> + <optional> + <attribute name="url"/> + </optional> + <zeroOrMore> + <ref name="start-element"/> + </zeroOrMore> + <ref name="grammar-element"/> + <zeroOrMore> + <ref name="resource-element"/> + </zeroOrMore> + </element> + </define> + + <!-- + A start element is similar to the relax-ng start element, but this one has a + name attribute to refer to a define, while the relax-ng one has a ref child + element to do the same. + --> + <define name="start-element"> + <element name="start"> + <attribute name="name"/> + </element> + </define> + + <!-- + A resource element always matches (by its name attribute) a define from the + grammar of the namespace. It describes how that (simple or complex) type is + parsed during import. + + Example: + + <resource name="CT_Font" resource="Properties"> + ... + </resource> + + or + + <resource name="CT_OMathPara" resource="Stream"/> + --> + <define name="resource-element"> + <element name="resource"> + <!-- There should be a define element with the same name attribute. --> + <attribute name="name"/> + <!-- + This means the resource element will be handled by the + OOXMLFastContextHandler<resource> class. + + The two most important resources: + + - Properties: this maps elements/attributes to SPRM/attribute tokens + - Stream: If the element itself does not require any special handling, + but the subelements are interesting, use this resource. If no + explicit resource element is available, then a null context will be + created and the element and all its subelements will be ignored. 
+ --> + <attribute name="resource"/> + <optional> + <attribute name="tokenid"/> + </optional> + <zeroOrMore> + <choice> + <ref name="resource-element-element"/> + <ref name="resource-attribute-element"/> + <ref name="resource-value-element"/> + <ref name="resource-action-element"/> + </choice> + </zeroOrMore> + </element> + </define> + + <!-- + The <element> child of a <resource> defines what element name will be handled + via what token. + + Example: + + <element name="charset" tokenid="ooxml:CT_Font_charset"/> + + Means the <charset> element will be handled in the sprm() function of the handler + class as a NS_ooxml::LN_CT_Font_charset case. (sprm() is a logging wrapper + around lcl_sprm(), which is the real implementation.) + --> + <define name="resource-element-element"> + <element name="element"> + <attribute name="name"/> + <attribute name="tokenid"/> + </element> + </define> + + <!-- + The <attribute> child of a <resource> defines what attribute name will be + handled via what token. + + Example: + + <attribute name="name" tokenid="ooxml:CT_Font_name"/> + + Means the <name> attribute will be handled in the attribute() (real + implementation in lcl_attribute()) function of the handler class as a + NS_ooxml::LN_CT_Font_name case. + --> + <define name="resource-attribute-element"> + <element name="attribute"> + <attribute name="name"/> + <optional> + <attribute name="tokenid"/> + </optional> + <optional> + <attribute name="action"/> + </optional> + </element> + </define> + + <!-- + A <value> inside a <resource> defines how to map the string data of a value + to a token. The tokenid attribute defines the token name, the text of the + element defines the string. This is useful in case the value of an attribute + is a choice from a predefined list. 
+ --> + <define name="resource-value-element"> + <element name="value"> + <attribute name="tokenid"/> + <text/> + </element> + </define> + + <!-- + An <action> inside a <resource> can perform additional actions in the + following situations: + + - start of the element + - end of the element + - character data of the element + + Example: + + <resource name="CT_TxbxContent" resource="Stream"> + <action name="start" action="startTxbxContent"/> + <action name="end" action="endTxbxContent"/> + </resource> + + That means that when: + + - <txbxContent> starts, OOXMLFastContextHandler::startTxbxContent() will be called + - <txbxContent> ends, OOXMLFastContextHandler::endTxbxContent() will be called + --> + <define name="resource-action-element"> + <element name="action"> + <attribute name="name"/> + <attribute name="action"/> + <optional> + <attribute name="tokenid"/> + </optional> + <optional> + <attribute name="sendtokenid"/> + </optional> + <optional> + <ref name="resource-action-cond-element"/> + </optional> + </element> + </define> + + <!-- + Some actions take parameters, which can be defined by the <cond> element. + + Example: + + <resource name="CT_FldChar" resource="Stream"> + <action name="start" action="fieldstart"> + <cond tokenid="ooxml:CT_FldChar_fldCharType" value="ooxml:Value_ST_FldCharType_begin"/> + </action> + </resource> + + That means: + + - if the <fldChar> starts with an fldCharType attribute being "begin" + - then perform the "fieldstart" action. + --> + <define name="resource-action-cond-element"> + <element name="cond"> + <attribute name="tokenid"/> + <attribute name="value"/> + </element> + </define> + + <!-- The entry point of the schema. 
--> + <start> + <ref name="model-element"/> + </start> +</grammar> +<!-- vim: ft=xml shiftwidth=2 softtabstop=2 expandtab: +--> diff --git a/writerfilter/documentation/ooxml/model.xml b/writerfilter/documentation/ooxml/model.xml deleted file mode 100644 index 75ee217..0000000 --- a/writerfilter/documentation/ooxml/model.xml +++ /dev/null @@ -1,42 +0,0 @@ -These are various notes about ooxml/model.xml and related stuff. They have been -mostly found out by trial and error, because existing documentation is poor -or nonexistent, so I don't actually understand writerfilter that much (and -think nothing nice about it) and don't think it (both writerfilter and my -understanding/liking of it) could be noticeably improved. In an ideal world -it should be nuked from orbit and started again from scratch with a saner design. - -- -CT_xxx (Complex Type) - it seems to be used for XML elements -ST_xxx (Simple Type) - it seems to be used for XML attributes - -- SPRM (the Sprm structure specified a modification to a property of a -character, paragraph, table, or section in the binary .doc format) - in -the context of OOXML it seems to pretty much mean "XML element" - -- - -Format of the <resource> tag (shortened CT_Font example): - - <resource name="CT_Font" resource="Properties" tag="font"> - <element name="charset" tokenid="ooxml:CT_Font_charset"/> - <attribute name="name" tokenid="ooxml:CT_Font_name"/> - </resource> - -CT_Font is the type that is defined how it will be handled. 
-resource="XXX" means it will be handled by OOXMLFastContextHandlerXXX class -no idea what tag="font" means or if it matters -<element> defines the <w:charset> subelement will be handled in sprm() function - as NS_ooxml::LN_CT_Font_charset case -<attribute> defines the <w:name> attribute of the element will be handled - in attribute() function as NS_ooxml::LN_CT_Font_name case -in both cases sprm()/attribute() may mean actually any of the various strange - naming ideas like lcl_sprm() - -- -If an element (and its subelements) are not processed but the element itself -does not require any special handling, make sure something like the below is present. -Otherwise null context will be created and the element and all its subelements -will be ignored. - -<resource name="CT_OMathPara" resource="Stream" tag="math"/> - _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits