This is an automated email from the ASF dual-hosted git repository.
desruisseaux pushed a commit to branch geoapi-4.0
in repository https://gitbox.apache.org/repos/asf/sis.git
The following commit(s) were added to refs/heads/geoapi-4.0 by this push:
new cf568b124f Ignore zero-width spaces and other ignorable characters in CRS identifiers.
cf568b124f is described below
commit cf568b124f9499b2019ee4352ce9e1b86a8f850b
Author: Martin Desruisseaux <[email protected]>
AuthorDate: Sun Jan 21 21:57:38 2024 +0100
Ignore zero-width spaces and other ignorable characters in CRS identifiers.
https://issues.apache.org/jira/browse/SIS-490
---
.../sis/metadata/iso/citation/Citations.java | 64 ++-------------------
.../factory/GeodeticAuthorityFactory.java | 8 ++-
.../factory/MultiAuthoritiesFactory.java | 2 +-
.../sis/referencing/factory/package-info.java | 2 +-
.../main/org/apache/sis/util/CharSequences.java | 66 ++++++++++++++++++++++
.../apache/sis/util/internal/DefinitionURI.java | 3 +-
6 files changed, 79 insertions(+), 66 deletions(-)
diff --git a/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java b/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java
index 8b6e802a88..956b3c2545 100644
--- a/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java
+++ b/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java
@@ -850,17 +850,8 @@ public final class Citations extends Static {
* Those characters are illegal in XML identifiers, and should therefore
be removed if the Unicode identifier
* may also be used as XML identifier.
*
- * <p>If non-null, the result is suitable for use as a XML identifier
except for a few uncommon characters.</p>
- *
- * <h4>Compatibility note</h4>
- * the following characters are invalid in XML identifiers. However, since
they are valid in Unicode identifiers,
- * they could be included in the string returned by this method:
- * <ul>
- * <li>{@code µ}</li>
- * <li>{@code ª} (feminine ordinal indicator)</li>
- * <li>{@code º} (masculine ordinal indicator)</li>
- * <li>{@code ⁔}</li>
- * </ul>
+ * <p>If non-null, the result is suitable for use as a XML identifier
except for a few uncommon characters.
+ * See {@link CharSequences#trimIgnorables(CharSequence)} for more
information.</p>
*
* @param citation the citation for which to infer the code space, or
{@code null}.
* @return a non-empty code space for the given citation without leading
or trailing whitespaces,
@@ -872,55 +863,8 @@ public final class Citations extends Static {
if (citation instanceof IdentifierSpace<?>) {
return ((IdentifierSpace<?>) citation).getName();
} else {
- return
removeIgnorableCharacters(Identifiers.getIdentifier(citation, true));
- }
- }
-
- /**
- * Removes characters that are ignorable according Unicode specification.
- *
- * @param identifier the character sequence from which to remove
ignorable characters, or {@code null}.
- * @return a character sequence with ignorable character removed. May be
the same instance as the given argument.
- */
- private static String removeIgnorableCharacters(final String identifier) {
- if (identifier != null) {
- /*
- * First perform a quick check to see if there is any ignorable
characters.
- * We make this check because those characters are valid according
Unicode
- * but not according XML. However, there is usually no such
characters, so
- * we will avoid the StringBuilder creation in the vast majority
of times.
- *
- * Note that 'µ' and its friends are not ignorable, so we do not
remove them.
- * This method is aimed for "getUnicodeIdentifier", not
"getXmlIdentifier".
- */
- final int length = identifier.length();
- for (int i=0; i<length;) {
- int c = identifier.codePointAt(i);
- int n = Character.charCount(c);
- if (Character.isIdentifierIgnorable(c)) {
- /*
- * Found an ignorable character. Create the buffer and
copy non-ignorable characters.
- * Following algorithm is inefficient, since we fill the
buffer character-by-character
- * (a more efficient approach would be to perform bulk
appends). However, we presume
- * that this block will be rarely executed, so it is not
worth to optimize it.
- */
- final StringBuilder buffer = new StringBuilder(length -
n).append(identifier, 0, i);
- while ((i += n) < length) {
- c = identifier.codePointAt(i);
- n = Character.charCount(c);
- if (!Character.isIdentifierIgnorable(c)) {
- buffer.appendCodePoint(c);
- }
- }
- /*
- * No need to verify if the buffer is empty, because
ignorable
- * characters are not legal Unicode identifier start.
- */
- return buffer.toString();
- }
- i += n;
- }
+ CharSequence cs =
CharSequences.trimIgnorables(Identifiers.getIdentifier(citation, true));
+ return (cs != null) ? cs.toString() : null;
}
- return identifier;
}
}
diff --git a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java
index 810632fb1e..0474fa9551 100644
--- a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java
+++ b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java
@@ -64,7 +64,7 @@ import org.apache.sis.util.resources.Errors;
*
* @author Martin Desruisseaux (IRD, Geomatys)
* @author Johann Sorel (Geomatys)
- * @version 1.4
+ * @version 1.5
* @since 0.7
*/
public abstract class GeodeticAuthorityFactory extends AbstractFactory
implements AuthorityFactory {
@@ -1265,7 +1265,8 @@ public abstract class GeodeticAuthorityFactory extends AbstractFactory implement
/**
* Trims the namespace, if present. For example if this factory is an EPSG
authority factory
* and the specified code start with the {@code "EPSG:"} prefix, then the
prefix is removed.
- * Otherwise, the string is returned unchanged (except for leading and
trailing spaces).
+ * Otherwise, the string is returned unchanged except for leading and
trailing spaces which
+ * are removed, together with {@link Character#isIdentifierIgnorable(int)
ignorable characters}.
*
* @param code the code to trim.
* @return the code with the namespace part removed if that part matched
one of the values given by
@@ -1273,7 +1274,8 @@ public abstract class GeodeticAuthorityFactory extends AbstractFactory implement
*
* @since 0.8
*/
- protected final String trimNamespace(final String code) {
+ protected final String trimNamespace(String code) {
+ code = CharSequences.trimIgnorables(code).toString();
int s = code.indexOf(Constants.DEFAULT_SEPARATOR);
if (s >= 0) {
final int end = CharSequences.skipTrailingWhitespaces(code, 0,
s);
diff --git a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java
index e908bd7d4d..9f82fd9fcd 100644
--- a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java
+++ b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java
@@ -145,7 +145,7 @@ import org.apache.sis.util.collection.BackingStoreException;
* do not need to be thread-safe. See constructor Javadoc for more information.
*
* @author Martin Desruisseaux (IRD, Geomatys)
- * @version 1.4
+ * @version 1.5
*
* @see org.apache.sis.referencing.CRS#getAuthorityFactory(String)
*
diff --git a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java
index 373041442d..55d4937b4e 100644
--- a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java
+++ b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java
@@ -56,7 +56,7 @@
* </table>
*
* @author Martin Desruisseaux (IRD, Geomatys)
- * @version 1.4
+ * @version 1.5
* @since 0.6
*/
package org.apache.sis.referencing.factory;
diff --git a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java
index 699dda468a..7a361609fc 100644
--- a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java
+++ b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java
@@ -992,6 +992,72 @@ search: for (; fromIndex <= toIndex; fromIndex++) {
return text;
}
+ /**
+ * Returns a text with ignorable characters in Unicode identifier removed.
While valid in identifiers,
+ * those {@linkplain Character#isIdentifierIgnorable(int) ignorable
characters} are often non-displayed.
+ * An example of ignorable character is the zero-width space.
+ *
+ * <h4>Relationship with XML</h4>
+ * Unlike Unicode identifiers, ignorable characters are invalid in XML
identifiers.
+ * This restriction avoids, for example, homograph attacks in domain name.
+ * So this method can be used for converting an Unicode identifier to an
XML identifier,
+ * except for the characters listed below. Those characters are
non-ignorable
+ * (so not removed by this method), but nevertheless invalid in XML
identifiers.
+ * <ul>
+ * <li>{@code µ} (U+00B5) — micro</li>
+ * <li>{@code ª} (U+00AA) — feminine ordinal indicator</li>
+ * <li>{@code º} (U+00BA) — masculine ordinal indicator</li>
+ * <li>{@code ⁔} (U+2054) — inverted undertie</li>
+ * </ul>
+ *
+ * @param text the text from which to remove ignorable characters, or
{@code null}.
+ * @return text with ignorable characters removed, or {@code null} if the
given text was null.
+ *
+ * @see Character#isIdentifierIgnorable(int)
+ *
+ * @since 1.5
+ */
+ public static CharSequence trimIgnorables(final CharSequence text) {
+ if (text != null) {
+ /*
+ * First perform a quick check to see if there is any ignorable
characters.
+ * We make this check because there is usually no such characters,
+ * so we will avoid the StringBuilder creation in the vast
majority of times.
+ *
+ * Note that 'µ' and its friends are not ignorable, so we do not
remove them.
+ * This method is aimed for `getUnicodeIdentifier`, not
`getXmlIdentifier`.
+ */
+ final int length = text.length();
+ for (int i=0; i<length;) {
+ int c = codePointAt(text, i);
+ int n = Character.charCount(c);
+ if (Character.isIdentifierIgnorable(c)) {
+ /*
+ * Found an ignorable character. Create the buffer and
copy non-ignorable characters.
+ * Following algorithm is inefficient, since we fill the
buffer character-by-character
+ * (a more efficient approach would be to perform bulk
appends). However, we presume
+ * that this block will be rarely executed, so it is not
worth to optimize it.
+ */
+ final StringBuilder buffer = new StringBuilder(length -
n).append(text, 0, i);
+ while ((i += n) < length) {
+ c = codePointAt(text, i);
+ n = Character.charCount(c);
+ if (!Character.isIdentifierIgnorable(c)) {
+ buffer.appendCodePoint(c);
+ }
+ }
+ /*
+ * No need to verify if the buffer is empty, because
ignorable
+ * characters are not legal Unicode identifier start.
+ */
+ return buffer.toString();
+ }
+ i += n;
+ }
+ }
+ return text;
+ }
+
/**
* Trims the fractional part of the given formatted number, provided that
it doesn't change
* the value. This method assumes that the number is formatted in the US
locale, typically
diff --git a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java
index e57f627865..245dec83e6 100644
--- a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java
+++ b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java
@@ -271,7 +271,8 @@ public final class DefinitionURI {
* @param uri the URI to parse.
* @return the parse result, or {@code null} if the given URI is not
recognized.
*/
- public static DefinitionURI parse(final String uri) {
+ public static DefinitionURI parse(String uri) {
+ uri = CharSequences.trimIgnorables(uri).toString();
return parse(uri, false, -1, uri.length());
}