This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new cf7d87a046 TIKA-4747 -- add axml detection (#2865)
cf7d87a046 is described below
commit cf7d87a046f099c8a9b681bbb0c18c3e05c816ae
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jun 4 21:38:58 2026 -0400
TIKA-4747 -- add axml detection (#2865)
---
.../org/apache/tika/mime/tika-mimetypes.xml | 15 +++
.../org/apache/tika/mime/MimeDetectionTest.java | 5 +
.../org/apache/tika/mime/test-android-binary.xml | Bin 0 -> 64 bytes
.../apache/tika/parser/AndroidBinaryXMLTest.java | 122 +++++++++++++++++++++
4 files changed, 142 insertions(+)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index f447bf3d97..34b9d27943 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -352,6 +352,21 @@
<sub-class-of type="application/java-archive"/>
<glob pattern="*.apk"/>
</mime-type>
+  <mime-type type="application/vnd.android.axml">
+    <acronym>AXML</acronym>
+    <_comment>Android Binary XML</_comment>
+    <tika:link>https://developer.android.com/guide/topics/manifest/manifest-intro</tika:link>
+    <!-- Compiled AndroidManifest.xml / res/*.xml inside an APK. The .xml extension would
+         otherwise route it to the XML parser, which fails on the binary header. Signature:
+         RES_XML_TYPE(0x0003)+headerSize(0x0008)=0x00080003 LE, plus RES_STRING_POOL_TYPE
+         (0x0001) at offset 8 (the variable per-file size at offset 4 is skipped). Not a
+         sub-class-of application/xml: must not reach an XML parser. -->
+    <magic priority="50">
+      <match value="0x03000800" type="string" offset="0">
+        <match value="0x0001" type="little16" offset="8"/>
+      </match>
+    </magic>
+  </mime-type>
<mime-type type="application/x-tika-java-enterprise-archive">
<sub-class-of type="application/java-archive"/>
<glob pattern="*.ear"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 76268b5fea..77d4604471 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -85,6 +85,11 @@ public class MimeDetectionTest {
// truncated xml should still be detected as xml, See TIKA-3596
testFile("application/xml", "truncated-utf16-xml.xyz");
+
+ // Android Binary XML (compiled AndroidManifest.xml / res/*.xml inside an APK).
+ // Carries a .xml extension, so magic must win over the *.xml glob and it must
+ // NOT be routed to application/xml / the XML parser. See TIKA-4747.
+ testFile("application/vnd.android.axml", "test-android-binary.xml");
}
@Test
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-android-binary.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-android-binary.xml
new file mode 100644
index 0000000000..d022dee128
Binary files /dev/null and b/tika-core/src/test/resources/org/apache/tika/mime/test-android-binary.xml differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AndroidBinaryXMLTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AndroidBinaryXMLTest.java
new file mode 100644
index 0000000000..41286f218d
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AndroidBinaryXMLTest.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.ByteArrayOutputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Android Binary XML (AXML) is the compiled binary form of AndroidManifest.xml and the
+ * res/*.xml resources packed inside an APK. Those entries keep a .xml extension and live
+ * inside the (zip) APK, so before TIKA-4747 the *.xml glob caused them to be detected as
+ * application/xml and handed to the XML parser, which failed on the binary header with
+ * "Invalid byte 1 of 1-byte UTF-8 sequence". This was a large source of exceptions in
+ * regression runs over APK-heavy corpora.
+ *
+ * <p>Real corpus APKs can't be committed, so this builds an equivalent zip in memory:
+ * two compiled (AXML) entries plus one genuine text-XML entry under assets/ as a control,
+ * and asserts the AXML entries are detected as application/vnd.android.axml and produce no
+ * exception, while the text-XML entry is still application/xml.
+ */
+public class AndroidBinaryXMLTest extends TikaTest {
+
+    private static final String AXML = "application/vnd.android.axml";
+
+    /**
+     * Minimal compiled-AXML header: a RES_XML_TYPE ResChunk_header plus the ResStringPool
+     * chunk real AXML always carries. The magic matches 0x00080003 (LE) at offset 0 and the
+     * string-pool type 0x0001 at offset 8, so both must be present.
+     */
+    private static byte[] axmlBytes() {
+        ByteBuffer bb = ByteBuffer.allocate(64).order(ByteOrder.LITTLE_ENDIAN);
+        bb.putShort((short) 0x0003); // RES_XML_TYPE
+        bb.putShort((short) 0x0008); // headerSize
+        bb.putInt(64); // total chunk size == file length (skipped by magic)
+        bb.putShort((short) 0x0001); // RES_STRING_POOL_TYPE (checked at offset 8)
+        bb.putShort((short) 0x001C); // string-pool headerSize
+        bb.putInt(0x00000038); // string-pool chunk size == 64 - 8 (spans offset 8..EOF)
+        // remaining bytes (string/style counts, flags, offsets) left zero
+        return bb.array();
+    }
+
+    private static byte[] zipWith(String[] names, byte[][] contents) throws Exception {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        try (ZipOutputStream zos = new ZipOutputStream(bos)) {
+            for (int i = 0; i < names.length; i++) {
+                zos.putNextEntry(new ZipEntry(names[i]));
+                zos.write(contents[i]);
+                zos.closeEntry();
+            }
+        }
+        return bos.toByteArray();
+    }
+
+    @Test
+    public void testAxmlInsideZipNotRoutedToXmlParser() throws Exception {
+        byte[] textXml =
+                "<?xml version=\"1.0\"?><root><city>example</city></root>".getBytes(StandardCharsets.UTF_8);
+        byte[] zip = zipWith(
+                new String[] {"AndroidManifest.xml", "res/anim/anim0to1.xml", "assets/province_data.xml"},
+                new byte[][] {axmlBytes(), axmlBytes(), textXml});
+
+        List<Metadata> metadataList;
+        try (TikaInputStream tis = TikaInputStream.get(zip)) {
+            metadataList = getRecursiveMetadata(tis, true);
+        }
+
+        Metadata manifest = byPathSuffix(metadataList, "AndroidManifest.xml");
+        Metadata resAnim = byPathSuffix(metadataList, "anim0to1.xml");
+        Metadata assetXml = byPathSuffix(metadataList, "province_data.xml");
+
+        // The two compiled AXML entries: detected as AXML, NOT routed to the XML parser.
+        assertEquals(AXML, manifest.get(Metadata.CONTENT_TYPE));
+        assertEquals(AXML, resAnim.get(Metadata.CONTENT_TYPE));
+        assertNull(manifest.get(TikaCoreProperties.EMBEDDED_EXCEPTION),
+                "AXML manifest must not throw a parse exception");
+        assertNull(resAnim.get(TikaCoreProperties.EMBEDDED_EXCEPTION),
+                "AXML resource must not throw a parse exception");
+
+        // Control: a genuine text XML under assets/ is still detected and parsed as XML.
+        assertEquals("application/xml", assetXml.get(Metadata.CONTENT_TYPE));
+        assertNull(assetXml.get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+    }
+
+    private static Metadata byPathSuffix(List<Metadata> metadataList, String suffix) {
+        for (Metadata m : metadataList) {
+            String path = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+            if (path != null && path.endsWith(suffix)) {
+                return m;
+            }
+        }
+        throw new AssertionError("No embedded entry found ending with: " + suffix);
+    }
+}