This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new c136fa0460 TIKA-4727 -- Small tweaks: improve embedded file name
handling and add pagination in hslf (#2812)
c136fa0460 is described below
commit c136fa04604d73c327862715923e794180b0bc9a
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 14 12:37:53 2026 -0400
TIKA-4727 -- Small tweaks: improve embedded file name handling and add
pagination in hslf (#2812)
---
.../main/java/org/apache/tika/metadata/Office.java | 30 +++++
.../org/apache/tika/metadata/PageAnchoring.java | 139 +++++++++++++++++++++
.../org/apache/tika/metadata/TikaPagedText.java | 39 +++++-
.../apache/tika/metadata/TestPageAnchoring.java | 118 +++++++++++++++++
.../parser/microsoft/AbstractPOIFSExtractor.java | 22 +++-
.../tika/parser/microsoft/ExcelExtractor.java | 115 +++++++++++++++--
.../tika/parser/microsoft/HSLFExtractor.java | 33 +++++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 38 +++++-
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 70 +++++++++++
.../ooxml/XSSFExcelExtractorDecorator.java | 82 +++++++++++-
.../tika/parser/microsoft/ExcelParserTest.java | 63 ++++++++++
.../parser/microsoft/PowerPointParserTest.java | 53 ++++++++
.../apache/tika/parser/odf/OpenDocumentParser.java | 103 ++++++++++++++-
.../org/apache/tika/parser/odf/ODFParserTest.java | 87 +++++++++++++
14 files changed, 972 insertions(+), 20 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 393e3d5004..51e4ebb0d9 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -230,4 +230,34 @@ public interface Office {
// Security-relevant: Framesets can load external URLs
Property HAS_FRAMESETS =
Property.internalBoolean("msoffice:doc:has-framesets");
+
+ /**
+ * 1-based sheet number for a resource (e.g. an embedded image)
+ * anchored to exactly one sheet of a workbook. For resources
+ * spanning multiple sheets see {@link #SHEET_NUMBERS}. This is the
+ * spreadsheet analogue of {@link TikaPagedText#PAGE_NUMBER} — the
+ * convention is identical, but the property name reflects the
+ * domain (Excel sheets are not "pages" in the printer sense).
+ */
+ Property SHEET_NUMBER = Property.internalInteger("msoffice:excel:sheet-number");
+
+ /**
+ * 1-based sheet numbers a resource is anchored to, as a sequence.
+ * Used for resources that appear on multiple sheets of a workbook.
+ * See {@link TikaPagedText#PAGE_NUMBERS} for the full convention
+ * (single-page, multi-page, unlinked, unknown); the same rules
+ * apply here, with {@link #UNLINKED_SHEET} as the "known unlinked"
+ * sentinel.
+ */
+ Property SHEET_NUMBERS = Property.internalIntegerSequence("msoffice:excel:sheet-numbers");
+
+ /**
+ * Sentinel value used as the sole element of {@link #SHEET_NUMBERS}
+ * when an embedded resource is present in a workbook but not
+ * referenced from any sheet. Mirrors
+ * {@link TikaPagedText#UNLINKED_PAGE} — negative because real sheet
+ * numbers are 1-based. See {@link PageAnchoring} for the helper
+ * that writes this convention.
+ */
+ int UNLINKED_SHEET = -1;
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PageAnchoring.java b/tika-core/src/main/java/org/apache/tika/metadata/PageAnchoring.java
new file mode 100644
index 0000000000..80f0539ba9
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PageAnchoring.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.util.Collection;
+import java.util.Objects;
+
+/**
+ * Helpers for tagging an embedded resource's metadata with the pages or
+ * sheets of its parent document on which it appears. Centralised so every
+ * paginated/sheeted parser (HSLF/XSLF for PowerPoint, OpenDocumentParser
+ * for ODP, HSSF/XSSF for Excel, PDF, ...) encodes the same convention
+ * identically.
+ *
+ * <p>Convention:
+ * <ul>
+ * <li>{@code indices == null}: caller does not know the resource's
+ * anchoring — both single- and sequence-valued properties are
+ * left unset.</li>
+ * <li>{@code indices.isEmpty()}: the resource is known to be present in
+ * the container but referenced from no page/sheet — the
+ * sequence property is set to a single-element array containing the
+ * sentinel value (see {@link TikaPagedText#UNLINKED_PAGE} and
+ * {@link Office#UNLINKED_SHEET}).</li>
+ * <li>{@code indices.size() == 1}: both the single-valued and the
+ * sequence-valued property are set, the sequence carrying the one
+ * index as its only element. Consumers that only inspect the
+ * single-valued property still see the right answer.</li>
+ * <li>{@code indices.size() > 1}: only the sequence-valued property is
+ * set; the single-valued property is cleared, because no single
+ * index would be meaningful.</li>
+ * </ul>
+ *
+ * <p>This class only mutates the metadata it is handed — it is the
+ * caller's job to compute the index set (e.g. by pre-scanning a slide
+ * file for picture references before the embedded-resource emission).
+ */
+public final class PageAnchoring {
+
+ private PageAnchoring() {
+ }
+
+ /**
+ * Apply {@link TikaPagedText#PAGE_NUMBER} / {@link TikaPagedText#PAGE_NUMBERS}
+ * to {@code target}, per the page-anchoring convention. Used by
+ * presentation parsers (PPT, PPTX, ODP), and by PDF parsers when
+ * tagging per-page embedded resources.
+ *
+ * @param target metadata to write to
+ * @param pages 1-based page numbers, an empty collection for
+ * "known unlinked", or {@code null} for unknown
+ */
+ public static void applyPageMetadata(Metadata target, Collection<Integer> pages) {
+ applyAnchorMetadata(target, pages,
+ TikaPagedText.PAGE_NUMBER, TikaPagedText.PAGE_NUMBERS,
+ TikaPagedText.UNLINKED_PAGE);
+ }
+
+ /**
+ * Apply {@link Office#SHEET_NUMBER} / {@link Office#SHEET_NUMBERS} to
+ * {@code target}, per the same convention as
+ * {@link #applyPageMetadata}. Used by spreadsheet parsers (HSSF,
+ * XSSF) when tagging an embedded resource with the sheets it appears
+ * on.
+ *
+ * @param target metadata to write to
+ * @param sheets 1-based sheet numbers, an empty collection for
+ * "known unlinked", or {@code null} for unknown
+ */
+ public static void applySheetMetadata(Metadata target, Collection<Integer> sheets) {
+ applyAnchorMetadata(target, sheets,
+ Office.SHEET_NUMBER, Office.SHEET_NUMBERS,
+ Office.UNLINKED_SHEET);
+ }
+
+ /**
+ * Shared core for {@link #applyPageMetadata} and {@link #applySheetMetadata}.
+ * Exposed so future paginated-resource conventions (e.g. arbitrary
+ * index spaces other than pages/sheets) can reuse the same logic
+ * without copying it.
+ *
+ * @param target metadata to write to (no-op if {@code null})
+ * @param indices the anchor indices, or {@code null} for "unknown"
+ * @param singleProperty {@code internalInteger} property set when there
+ * is exactly one anchor index
+ * @param sequenceProperty {@code internalIntegerSequence} property
+ * populated with all anchor indices
+ * @param unlinkedSentinel value used as the sole element of
+ * {@code sequenceProperty} when {@code indices}
+ * is empty (known unlinked)
+ */
+ public static void applyAnchorMetadata(Metadata target, Collection<Integer> indices,
+ Property singleProperty,
+ Property sequenceProperty,
+ int unlinkedSentinel) {
+ if (target == null || indices == null) {
+ return;
+ }
+ int[] arr;
+ if (indices.isEmpty()) {
+ arr = new int[]{unlinkedSentinel};
+ } else {
+ arr = indices.stream()
+ .filter(Objects::nonNull)
+ .mapToInt(Integer::intValue)
+ .sorted()
+ .toArray();
+ if (arr.length == 0) {
+ arr = new int[]{unlinkedSentinel};
+ }
+ }
+ // Clear any previous value, then append each element. Necessary
+ // because Metadata has no set-int[] overload; using add() builds
+ // the sequence one element at a time. Clearing first guards
+ // against accidental double application.
+ target.remove(sequenceProperty.getName());
+ target.remove(singleProperty.getName());
+ for (int v : arr) {
+ target.add(sequenceProperty, v);
+ }
+ if (arr.length == 1 && arr[0] != unlinkedSentinel) {
+ target.set(singleProperty, arr[0]);
+ }
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java
index e4bf1454e2..0b2fe85ea8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java
@@ -26,10 +26,45 @@ package org.apache.tika.metadata;
public interface TikaPagedText {
String TIKA_PAGED_TEXT_PREFIX = "tika_pg:";
/**
- * 1-based page number for a specific page
+ * 1-based page number for a specific page. Set when a resource is
+ * anchored to exactly one page; for resources spanning multiple pages
+ * see {@link #PAGE_NUMBERS}.
*/
Property PAGE_NUMBER = Property.internalInteger(TIKA_PAGED_TEXT_PREFIX + "page_number");
- Property PAGE_ROTATION = Property.internalRational(TIKA_PAGED_TEXT_PREFIX + "page_rotation");
+ /**
+ * 1-based page numbers an embedded resource is anchored to, as a
+ * sequence. Used for resources that appear on multiple pages
+ * (e.g. a logo or a shared image referenced from several slides
+ * of a presentation, or an image embedded in multiple sheets of a
+ * workbook).
+ *
+ * <p>Conventions:
+ * <ul>
+ * <li>Resource on a single page: {@code PAGE_NUMBERS = [N]} and
+ * {@code PAGE_NUMBER = N} are both set.</li>
+ * <li>Resource on multiple pages: {@code PAGE_NUMBERS = [N1, N2, ...]};
+ * {@code PAGE_NUMBER} is not set.</li>
+ * <li>Resource present in the container but not referenced from any
+ * page (an "unlinked" or orphan resource):
+ * {@code PAGE_NUMBERS = [-1]} (see {@link #UNLINKED_PAGE}).
+ * Distinguishes "we know this image is unanchored" from
+ * "this format has no page concept" (which leaves the property
+ * unset).</li>
+ * <li>Format has no page concept, or per-page anchoring is unknown:
+ * both {@code PAGE_NUMBER} and {@code PAGE_NUMBERS} unset.</li>
+ * </ul>
+ */
+ Property PAGE_NUMBERS =
+ Property.internalIntegerSequence(TIKA_PAGED_TEXT_PREFIX + "page_numbers");
+ /**
+ * Sentinel value used as the sole element of {@link #PAGE_NUMBERS}
+ * when an embedded resource is present in a paginated container but
+ * not referenced from any page. Chosen because real page numbers
+ * are 1-based, so any negative value is out-of-band.
+ */
+ int UNLINKED_PAGE = -1;
+
+ Property PAGE_ROTATION = Property.internalRational(TIKA_PAGED_TEXT_PREFIX + "page_rotation");
}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/TestPageAnchoring.java b/tika-core/src/test/java/org/apache/tika/metadata/TestPageAnchoring.java
new file mode 100644
index 0000000000..cc12971bb4
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/metadata/TestPageAnchoring.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.util.Arrays;
+import java.util.Collections;
+
+import org.junit.jupiter.api.Test;
+
+public class TestPageAnchoring {
+
+ @Test
+ void nullPagesIsNoOp() {
+ Metadata m = new Metadata();
+ PageAnchoring.applyPageMetadata(m, null);
+ assertNull(m.get(TikaPagedText.PAGE_NUMBER));
+ assertNull(m.get(TikaPagedText.PAGE_NUMBERS));
+ }
+
+ @Test
+ void nullTargetIsNoOp() {
+ PageAnchoring.applyPageMetadata(null, Arrays.asList(1, 2));
+ }
+
+ @Test
+ void emptyPagesMeansUnlinked() {
+ Metadata m = new Metadata();
+ PageAnchoring.applyPageMetadata(m, Collections.emptyList());
+ assertArrayEquals(new int[]{TikaPagedText.UNLINKED_PAGE},
+ m.getIntValues(TikaPagedText.PAGE_NUMBERS));
+ // PAGE_NUMBER is intentionally unset when unlinked.
+ assertNull(m.get(TikaPagedText.PAGE_NUMBER));
+ }
+
+ @Test
+ void singlePageSetsBothProperties() {
+ Metadata m = new Metadata();
+ PageAnchoring.applyPageMetadata(m, Collections.singletonList(7));
+ assertArrayEquals(new int[]{7}, m.getIntValues(TikaPagedText.PAGE_NUMBERS));
+ assertEquals("7", m.get(TikaPagedText.PAGE_NUMBER));
+ }
+
+ @Test
+ void multiPageSetsSequenceOnly() {
+ Metadata m = new Metadata();
+ PageAnchoring.applyPageMetadata(m, Arrays.asList(3, 1, 7));
+ assertArrayEquals(new int[]{1, 3, 7}, m.getIntValues(TikaPagedText.PAGE_NUMBERS));
+ // No single-page-number when the resource spans multiple pages.
+ assertNull(m.get(TikaPagedText.PAGE_NUMBER));
+ }
+
+ @Test
+ void duplicatePagesArePreservedSorted() {
+ // The helper doesn't dedupe — callers control set semantics by
+ // using a Set. This documents the current behaviour.
+ Metadata m = new Metadata();
+ PageAnchoring.applyPageMetadata(m, Arrays.asList(2, 2, 1));
+ assertArrayEquals(new int[]{1, 2, 2}, m.getIntValues(TikaPagedText.PAGE_NUMBERS));
+ }
+
+ @Test
+ void reapplicationClearsPriorValues() {
+ Metadata m = new Metadata();
+ PageAnchoring.applyPageMetadata(m, Arrays.asList(1, 2, 3));
+ PageAnchoring.applyPageMetadata(m, Collections.singletonList(5));
+ assertArrayEquals(new int[]{5}, m.getIntValues(TikaPagedText.PAGE_NUMBERS));
+ assertEquals("5", m.get(TikaPagedText.PAGE_NUMBER));
+ }
+
+ @Test
+ void reapplicationFromSingleToMultiClearsPageNumber() {
+ Metadata m = new Metadata();
+ PageAnchoring.applyPageMetadata(m, Collections.singletonList(4));
+ assertEquals("4", m.get(TikaPagedText.PAGE_NUMBER));
+ PageAnchoring.applyPageMetadata(m, Arrays.asList(4, 5));
+ assertArrayEquals(new int[]{4, 5}, m.getIntValues(TikaPagedText.PAGE_NUMBERS));
+ assertNull(m.get(TikaPagedText.PAGE_NUMBER));
+ }
+
+ @Test
+ void sheetMetadataMirrorsPageMetadata() {
+ // Sanity check that applySheetMetadata writes to Office's properties
+ // with the same convention as applyPageMetadata writes to TikaPagedText's.
+ Metadata m = new Metadata();
+ PageAnchoring.applySheetMetadata(m, Arrays.asList(2, 4, 6));
+ assertArrayEquals(new int[]{2, 4, 6}, m.getIntValues(Office.SHEET_NUMBERS));
+ assertNull(m.get(Office.SHEET_NUMBER), "multi-sheet should leave SHEET_NUMBER unset");
+
+ Metadata m2 = new Metadata();
+ PageAnchoring.applySheetMetadata(m2, Collections.singletonList(3));
+ assertArrayEquals(new int[]{3}, m2.getIntValues(Office.SHEET_NUMBERS));
+ assertEquals("3", m2.get(Office.SHEET_NUMBER));
+
+ Metadata m3 = new Metadata();
+ PageAnchoring.applySheetMetadata(m3, Collections.emptyList());
+ assertArrayEquals(new int[]{Office.UNLINKED_SHEET},
+ m3.getIntValues(Office.SHEET_NUMBERS));
+ assertNull(m3.get(Office.SHEET_NUMBER));
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 37267b6644..6bb79a16c0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
+import java.util.Locale;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.hpsf.ClassID;
@@ -213,7 +214,7 @@ abstract class AbstractPOIFSExtractor {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
} else {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
- name + '.' + type.getExtension());
+ appendExtensionIfMissing(name, type.getExtension()));
metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
}
}
@@ -225,7 +226,7 @@ abstract class AbstractPOIFSExtractor {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName);
} else {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
- rName + '.' + type.getExtension());
+ appendExtensionIfMissing(rName, type.getExtension()));
metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
}
}
@@ -320,7 +321,7 @@ abstract class AbstractPOIFSExtractor {
// Record what we can do about it
metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + extension);
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, appendExtensionIfMissing(rName, extension));
metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED,
true);
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
parseEmbedded(parentDir, tis, xhtml, metadata, outputHtml);
@@ -416,6 +417,21 @@ abstract class AbstractPOIFSExtractor {
}
+ /**
+ * Appends {@code ext} to {@code name} only when {@code name} does not already end with it
+ * (case-insensitive). {@code ext} may or may not have a leading dot.
+ */
+ private static String appendExtensionIfMissing(String name, String ext) {
+ if (StringUtils.isBlank(ext)) {
+ return name;
+ }
+ String dotExt = ext.startsWith(".") ? ext : "." + ext;
+ if (name.toLowerCase(Locale.ROOT).endsWith(dotExt.toLowerCase(Locale.ROOT))) {
+ return name;
+ }
+ return name + dotExt;
+ }
+
public static String tryToGetMsgTitle(DirectoryEntry node, String
defaultVal) {
for (String entryName : new String[] {"__substg1.0_0037001F",
"__substg1.0_0E1D001F", "__substg1.0_0070001F"} ) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index a9d5f3f3ff..869f88b1c0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -17,10 +17,13 @@
package org.apache.tika.parser.microsoft;
import java.awt.Point;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@@ -29,9 +32,13 @@ import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
+import org.apache.poi.ddf.DefaultEscherRecordFactory;
import org.apache.poi.ddf.EscherBSERecord;
import org.apache.poi.ddf.EscherBlipRecord;
+import org.apache.poi.ddf.EscherOptRecord;
+import org.apache.poi.ddf.EscherProperty;
import org.apache.poi.ddf.EscherRecord;
+import org.apache.poi.ddf.EscherSimpleProperty;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
@@ -42,9 +49,11 @@ import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
import org.apache.poi.hssf.record.ColumnInfoRecord;
+import org.apache.poi.hssf.record.ContinueRecord;
import org.apache.poi.hssf.record.CountryRecord;
import org.apache.poi.hssf.record.DateWindow1904Record;
import org.apache.poi.hssf.record.DrawingGroupRecord;
+import org.apache.poi.hssf.record.DrawingRecord;
import org.apache.poi.hssf.record.EOFRecord;
import org.apache.poi.hssf.record.ExtendedFormatRecord;
import org.apache.poi.hssf.record.FooterRecord;
@@ -80,6 +89,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.PageAnchoring;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
@@ -306,6 +316,9 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
* contiguous. Collect them for later processing.
*/
private final List<DrawingGroupRecord> drawingGroups = new
ArrayList<>();
+ // Per-sheet drawing bytes (DrawingRecord + Continues), keyed by 0-based sheet index.
+ private final Map<Integer, ByteArrayOutputStream> sheetDrawingData = new HashMap<>();
+ private boolean inDrawing = false;
private final List<String> hiddenSheets = new ArrayList<>();
private final List<String> veryHiddenSheets = new ArrayList<>();
@@ -374,6 +387,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
hssfRequest.addListener(formatListener, FormatRecord.sid);
hssfRequest.addListener(formatListener,
ExtendedFormatRecord.sid);
hssfRequest.addListener(formatListener,
DrawingGroupRecord.sid);
+ hssfRequest.addListener(formatListener, DrawingRecord.sid);
+ hssfRequest.addListener(formatListener, ContinueRecord.sid);
hssfRequest.addListener(formatListener, ProtectRecord.sid);
hssfRequest.addListener(formatListener, ColumnInfoRecord.sid);
hssfRequest.addListener(formatListener, RowRecord.sid);
@@ -397,11 +412,66 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
// Output any extra text that came after all the sheets
processExtraText();
- // Look for embeded images, now that the drawing records
- // have been fully matched with their continue data
+ // Build the blip-index → set-of-1-based-sheet-numbers map from
+ // per-sheet drawing buffers, then walk the workbook-level BSE
+ // pool in order, tagging each emitted picture with its sheets.
+ Map<Integer, Set<Integer>> picToSheets = buildPicToSheetsMap();
for (DrawingGroupRecord dgr : drawingGroups) {
dgr.decode();
- findPictures(dgr.getEscherRecords());
+ findPictures(dgr.getEscherRecords(), picToSheets, new int[]{0});
+ }
+ }
+
+ private void appendDrawingBytes(byte[] data) {
+ sheetDrawingData
+ .computeIfAbsent(currentSheetIndex & 0xFFFF,
+ k -> new ByteArrayOutputStream())
+ .write(data, 0, data.length);
+ }
+
+ private Map<Integer, Set<Integer>> buildPicToSheetsMap() {
+ Map<Integer, Set<Integer>> picToSheets = new HashMap<>();
+ DefaultEscherRecordFactory factory = new DefaultEscherRecordFactory();
+ for (Map.Entry<Integer, ByteArrayOutputStream> e : sheetDrawingData.entrySet()) {
+ int sheetNum = e.getKey() + 1; // 1-based
+ byte[] data = e.getValue().toByteArray();
+ int pos = 0;
+ while (pos < data.length - 8) {
+ EscherRecord rec;
+ try {
+ rec = factory.createRecord(data, pos);
+ int consumed = rec.fillFields(data, pos, factory);
+ if (consumed <= 0) {
+ break;
+ }
+ pos += consumed;
+ } catch (Exception ex) {
+ break; // best-effort: stop on malformed segment
+ }
+ collectPibs(rec, sheetNum, picToSheets);
+ }
+ }
+ return picToSheets;
+ }
+
+ private void collectPibs(EscherRecord rec, int sheetNum,
+ Map<Integer, Set<Integer>> picToSheets) {
+ if (rec instanceof EscherOptRecord) {
+ for (EscherProperty prop : ((EscherOptRecord) rec).getEscherProperties()) {
+ // Property ID is the low 14 bits; 0x0104 = blip-id (pib).
+ if ((prop.getPropertyNumber() & 0x3FFF) == 0x104
+ && prop instanceof EscherSimpleProperty) {
+ int pib = ((EscherSimpleProperty) prop).getPropertyValue();
+ if (pib > 0) {
+ picToSheets
+ .computeIfAbsent(pib, k -> new LinkedHashSet<>())
+ .add(sheetNum);
+ }
+ }
+ }
+ }
+ for (EscherRecord child : rec.getChildRecords()) {
+ collectPibs(child, sheetNum, picToSheets);
}
}
@@ -437,6 +507,11 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
private void internalProcessRecord(Record record)
throws SAXException, TikaException, IOException {
+ // Drawing-chain tracking: snapshot then reset. ContinueRecord
+ // is only a drawing continuation when the prior record was a
+ // DrawingRecord or another drawing ContinueRecord.
+ boolean wasInDrawingChain = inDrawing;
+ inDrawing = false;
switch (record.getSid()) {
case BOFRecord.sid: // start of workbook, worksheet etc.
records
BOFRecord bof = (BOFRecord) record;
@@ -570,6 +645,20 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
drawingGroups.add((DrawingGroupRecord) record);
break;
+ case DrawingRecord.sid:
+ if (currentSheetIndex >= 0) {
+ appendDrawingBytes(((DrawingRecord) record).getRecordData());
+ inDrawing = true;
+ }
+ break;
+
+ case ContinueRecord.sid:
+ if (wasInDrawingChain && currentSheetIndex >= 0) {
+ appendDrawingBytes(((ContinueRecord) record).getData());
+ inDrawing = true;
+ }
+ break;
+
case HeaderRecord.sid:
if
(extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
HeaderRecord headerRecord = (HeaderRecord) record;
@@ -733,24 +822,34 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
handler.endElement("div");
}
- private void findPictures(List<EscherRecord> records)
+ private void findPictures(List<EscherRecord> records,
+ Map<Integer, Set<Integer>> picToSheets,
+ int[] blipCounter)
throws IOException, SAXException, TikaException {
for (EscherRecord escherRecord : records) {
if (escherRecord instanceof EscherBSERecord) {
+ // 1-based blip index — must increment for every BSE
+ // record, even ones with null blip data, to stay in
+ // sync with sheet drawings' pib references.
+ blipCounter[0]++;
EscherBlipRecord blip = ((EscherBSERecord)
escherRecord).getBlipRecord();
if (blip != null) {
HSSFPictureData picture = new HSSFPictureData(blip);
String mimeType = picture.getMimeType();
TikaInputStream tis =
TikaInputStream.get(picture.getData());
- // Handle the embeded resource
- extractor.handleEmbeddedResource(tis, null, null, mimeType, handler,
- true);
+ Metadata embeddedMetadata = new Metadata();
+ Set<Integer> sheets = picToSheets.get(blipCounter[0]);
+ if (sheets != null) {
+ PageAnchoring.applySheetMetadata(embeddedMetadata, sheets);
+ }
+ extractor.handleEmbeddedResource(tis, embeddedMetadata,
+ null, null, null, mimeType, handler, true);
}
}
// Recursive call.
- findPictures(escherRecord.getChildRecords());
+ findPictures(escherRecord.getChildRecords(), picToSheets, blipCounter);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index e2f10977bc..e516979da0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -19,8 +19,10 @@ package org.apache.tika.parser.microsoft;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import org.apache.poi.common.usermodel.Hyperlink;
@@ -37,6 +39,7 @@ import org.apache.poi.hslf.usermodel.HSLFNotes;
import org.apache.poi.hslf.usermodel.HSLFObjectData;
import org.apache.poi.hslf.usermodel.HSLFObjectShape;
import org.apache.poi.hslf.usermodel.HSLFPictureData;
+import org.apache.poi.hslf.usermodel.HSLFPictureShape;
import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlide;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
@@ -60,6 +63,7 @@ import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.PageAnchoring;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -553,6 +557,12 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow,
XHTMLContentHandler xhtml)
throws TikaException, SAXException, IOException {
+ // Build picture-index → slide numbers map so each image carries its page number
+ Map<Integer, Set<Integer>> picToSlides = new HashMap<>();
+ for (HSLFSlide slide : slideshow.getSlides()) {
+ collectPictureSlides(slide, slide.getSlideNumber(), picToSlides);
+ }
+
for (HSLFPictureData pic : slideshow.getPictureData()) {
String mediaType;
@@ -585,12 +595,35 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
pic.getIndex(), mediaType);
Metadata picMetadata = Metadata.newInstance(context);
picMetadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
+ PageAnchoring.applyPageMetadata(picMetadata, picToSlides.get(pic.getIndex()));
handleEmbeddedResource(picIs, picMetadata, picName, null,
null, mediaType, xhtml, false);
}
}
}
+ /**
+ * Walks all shapes in {@code container} and records, for each
+ * {@link HSLFPictureShape}, the 1-based slide number in {@code picToSlides}.
+ */
+ private void collectPictureSlides(ShapeContainer container, int slideNum,
+ Map<Integer, Set<Integer>> picToSlides) {
+ List<HSLFShape> shapes = getShapes(container);
+ if (shapes == null) {
+ return;
+ }
+ for (HSLFShape shape : shapes) {
+ if (shape instanceof HSLFPictureShape) {
+ HSLFPictureData pd = ((HSLFPictureShape) shape).getPictureData();
+ if (pd != null) {
+ picToSlides.computeIfAbsent(pd.getIndex(), k -> new HashSet<>()).add(slideNum);
+ }
+ } else if (shape instanceof HSLFGroupShape) {
+ collectPictureSlides((HSLFGroupShape) shape, slideNum, picToSlides);
+ }
+ }
+ }
+
private void handleSlideEmbeddedResources(ShapeContainer shapeContainer,
XHTMLContentHandler xhtml)
throws TikaException, SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index c686414ed1..900853f2e0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -55,6 +55,7 @@ import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.PageAnchoring;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParser;
@@ -153,6 +154,36 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
return Collections.emptyMap();
}
+ /**
+ * Hook for subclasses to apply anchor metadata (page or sheet numbers)
+ * to an embedded part's metadata, for paginated/sheeted containers.
+ * Called from {@link #handleEmbeddedFile} after the basic metadata
+ * (relationship id, content type, etc.) has been written, before the
+ * embedded part is handed off to the recursing parser.
+ *
+ * <p>Default is no-op — non-paginated containers (Word, Visio,
+ * ...) leave embedded resources without anchor metadata. Subclasses
+ * for paginated containers should override and invoke either
+ * {@link PageAnchoring#applyPageMetadata} (presentations) or
+ * {@link PageAnchoring#applySheetMetadata} (spreadsheets) with the
+ * indices for {@code part}. Letting each subclass own the write
+ * keeps the abstract class format-agnostic — pages vs. sheets
+ * is a per-format decision.
+ *
+ * <p>The {@link PackagePart} (not just its target URI) is supplied
+ * because relative target URIs differ depending on the relationship
+ * source (e.g. {@code ../media/image1.png} from a slide, or the same
+ * relative URI from an Excel drawing) and would not be stable
+ * lookup keys across sources. Subclasses can use
+ * {@code part.getPartName().getName()} for a canonical absolute path.
+ *
+ * @param part the embedded part being emitted
+ * @param metadata metadata to enrich
+ */
+ protected void applyEmbeddedAnchorMetadata(PackagePart part, Metadata
metadata) {
+ // default: no-op
+ }
+
protected String getJustFileName(String desc) {
int idx = desc.lastIndexOf('/');
if (idx != -1) {
@@ -450,7 +481,10 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
}
/**
- * Handles an embedded file in the document
+ * Handles an embedded file in the document. Invokes
+ * {@link #applyEmbeddedAnchorMetadata} so paginated/sheeted
+ * subclasses can tag the embedded resource's metadata with the
+ * pages or sheets it is anchored to.
*/
protected void handleEmbeddedFile(PackagePart part, XHTMLContentHandler
xhtml,
String rel,
@@ -469,6 +503,8 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
// Get the content type
metadata.set(Metadata.CONTENT_TYPE, part.getContentType());
+ applyEmbeddedAnchorMetadata(part, metadata);
+
// Call the recursing handler
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
try (TikaInputStream tis =
TikaInputStream.get(part.getInputStream())) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index b34a1ffc77..afa01a2339 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -20,8 +20,10 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.zip.ZipException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -29,6 +31,7 @@ import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.xslf.usermodel.XSLFRelation;
@@ -37,6 +40,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.PageAnchoring;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import
org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
@@ -71,6 +75,15 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
private final Metadata metadata;
private final CommentAuthors commentAuthors = new CommentAuthors();
private PackagePart mainDocument = null;
+ /**
+ * Pre-pass index of embedded-image absolute part name → set of
+ * 1-based slide numbers referencing that image. Populated during
+ * {@link #getMainDocumentParts()} so that {@link
#applyEmbeddedAnchorMetadata}
+ * can answer per-target lookups even after the deduplication done by
+ * {@code AbstractOOXMLExtractor.handleEmbeddedParts} would otherwise hide
+ * the second-and-later references.
+ */
+ private final Map<String, Set<Integer>> picturePages = new HashMap<>();
public SXSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext
context,
XSLFEventBasedPowerPointExtractor
extractor) {
@@ -267,6 +280,7 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
}
+ recordPicturePageRefs(slidePart, i + 1);
addSlideParts(slidePart, parts);
}
}
@@ -300,6 +314,62 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
return parts;
}
+ /**
+ * Records every image relationship of {@code slidePart} against the
+ * given 1-based {@code slideNumber}. Called once per slide during the
+ * pre-pass in {@link #getMainDocumentParts()}. When the same image is
+ * referenced from multiple slides, both slide numbers end up in the
+ * set so {@link org.apache.tika.metadata.TikaPagedText#PAGE_NUMBERS}
+ * ends up multi-valued. Keyed by absolute part name (e.g.
+ * {@code /ppt/media/image1.png}) so the lookup matches what
+ * {@link AbstractOOXMLExtractor#applyEmbeddedAnchorMetadata} sees
+ * — relative target URIs from different sources can clash and
+ * are not stable lookup keys.
+ */
+ private void recordPicturePageRefs(PackagePart slidePart, int slideNumber)
{
+ if (slidePart == null) {
+ return;
+ }
+ PackageRelationshipCollection prc;
+ try {
+ prc =
slidePart.getRelationshipsByType(PackageRelationshipTypes.IMAGE_PART);
+ } catch (InvalidFormatException e) {
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ return;
+ }
+ if (prc == null) {
+ return;
+ }
+ for (PackageRelationship rel : prc) {
+ if (rel.getTargetMode() != TargetMode.INTERNAL) {
+ continue;
+ }
+ PackagePart imagePart;
+ try {
+ imagePart = slidePart.getRelatedPart(rel);
+ } catch (InvalidFormatException | IllegalArgumentException e) {
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ continue;
+ }
+ if (imagePart == null) {
+ continue;
+ }
+ picturePages
+ .computeIfAbsent(imagePart.getPartName().getName(),
+ k -> new LinkedHashSet<>())
+ .add(slideNumber);
+ }
+ }
+
+ @Override
+ protected void applyEmbeddedAnchorMetadata(PackagePart part, Metadata
metadata) {
+ // Pre-pass keys by absolute part name (canonical zip path).
+ PageAnchoring.applyPageMetadata(metadata,
+ picturePages.get(part.getPartName().getName()));
+ }
+
private void addSlideParts(PackagePart slidePart, List<PackagePart> parts)
{
if (slidePart == null) {
return;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index d968cdb856..feb18e0da3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -21,9 +21,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -54,6 +56,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.PageAnchoring;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
@@ -98,6 +101,18 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
protected final DataFormatter formatter;
protected final List<PackagePart> sheetParts = new ArrayList<>();
+ /**
+ * Pre-pass index of embedded-image absolute part name (e.g.
+ * {@code /xl/media/image1.png}) → set of 1-based sheet numbers
+ * referencing that image. In XLSX, sheets reference images
+ * indirectly via drawing parts (sheet → drawing → image), so the
+ * pre-pass walks both hops. Populated by
+ * {@link #getMainDocumentParts()} so that
+ * {@link #applyEmbeddedAnchorMetadata} can answer per-target
+ * lookups even after {@code AbstractOOXMLExtractor.handleEmbeddedParts}
+ * has deduped on second-and-later references.
+ */
+ private final Map<String, Set<Integer>> picturePages = new HashMap<>();
protected final Map<String, String> drawingHyperlinks = new HashMap<>();
protected Metadata metadata;
protected ParseContext parseContext;
@@ -920,7 +935,12 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
List<PackagePart> parts = new ArrayList<>();
+ // The sheet order in sheetParts mirrors the workbook's sheet
+ // ordering (populated in buildXHTML), so the index here is the
+ // 1-based sheet number.
+ int sheetNumber = 0;
for (PackagePart part : sheetParts) {
+ sheetNumber++;
// Add the sheet
parts.add(part);
@@ -931,7 +951,9 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
if (rel.getTargetMode() == TargetMode.INTERNAL) {
PackagePartName relName =
PackagingURIHelper.createPartName(rel.getTargetURI());
- parts.add(rel.getPackage().getPart(relName));
+ PackagePart drawingPart =
rel.getPackage().getPart(relName);
+ parts.add(drawingPart);
+ recordImagesOnSheet(drawingPart, sheetNumber);
}
}
for (PackageRelationship rel : part
@@ -939,7 +961,9 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
if (rel.getTargetMode() == TargetMode.INTERNAL) {
PackagePartName relName =
PackagingURIHelper.createPartName(rel.getTargetURI());
- parts.add(rel.getPackage().getPart(relName));
+ PackagePart vmlPart =
rel.getPackage().getPart(relName);
+ parts.add(vmlPart);
+ recordImagesOnSheet(vmlPart, sheetNumber);
}
}
} catch (InvalidFormatException e) {
@@ -1252,4 +1276,58 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
}
+
+ /**
+ * Records every image relationship of {@code drawingPart} against the
+ * given 1-based {@code sheetNumber}. Called once per drawing during
+ * the pre-pass in {@link #getMainDocumentParts()}. When the same
+ * image is referenced from drawings on multiple sheets, all sheet
+ * numbers end up in the set so {@link Office#SHEET_NUMBERS} ends up
+ * multi-valued. Keyed by absolute part name so the lookup matches
+ * what {@link AbstractOOXMLExtractor#applyEmbeddedAnchorMetadata}
+ * sees — relative target URIs across drawing parts collide
+ * and are not stable lookup keys.
+ */
+ private void recordImagesOnSheet(PackagePart drawingPart, int sheetNumber)
{
+ if (drawingPart == null) {
+ return;
+ }
+ PackageRelationshipCollection prc;
+ try {
+ prc =
drawingPart.getRelationshipsByType(PackageRelationshipTypes.IMAGE_PART);
+ } catch (InvalidFormatException e) {
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ return;
+ }
+ if (prc == null) {
+ return;
+ }
+ for (PackageRelationship rel : prc) {
+ if (rel.getTargetMode() != TargetMode.INTERNAL) {
+ continue;
+ }
+ PackagePart imagePart;
+ try {
+ imagePart = drawingPart.getRelatedPart(rel);
+ } catch (InvalidFormatException | IllegalArgumentException e) {
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ continue;
+ }
+ if (imagePart == null) {
+ continue;
+ }
+ picturePages
+ .computeIfAbsent(imagePart.getPartName().getName(),
+ k -> new LinkedHashSet<>())
+ .add(sheetNumber);
+ }
+ }
+
+ @Override
+ protected void applyEmbeddedAnchorMetadata(PackagePart part, Metadata
metadata) {
+ PageAnchoring.applySheetMetadata(metadata,
+ picturePages.get(part.getPartName().getName()));
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 18f408d452..55975455b2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft;
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -631,4 +632,66 @@ public class ExcelParserTest extends TikaTest {
assertContains("class=\"external-ref-ddeLink\"", xml);
assertContains("cmd|", xml);
}
+
+ @Test
+ public void testXlsPictureSheetNumbers() throws Exception {
+ // testEXCEL_1img.xls has a single embedded image on sheet 1.
+ List<Metadata> metadataList =
getRecursiveMetadata("testEXCEL_1img.xls");
+ int imagesChecked = 0;
+ for (Metadata m : metadataList) {
+ int[] sheets = m.getIntValues(Office.SHEET_NUMBERS);
+ if (sheets.length == 0) {
+ continue;
+ }
+ assertArrayEquals(new int[]{1}, sheets,
+ "embedded resource should be tagged with sheet 1");
+ assertEquals("1", m.get(Office.SHEET_NUMBER));
+ imagesChecked++;
+ }
+ assertTrue(imagesChecked > 0, "expected at least one image with sheet
metadata");
+ }
+
+ @Test
+ public void testXlsxPictureSheetNumbers() throws Exception {
+ // testEXCEL_1img.xlsx has a single image referenced from sheet1
+ // (sheet2 has no drawing). The embedded image's metadata should
+ // therefore carry SHEET_NUMBERS=[1] and SHEET_NUMBER=1.
+ List<Metadata> metadataList =
getRecursiveMetadata("testEXCEL_1img.xlsx");
+ int imagesChecked = 0;
+ for (Metadata m : metadataList) {
+ String path = m.get(TikaCoreProperties.INTERNAL_PATH);
+ if (path == null || !path.startsWith("/xl/media/")) {
+ continue;
+ }
+ assertArrayEquals(new int[]{1},
m.getIntValues(Office.SHEET_NUMBERS),
+ "image " + path + " should be tagged with sheet 1");
+ assertEquals("1", m.get(Office.SHEET_NUMBER),
+ "SHEET_NUMBER should equal 1 for single-sheet image: " +
path);
+ imagesChecked++;
+ }
+ assertEquals(1, imagesChecked, "expected exactly one embedded image");
+ }
+
+ @Test
+ public void testXlsxAllPicturesOnSheet1() throws Exception {
+ // testEXCEL_embeded.xlsx has images referenced from sheet1 (one
+ // via drawing1.xml, several via vmlDrawing1.vml). Every image
+ // under /xl/media/ should carry SHEET_NUMBERS=[1] and
+ // SHEET_NUMBER=1. Don't hardcode the count — embedded OLE
+ // objects can contribute additional preview thumbnails through
+ // their own paths.
+ List<Metadata> metadataList =
getRecursiveMetadata("testEXCEL_embeded.xlsx");
+ int imagesChecked = 0;
+ for (Metadata m : metadataList) {
+ String path = m.get(TikaCoreProperties.INTERNAL_PATH);
+ if (path == null || !path.startsWith("/xl/media/")) {
+ continue;
+ }
+ assertArrayEquals(new int[]{1},
m.getIntValues(Office.SHEET_NUMBERS),
+ "image " + path + " should be tagged with sheet 1");
+ assertEquals("1", m.get(Office.SHEET_NUMBER), path);
+ imagesChecked++;
+ }
+ assertTrue(imagesChecked >= 4, "expected at least four embedded
images, got " + imagesChecked);
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 15a6dc2a08..52ff5de53e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -16,7 +16,9 @@
*/
package org.apache.tika.parser.microsoft;
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -34,6 +36,7 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
@@ -383,4 +386,54 @@ public class PowerPointParserTest extends TikaTest {
xlsx.get(Metadata.CONTENT_TYPE));
}
+
+ @Test
+ public void testPptxPicturePageNumbersMultiSlide() throws Exception {
+ // testPPT_embedded_two_slides.pptx references image4.png, image5.gif
+ // and image6.png from BOTH slide1 and slide2 (other media parts in
+ // /ppt/media/ are OLE-object preview thumbnails not referenced from
+ // any slide). Each slide-referenced image should carry
+ // PAGE_NUMBERS=[1,2] and PAGE_NUMBER unset.
+ List<String> slideReferencedImages = Arrays.asList(
+ "/ppt/media/image4.png", "/ppt/media/image5.gif",
"/ppt/media/image6.png");
+ List<Metadata> metadataList =
+ getRecursiveMetadata("testPPT_embedded_two_slides.pptx");
+ int imagesChecked = 0;
+ for (Metadata m : metadataList) {
+ String path = m.get(TikaCoreProperties.INTERNAL_PATH);
+ if (path == null || !slideReferencedImages.contains(path)) {
+ continue;
+ }
+ assertArrayEquals(new int[]{1, 2},
+ m.getIntValues(TikaPagedText.PAGE_NUMBERS),
+ "image " + path + " should be tagged with both slide
numbers");
+ assertNull(m.get(TikaPagedText.PAGE_NUMBER),
+ "PAGE_NUMBER should be unset for multi-slide image: " +
path);
+ imagesChecked++;
+ }
+ assertEquals(3, imagesChecked,
+ "expected three slide-referenced images in
testPPT_embedded_two_slides.pptx");
+ }
+
+ @Test
+ public void testPptxPicturePageNumbersSingleSlide() throws Exception {
+ // testPPT_2imgs.pptx has all images on slide 1, so each picture
+ // should carry PAGE_NUMBERS=[1] AND PAGE_NUMBER=1 (single-page
+ // backwards-compatible convention).
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_2imgs.pptx");
+ int imagesChecked = 0;
+ for (Metadata m : metadataList) {
+ String path = m.get(TikaCoreProperties.INTERNAL_PATH);
+ if (path == null || !path.startsWith("/ppt/media/image")) {
+ continue;
+ }
+ assertArrayEquals(new int[]{1},
+ m.getIntValues(TikaPagedText.PAGE_NUMBERS),
+ "image " + path + " should report slide 1");
+ assertEquals("1", m.get(TikaPagedText.PAGE_NUMBER),
+ "PAGE_NUMBER should equal 1 for single-slide image: " +
path);
+ imagesChecked++;
+ }
+ assertTrue(imagesChecked > 0, "expected at least one embedded image");
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 3d11159475..4b0fced008 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -21,14 +21,19 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
import java.util.Enumeration;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
import java.util.Set;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.io.IOUtils;
+import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -41,7 +46,9 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PageAnchoring;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -206,17 +213,27 @@ public class OpenDocumentParser implements Parser {
// rest of the file afterwards (TIKA-1353)
// Only possible to guarantee that when opened from a file not a stream
+ // Pre-scan content.xml to build a picture→draw:page map. We need
+ // this before the main loop because the Pictures/ entries are
+ // emitted lazily in zip-iteration order, and we want each emitted
+ // picture's metadata to carry the set of slides on which it
+ // appears. The map is best-effort: if the scan fails, embedded
+ // pictures simply go out without TikaPagedText metadata.
+ Map<String, Set<Integer>> picturePages = scanPicturePages(zipFile,
context);
+
ZipArchiveEntry entry = zipFile.getEntry(MANIFEST_NAME);
if (entry != null) {
try (TikaInputStream tisZip =
TikaInputStream.get(zipFile.getInputStream(entry))) {
- handleZipArchiveEntry(entry, tisZip, metadata, context,
handler, embeddedDocumentUtil);
+ handleZipArchiveEntry(entry, tisZip, metadata, context,
handler,
+ embeddedDocumentUtil, picturePages);
}
}
entry = zipFile.getEntry(META_NAME);
if (entry != null) {
try (TikaInputStream tisZip =
TikaInputStream.get(zipFile.getInputStream(entry))) {
- handleZipArchiveEntry(entry, tisZip, metadata, context,
handler, embeddedDocumentUtil);
+ handleZipArchiveEntry(entry, tisZip, metadata, context,
handler,
+ embeddedDocumentUtil, picturePages);
}
}
@@ -225,15 +242,41 @@ public class OpenDocumentParser implements Parser {
entry = entries.nextElement();
if (!META_NAME.equals(entry.getName())) {
try (TikaInputStream tis =
TikaInputStream.get(zipFile.getInputStream(entry))) {
- handleZipArchiveEntry(entry, tis, metadata, context,
handler, embeddedDocumentUtil);
+ handleZipArchiveEntry(entry, tis, metadata, context,
handler,
+ embeddedDocumentUtil, picturePages);
}
}
}
}
+ /**
+ * Pre-scans {@code content.xml} for {@code draw:page} and
+ * {@code draw:image[xlink:href]} elements, returning a map from
+ * picture href (typically {@code "Pictures/<name>"}, matching the
+ * picture's zip entry name) to the set of 1-based draw:page indices
+ * referencing it. Returns an empty map if {@code content.xml} is
+ * missing or cannot be parsed; per-page metadata is best-effort
+ * enrichment, not load-bearing for the parse itself.
+ */
+ private static Map<String, Set<Integer>> scanPicturePages(ZipFile zipFile,
+ ParseContext
context) {
+ ZipArchiveEntry contentEntry = zipFile.getEntry("content.xml");
+ if (contentEntry == null) {
+ return Collections.emptyMap();
+ }
+ PicturePageHandler scan = new PicturePageHandler();
+ try (InputStream is = zipFile.getInputStream(contentEntry)) {
+ XMLReaderUtils.parseSAX(is, scan, context);
+ } catch (IOException | SAXException | TikaException e) {
+ return Collections.emptyMap();
+ }
+ return scan.getPicturePages();
+ }
+
private void handleZipArchiveEntry(ZipArchiveEntry entry, TikaInputStream
tisZip, Metadata metadata,
ParseContext context, ContentHandler handler,
- EmbeddedDocumentUtil embeddedDocumentUtil)
+ EmbeddedDocumentUtil embeddedDocumentUtil,
+ Map<String, Set<Integer>> picturePages)
throws IOException, SAXException, TikaException {
if (entry.isDirectory()) {
@@ -284,6 +327,15 @@ public class OpenDocumentParser implements Parser {
embeddedMetadata.set(Metadata.CONTENT_TYPE,
embeddedMimeType.toString());
}
tisZip.reset();
+ // Tag the picture with the draw:page indices it
+ // appears on (set populated by scanPicturePages).
+ // A null lookup means "not referenced by any
+ // draw:page" — leaves PAGE_NUMBERS unset, matching
+ // the "unknown" branch of the convention.
+ Collection<Integer> pages =
picturePages.get(embeddedName);
+ if (pages != null) {
+ PageAnchoring.applyPageMetadata(embeddedMetadata,
pages);
+ }
}
if
(embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
@@ -350,5 +402,48 @@ public class OpenDocumentParser implements Parser {
return false;
}
+ /**
+ * SAX handler that pre-scans an ODP {@code content.xml} to record which
+ * {@code draw:page} elements reference each {@code draw:image}. The
+ * result is keyed by the picture's {@code xlink:href} attribute value,
+ * which for embedded pictures is the zip-entry path (e.g.
+ * {@code Pictures/abc.png}). Used to populate
+ * {@link TikaPagedText#PAGE_NUMBERS} on the embedded picture's metadata.
+ *
+ * <p>Only draw:page is treated as a page boundary; draw:image elements
+ * before the first draw:page (e.g. on a master-page template) are ignored
+ * and left without page tagging — the "unknown" branch. The counter is not
+ * reset when a draw:page ends, so later images count toward the last page.
+ */
+ static final class PicturePageHandler extends DefaultHandler {
+ private static final String DRAW_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
+ private static final String XLINK_NS =
+ "http://www.w3.org/1999/xlink";
+
+ private final Map<String, Set<Integer>> picturePages = new HashMap<>();
+ private int currentPage = 0;
+
+ @Override
+ public void startElement(String uri, String localName, String qName,
+ Attributes attrs) {
+ if (!DRAW_NS.equals(uri)) {
+ return;
+ }
+ if ("page".equals(localName)) {
+ currentPage++;
+ } else if ("image".equals(localName) && currentPage > 0) {
+ String href = attrs.getValue(XLINK_NS, "href");
+ if (href != null) {
+ picturePages
+ .computeIfAbsent(href, k -> new LinkedHashSet<>())
+ .add(currentPage);
+ }
+ }
+ }
+ Map<String, Set<Integer>> getPicturePages() {
+ return picturePages;
+ }
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 270f1f0630..cded9ea148 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -16,7 +16,9 @@
*/
package org.apache.tika.parser.odf;
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -45,6 +47,7 @@ import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -502,6 +505,90 @@ public class ODFParserTest extends TikaTest {
assertTrue(filesTested > 10);
}
+ /**
+  * Parses {@code testODP_NPE.odp} recursively and, for each embedded
+  * picture whose internal path is listed in the expected map, asserts that
+  * {@code TikaPagedText.PAGE_NUMBERS} holds exactly the expected page and
+  * that {@code TikaPagedText.PAGE_NUMBER} is that page as a string. Finally
+  * asserts that every expected picture was actually encountered.
+  */
+ @Test
+ public void testOdpPicturePageNumbers() throws Exception {
+     // testODP_NPE.odp has seven images, each referenced from exactly
+     // one draw:page (computed from the content.xml at fixture build
+     // time). Each embedded picture should therefore carry
+     // PAGE_NUMBERS=[N] and PAGE_NUMBER=N.
+     // Page numbers counted strictly by <draw:page> opening tags
+     // (excluding <draw:page-thumbnail>, <draw:page-number>, etc.).
+     java.util.Map<String, Integer> expected = new java.util.HashMap<>();
+     expected.put("Pictures/10000000000001F4000002705F89BA821D9F0627.jpg", 32);
+     expected.put("Pictures/1000000000000263000005A43C6E7911DFE698F3.jpg", 14);
+     expected.put("Pictures/100000000000028A000003BD577F6EA35EDCA351.jpg", 37);
+     expected.put("Pictures/100000000000029C000003DDDCB6B16F49D730C4.jpg", 35);
+     expected.put("Pictures/10000000000002B9000002619EFEE29AE4039384.jpg", 30);
+     expected.put("Pictures/10000000000003A600000366D2A4CA24B31718E5.jpg", 21);
+     expected.put("Pictures/10000000000005790000046EA0860EFDD89319F8.jpg", 16);
+
+     List<Metadata> metadataList = getRecursiveMetadata("testODP_NPE.odp");
+     int imagesChecked = 0;
+     for (Metadata m : metadataList) {
+         String path = m.get(TikaCoreProperties.INTERNAL_PATH);
+         // Skip the container document and any embedded item not under test.
+         if (path == null || !expected.containsKey(path)) {
+             continue;
+         }
+         int page = expected.get(path);
+         assertArrayEquals(new int[]{page},
+                 m.getIntValues(TikaPagedText.PAGE_NUMBERS),
+                 "picture " + path + " should report page " + page);
+         assertEquals(Integer.toString(page),
+                 m.get(TikaPagedText.PAGE_NUMBER),
+                 "PAGE_NUMBER should equal " + page + " for single-page picture: " + path);
+         imagesChecked++;
+     }
+     // Guards against the loop silently matching nothing.
+     assertEquals(expected.size(), imagesChecked,
+             "expected " + expected.size() + " tagged pictures");
+ }
+
+ /**
+  * Feeds a synthetic {@code content.xml} straight into
+  * {@code OpenDocumentParser.PicturePageHandler} via SAX and checks the
+  * resulting href-to-pages map: one picture on pages 1 and 3, one picture
+  * on page 2 only, and no entry for a picture that appears only inside a
+  * master-page.
+  */
+ @Test
+ public void testOdpPicturePageHandlerMultiPage() throws Exception {
+     // Direct unit test of the SAX pre-scan: a synthetic content.xml
+     // with the same picture referenced from draw:page 1 AND draw:page 3,
+     // a second picture only on draw:page 2, and a third picture inside
+     // a master-page (outside any draw:page) so it should NOT be tagged.
+     String contentXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" +
+             "<office:document-content " +
+             "xmlns:office=\"urn:oasis:names:tc:opendocument:xmlns:office:1.0\" " +
+             "xmlns:draw=\"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0\" " +
+             "xmlns:style=\"urn:oasis:names:tc:opendocument:xmlns:style:1.0\" " +
+             "xmlns:xlink=\"http://www.w3.org/1999/xlink\">" +
+             "<office:master-styles>" +
+             "<style:master-page>" +
+             "<draw:image xlink:href=\"Pictures/master.png\"/>" +
+             "</style:master-page>" +
+             "</office:master-styles>" +
+             "<office:body><office:presentation>" +
+             "<draw:page><draw:frame>" +
+             "<draw:image xlink:href=\"Pictures/shared.png\"/>" +
+             "</draw:frame></draw:page>" +
+             "<draw:page>" +
+             "<draw:image xlink:href=\"Pictures/only_page2.png\"/>" +
+             "</draw:page>" +
+             "<draw:page>" +
+             "<draw:image xlink:href=\"Pictures/shared.png\"/>" +
+             "</draw:page>" +
+             "</office:presentation></office:body>" +
+             "</office:document-content>";
+
+     OpenDocumentParser.PicturePageHandler handler =
+             new OpenDocumentParser.PicturePageHandler();
+     XMLReaderUtils.parseSAX(
+             new java.io.ByteArrayInputStream(contentXml.getBytes(StandardCharsets.UTF_8)),
+             handler, new ParseContext());
+
+     java.util.Map<String, java.util.Set<Integer>> pages = handler.getPicturePages();
+     assertEquals(new java.util.LinkedHashSet<>(Arrays.asList(1, 3)),
+             pages.get("Pictures/shared.png"),
+             "shared picture should be on draw:page 1 and 3");
+     assertEquals(new java.util.LinkedHashSet<>(java.util.Collections.singletonList(2)),
+             pages.get("Pictures/only_page2.png"),
+             "page-2 picture should only be on draw:page 2");
+     assertNull(pages.get("Pictures/master.png"),
+             "master-page picture should NOT be tagged with any draw:page");
+ }
+
@Test
public void testVersions() throws Exception {
//test at least that all files from