This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch haystack-poi-embedded-filenames in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1d6a8d79b67e563c51f9cf132237001211f583be Author: tballison <[email protected]> AuthorDate: Wed May 13 12:43:59 2026 -0400 embedded file names and pagination in hslf --- .../parser/microsoft/AbstractPOIFSExtractor.java | 22 +++++++++++-- .../tika/parser/microsoft/HSLFExtractor.java | 36 ++++++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index 37267b6644..6bb79a16c0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.Iterator; +import java.util.Locale; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.apache.poi.hpsf.ClassID; @@ -213,7 +214,7 @@ abstract class AbstractPOIFSExtractor { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); } else { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, - name + '.' + type.getExtension()); + appendExtensionIfMissing(name, type.getExtension())); metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true); } } @@ -225,7 +226,7 @@ abstract class AbstractPOIFSExtractor { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName); } else { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, - rName + '.' + type.getExtension()); + appendExtensionIfMissing(rName, type.getExtension())); metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true); } } @@ -320,7 +321,7 @@ abstract class AbstractPOIFSExtractor { // Record what we can do about it metadata.set(Metadata.CONTENT_TYPE, mediaType.toString()); - metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + extension); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, appendExtensionIfMissing(rName, extension)); metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true); metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length)); parseEmbedded(parentDir, tis, xhtml, metadata, outputHtml); @@ -416,6 +417,21 @@ abstract class AbstractPOIFSExtractor { } + /** + * Appends {@code ext} to {@code name} only when {@code name} does not already end with it + * (case-insensitive). {@code ext} may or may not have a leading dot. + */ + private static String appendExtensionIfMissing(String name, String ext) { + if (StringUtils.isBlank(ext)) { + return name; + } + String dotExt = ext.startsWith(".") ? ext : "." + ext; + if (name.toLowerCase(Locale.ROOT).endsWith(dotExt.toLowerCase(Locale.ROOT))) { + return name; + } + return name + dotExt; + } + public static String tryToGetMsgTitle(DirectoryEntry node, String defaultVal) { for (String entryName : new String[] {"__substg1.0_0037001F", "__substg1.0_0E1D001F", "__substg1.0_0070001F"} ) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java index e2f10977bc..95232fba3d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java @@ -19,8 +19,10 @@ package org.apache.tika.parser.microsoft; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import org.apache.poi.common.usermodel.Hyperlink; @@ -37,6 +39,7 @@ import org.apache.poi.hslf.usermodel.HSLFNotes; import org.apache.poi.hslf.usermodel.HSLFObjectData; import org.apache.poi.hslf.usermodel.HSLFObjectShape; import org.apache.poi.hslf.usermodel.HSLFPictureData; +import org.apache.poi.hslf.usermodel.HSLFPictureShape; import org.apache.poi.hslf.usermodel.HSLFShape; import org.apache.poi.hslf.usermodel.HSLFSlide; import org.apache.poi.hslf.usermodel.HSLFSlideShow; @@ -61,6 +64,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.TikaPagedText; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; @@ -553,6 +557,12 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException { + // Build picture-index → slide numbers map so each image carries its page number + Map<Integer, Set<Integer>> picToSlides = new HashMap<>(); + for (HSLFSlide slide : slideshow.getSlides()) { + collectPictureSlides(slide, slide.getSlideNumber(), picToSlides); + } + for (HSLFPictureData pic : slideshow.getPictureData()) { String mediaType; @@ -585,12 +595,38 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { pic.getIndex(), mediaType); Metadata picMetadata = Metadata.newInstance(context); picMetadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true); + Set<Integer> slideNums = picToSlides.get(pic.getIndex()); + if (slideNums != null && slideNums.size() == 1) { + picMetadata.set(TikaPagedText.PAGE_NUMBER, slideNums.iterator().next()); + } handleEmbeddedResource(picIs, picMetadata, picName, null, null, mediaType, xhtml, false); } } } + /** + * Walks all shapes in {@code container} and records, for each + * {@link HSLFPictureShape}, the 1-based slide number in {@code picToSlides}. + */ + private void collectPictureSlides(ShapeContainer container, int slideNum, + Map<Integer, Set<Integer>> picToSlides) { + List<HSLFShape> shapes = getShapes(container); + if (shapes == null) { + return; + } + for (HSLFShape shape : shapes) { + if (shape instanceof HSLFPictureShape) { + HSLFPictureData pd = ((HSLFPictureShape) shape).getPictureData(); + if (pd != null) { + picToSlides.computeIfAbsent(pd.getIndex(), k -> new HashSet<>()).add(slideNum); + } + } else if (shape instanceof HSLFGroupShape) { + collectPictureSlides((HSLFGroupShape) shape, slideNum, picToSlides); + } + } + } + private void handleSlideEmbeddedResources(ShapeContainer shapeContainer, XHTMLContentHandler xhtml) throws TikaException, SAXException {
