This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch haystack-poi-embedded-filenames
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1d6a8d79b67e563c51f9cf132237001211f583be
Author: tballison <[email protected]>
AuthorDate: Wed May 13 12:43:59 2026 -0400

    embedded file names and pagination in hslf
---
 .../parser/microsoft/AbstractPOIFSExtractor.java   | 22 +++++++++++--
 .../tika/parser/microsoft/HSLFExtractor.java       | 36 ++++++++++++++++++++++
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 37267b6644..6bb79a16c0 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
+import java.util.Locale;
 
 import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
 import org.apache.poi.hpsf.ClassID;
@@ -213,7 +214,7 @@ abstract class AbstractPOIFSExtractor {
                     metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
                 } else {
                     metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
-                            name + '.' + type.getExtension());
+                            appendExtensionIfMissing(name, 
type.getExtension()));
                     
metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
                 }
             }
@@ -225,7 +226,7 @@ abstract class AbstractPOIFSExtractor {
                     metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName);
                 } else {
                     metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
-                            rName + '.' + type.getExtension());
+                            appendExtensionIfMissing(rName, 
type.getExtension()));
                     
metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
                 }
             }
@@ -320,7 +321,7 @@ abstract class AbstractPOIFSExtractor {
 
             // Record what we can do about it
             metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
-            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + 
extension);
+            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
appendExtensionIfMissing(rName, extension));
             metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, 
true);
             metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
             parseEmbedded(parentDir, tis, xhtml, metadata, outputHtml);
@@ -416,6 +417,21 @@ abstract class AbstractPOIFSExtractor {
     }
 
 
+    /**
+     * Appends {@code ext} to {@code name} only when {@code name} does not 
already end with it
+     * (case-insensitive). {@code ext} may or may not have a leading dot.
+     */
+    private static String appendExtensionIfMissing(String name, String ext) {
+        if (StringUtils.isBlank(ext)) {
+            return name;
+        }
+        String dotExt = ext.startsWith(".") ? ext : "." + ext;
+        if 
(name.toLowerCase(Locale.ROOT).endsWith(dotExt.toLowerCase(Locale.ROOT))) {
+            return name;
+        }
+        return name + dotExt;
+    }
+
     public static String tryToGetMsgTitle(DirectoryEntry node, String 
defaultVal) {
 
         for (String entryName : new String[] {"__substg1.0_0037001F", 
"__substg1.0_0E1D001F", "__substg1.0_0070001F"} ) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index e2f10977bc..95232fba3d 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -19,8 +19,10 @@ package org.apache.tika.parser.microsoft;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.poi.common.usermodel.Hyperlink;
@@ -37,6 +39,7 @@ import org.apache.poi.hslf.usermodel.HSLFNotes;
 import org.apache.poi.hslf.usermodel.HSLFObjectData;
 import org.apache.poi.hslf.usermodel.HSLFObjectShape;
 import org.apache.poi.hslf.usermodel.HSLFPictureData;
+import org.apache.poi.hslf.usermodel.HSLFPictureShape;
 import org.apache.poi.hslf.usermodel.HSLFShape;
 import org.apache.poi.hslf.usermodel.HSLFSlide;
 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
@@ -61,6 +64,7 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -553,6 +557,12 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
 
     private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, 
XHTMLContentHandler xhtml)
             throws TikaException, SAXException, IOException {
+        // Map picture index → slide numbers so an image on exactly one slide gets its page number
+        Map<Integer, Set<Integer>> picToSlides = new HashMap<>();
+        for (HSLFSlide slide : slideshow.getSlides()) {
+            collectPictureSlides(slide, slide.getSlideNumber(), picToSlides);
+        }
+
         for (HSLFPictureData pic : slideshow.getPictureData()) {
             String mediaType;
 
@@ -585,12 +595,38 @@ public class HSLFExtractor extends AbstractPOIFSExtractor 
{
                         pic.getIndex(), mediaType);
                 Metadata picMetadata = Metadata.newInstance(context);
                 
picMetadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
+                Set<Integer> slideNums = picToSlides.get(pic.getIndex());
+                if (slideNums != null && slideNums.size() == 1) {
+                    picMetadata.set(TikaPagedText.PAGE_NUMBER, 
slideNums.iterator().next());
+                }
                 handleEmbeddedResource(picIs, picMetadata, picName, null,
                         null, mediaType, xhtml, false);
             }
         }
     }
 
+    /**
+     * Walks all shapes in {@code container}, recursing into group shapes, and for each
+     * {@link HSLFPictureShape} adds the 1-based slide number to the set stored in
+     * {@code picToSlides} under that picture's index.
+     */
+    private void collectPictureSlides(ShapeContainer container, int slideNum,
+                                      Map<Integer, Set<Integer>> picToSlides) {
+        List<HSLFShape> shapes = getShapes(container);
+        if (shapes == null) {
+            return;
+        }
+        for (HSLFShape shape : shapes) {
+            if (shape instanceof HSLFPictureShape) {
+                HSLFPictureData pd = ((HSLFPictureShape) 
shape).getPictureData();
+                if (pd != null) {
+                    picToSlides.computeIfAbsent(pd.getIndex(), k -> new 
HashSet<>()).add(slideNum);
+                }
+            } else if (shape instanceof HSLFGroupShape) {
+                collectPictureSlides((HSLFGroupShape) shape, slideNum, 
picToSlides);
+            }
+        }
+    }
+
     private void handleSlideEmbeddedResources(ShapeContainer shapeContainer,
                                               XHTMLContentHandler xhtml)
             throws TikaException, SAXException {

Reply via email to