This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 36211dda7d TIKA-4683 -- hyphens in ooxml (#2799)
36211dda7d is described below

commit 36211dda7d7eacef9dd0a2901511fe86f4b2efd2
Author: Tim Allison <[email protected]>
AuthorDate: Sat May 2 13:38:47 2026 -0400

    TIKA-4683 -- hyphens in ooxml (#2799)
---
 .../parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index 46e25b299d..a93bfd7d08 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -69,6 +69,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private final static String STRIKE = "strike";
     private final static String NUM_PR = "numPr";
     private final static String BR = "br";
+    private final static String NO_BREAK_HYPHEN = "noBreakHyphen";
+    private final static String SOFT_HYPHEN = "softHyphen";
     private final static String HYPERLINK = "hyperlink";
     private final static String HLINK_CLICK = "hlinkClick"; //pptx hlink
     private final static String TBL = "tbl";
@@ -305,6 +307,12 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             }
         } else if (BR.equals(localName)) {
             runBuffer.append(NEWLINE);
+        } else if (NO_BREAK_HYPHEN.equals(localName)) {
+            // <w:noBreakHyphen/> — emit U+2011 NON-BREAKING HYPHEN
+            runBuffer.append('\u2011');
+        } else if (SOFT_HYPHEN.equals(localName)) {
+            // <w:softHyphen/> — emit U+00AD SOFT HYPHEN (invisible hyphenation hint)
+            runBuffer.append('\u00AD');
         } else if (BOOKMARK_START.equals(localName)) {
             String name = atts.getValue(W_NS, "name");
             String id = atts.getValue(W_NS, "id");

Reply via email to