This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 07e08fbe99 clean up dwg parsing (#2770)
07e08fbe99 is described below

commit 07e08fbe997daba691149926dc0c316fbb6199ae
Author: Tim Allison <[email protected]>
AuthorDate: Fri Apr 17 08:54:57 2026 -0400

    clean up dwg parsing (#2770)
---
 .../main/java/org/apache/tika/metadata/DWG.java    |  33 ++++++
 .../org/apache/tika/parser/dwg/DWGReadParser.java  | 132 +++++++++++++++++++--
 .../org/apache/tika/parser/dwg/DWGParserTest.java  |  94 +++++++++++++--
 3 files changed, 239 insertions(+), 20 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DWG.java b/tika-core/src/main/java/org/apache/tika/metadata/DWG.java
new file mode 100644
index 0000000000..99f5946c70
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/DWG.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * DWG-specific properties surfaced by LibreDWG's dwgread JSON output.
+ */
+public interface DWG {
+
+    String DWG_PREFIX = "dwg" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+    Property APPLICATION_NAME = Property.externalText(DWG_PREFIX + "applicationName");
+
+    Property APPLICATION_VERSION = Property.externalText(DWG_PREFIX + "applicationVersion");
+
+    Property APPLICATION_COMMENT = Property.externalText(DWG_PREFIX + "applicationComment");
+
+    Property PRODUCT_INFO = Property.externalText(DWG_PREFIX + "productInfo");
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
index 098969c7c9..a232b4d8ce 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
@@ -28,6 +28,7 @@ import java.nio.file.Files;
 import java.time.Instant;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HexFormat;
 import java.util.List;
 import java.util.Set;
 import java.util.UUID;
@@ -48,11 +49,15 @@ import org.xml.sax.SAXException;
 
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.DWG;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.ExceptionUtils;
 import org.apache.tika.utils.FileProcessResult;
@@ -116,9 +121,9 @@ public class DWGReadParser extends AbstractDWGParser {
             List<String> command = Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
                     tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
             ProcessBuilder pb = new ProcessBuilder().command(command);
-            LOG.info("About to call DWGRead: " + command.toString());
+            LOG.debug("About to call DWGRead: {}", command);
             FileProcessResult fpr = ProcessUtils.execute(pb, dwgc.getDwgReadTimeout(), 10000, 10000);
-            LOG.info("DWGRead Exit code is: " + fpr.getExitValue());
+            LOG.debug("DWGRead Exit code is: {}", fpr.getExitValue());
             if (fpr.getExitValue() == 0) {
                 if (dwgc.isCleanDwgReadOutput()) {
                     // dwgread sometimes creates strings with invalid utf-8 sequences or invalid
@@ -216,6 +221,10 @@ public class DWGReadParser extends AbstractDWGParser {
                             parseHeader(jParser, metadata);
                         } else if ("SummaryInfo".equals(nextFieldName)) {
                             parseSummaryInfo(jParser, metadata);
+                        } else if ("AppInfo".equals(nextFieldName)) {
+                            parseAppInfo(jParser, metadata);
+                        } else if ("THUMBNAILIMAGE".equals(nextFieldName)) {
+                            parseThumbnail(jParser, xhtml, metadata, context);
                         } else {
                             jParser.skipChildren();
                         }
@@ -233,6 +242,17 @@ public class DWGReadParser extends AbstractDWGParser {
         xhtml.endDocument();
     }
 
+    private static final Set<String> OBJECT_TEXT_FIELDS = Set.of(
+            "text",
+            "text_value",
+            "default_value",
+            "user_text",
+            "tag",
+            "prompt",
+            "code",
+            "value_string",
+            "ctx.content.txt.default_text");
+
     private void parseDwgObject(JsonParser jsonParser, Consumer<String> textConsumer) throws IOException {
         JsonToken nextToken;
         while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
@@ -242,18 +262,10 @@ public class DWGReadParser extends AbstractDWGParser {
                 if (nextToken.isStructStart()) {
                     jsonParser.skipChildren();
                 } else if (nextToken.isScalarValue()) {
-                    if ("text".equals(nextFieldName)) {
-                        String textVal = jsonParser.getText();
-                        if (StringUtils.isNotBlank(textVal)) {
-
-                            textConsumer.accept(textVal);
-                        }
-                    } else if ("text_value".equals(nextFieldName)) {
+                    if (OBJECT_TEXT_FIELDS.contains(nextFieldName)) {
                         String textVal = jsonParser.getText();
-                        if (StringUtils.isNotBlank(textVal)) {
-
+                        if (hasLetter(textVal)) {
                             textConsumer.accept(textVal);
-
                         }
                     }
                 }
@@ -261,6 +273,18 @@ public class DWGReadParser extends AbstractDWGParser {
         }
     }
 
+    private static boolean hasLetter(String s) {
+        if (s == null || s.isEmpty()) {
+            return false;
+        }
+        for (int i = 0; i < s.length(); i++) {
+            if (Character.isLetter(s.charAt(i))) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     private void parseHeader(JsonParser jsonParser, Metadata metadata) throws IOException {
         JsonToken nextToken;
         while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
@@ -312,6 +336,16 @@ public class DWGReadParser extends AbstractDWGParser {
                             metadata.set(TikaCoreProperties.TITLE, textVal);
                         } else if ("LASTSAVEDBY".equals(nextFieldName)) {
                             metadata.set(TikaCoreProperties.MODIFIER, textVal);
+                        } else if ("AUTHOR".equals(nextFieldName)) {
+                            metadata.set(TikaCoreProperties.CREATOR, textVal);
+                        } else if ("SUBJECT".equals(nextFieldName)) {
+                            metadata.set(TikaCoreProperties.DESCRIPTION, textVal);
+                        } else if ("KEYWORDS".equals(nextFieldName)) {
+                            metadata.set(TikaCoreProperties.SUBJECT, textVal);
+                        } else if ("COMMENTS".equals(nextFieldName)) {
+                            metadata.set(TikaCoreProperties.COMMENTS, textVal);
+                        } else if ("HYPERLINKBASE".equals(nextFieldName)) {
+                            metadata.set(TikaCoreProperties.RELATION, textVal);
                         } else if (!Strings.CI.startsWith(nextFieldName, "unknown")) {
                             metadata.set(nextFieldName, textVal);
                         }
@@ -321,4 +355,78 @@ public class DWGReadParser extends AbstractDWGParser {
         }
     }
 
+    private void parseAppInfo(JsonParser jsonParser, Metadata metadata) throws IOException {
+        JsonToken nextToken;
+        while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
+            if (nextToken == JsonToken.FIELD_NAME) {
+                String nextFieldName = jsonParser.currentName();
+                nextToken = jsonParser.nextToken();
+                if (nextToken.isStructStart()) {
+                    jsonParser.skipChildren();
+                } else if (nextToken.isScalarValue()) {
+                    String textVal = jsonParser.getText();
+                    if (StringUtils.isBlank(textVal)) {
+                        continue;
+                    }
+                    if ("appinfo_name".equals(nextFieldName)) {
+                        metadata.set(DWG.APPLICATION_NAME, textVal);
+                    } else if ("version".equals(nextFieldName)) {
+                        metadata.set(DWG.APPLICATION_VERSION, textVal);
+                    } else if ("comment".equals(nextFieldName)) {
+                        metadata.set(DWG.APPLICATION_COMMENT, textVal);
+                    } else if ("product_info".equals(nextFieldName)) {
+                        metadata.set(DWG.PRODUCT_INFO, textVal);
+                    }
+                }
+            }
+        }
+    }
+
+    private void parseThumbnail(JsonParser jsonParser, XHTMLContentHandler xhtml,
+                                Metadata metadata, ParseContext context)
+            throws IOException, SAXException {
+        String hexChain = null;
+        JsonToken nextToken;
+        while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
+            if (nextToken == JsonToken.FIELD_NAME) {
+                String nextFieldName = jsonParser.currentName();
+                nextToken = jsonParser.nextToken();
+                if (nextToken.isStructStart()) {
+                    jsonParser.skipChildren();
+                } else if (nextToken.isScalarValue() && "chain".equals(nextFieldName)) {
+                    hexChain = jsonParser.getText();
+                }
+            }
+        }
+        if (StringUtils.isBlank(hexChain)) {
+            return;
+        }
+        byte[] bytes;
+        try {
+            bytes = HexFormat.of().parseHex(hexChain);
+        } catch (IllegalArgumentException e) {
+            LOG.warn("Failed to decode DWG thumbnail hex chain", e);
+            return;
+        }
+        if (bytes.length == 0) {
+            return;
+        }
+
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        Metadata embeddedMetadata = new Metadata();
+        embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "thumbnail");
+        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+            return;
+        }
+        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+            embeddedDocumentExtractor.parseEmbedded(tis,
+                    new EmbeddedContentHandler(xhtml), embeddedMetadata, context, true);
+        } catch (IOException e) {
+            LOG.warn("Failed to parse DWG thumbnail", e);
+        }
+    }
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index e8c9c2d848..c652befed7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -19,14 +19,18 @@ package org.apache.tika.parser.dwg;
 
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
+import java.io.File;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Arrays;
+import java.util.List;
 
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
@@ -35,6 +39,7 @@ import org.apache.tika.TikaTest;
 import org.apache.tika.config.loader.TikaLoader;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.DWG;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.CompositeParser;
@@ -45,16 +50,45 @@ import org.apache.tika.utils.StringUtils;
 
 public class DWGParserTest extends TikaTest {
     public boolean canRun(DWGParser parser)  {
-        String dwgRead = parser.getDefaultConfig().getDwgReadExecutable();
-
-        if (!StringUtils.isBlank(dwgRead) && !Files.isRegularFile(Paths.get(dwgRead))) {
+        String resolved = resolveDwgRead(parser.getDefaultConfig().getDwgReadExecutable());
+        if (resolved == null) {
             return false;
         }
-
-        // Try running DWGRead from there, and see if it exists + works
-        String[] checkCmd = { dwgRead };
+        // Point the parser config at the resolved executable so tests "just work"
+        // on whichever machine has libredwg installed.
+        parser.getDefaultConfig().setDwgReadExecutable(resolved);
+        String[] checkCmd = {resolved};
         return ProcessUtils.checkCommand(checkCmd);
+    }
 
+    /**
+     * Look for dwgread in (1) the DWGREAD_EXE env var, (2) the configured path,
+     * (3) on PATH. Returns null if none are found.
+     */
+    private static String resolveDwgRead(String configPath) {
+        String env = System.getenv("DWGREAD_EXE");
+        if (!StringUtils.isBlank(env) && Files.isRegularFile(Paths.get(env))) {
+            return env;
+        }
+        if (!StringUtils.isBlank(configPath) && Files.isRegularFile(Paths.get(configPath))) {
+            return configPath;
+        }
+        boolean windows = System.getProperty("os.name")
+                .toLowerCase(java.util.Locale.ROOT).contains("win");
+        String exeName = windows ? "dwgread.exe" : "dwgread";
+        String pathEnv = System.getenv("PATH");
+        if (pathEnv != null) {
+            for (String dir : pathEnv.split(File.pathSeparator)) {
+                if (dir.isEmpty()) {
+                    continue;
+                }
+                Path candidate = Paths.get(dir, exeName);
+                if (Files.isRegularFile(candidate)) {
+                    return candidate.toString();
+                }
+            }
+        }
+        return null;
     }
     @Test
     public void testDWG2000Parser() throws Exception {
@@ -221,8 +255,52 @@ public class DWGParserTest extends TikaTest {
                         .loadParsers())
                         .getAllComponentParsers().get(0);
         assumeTrue(canRun(parser), "Can't run DWGRead.exe");
-        String output = getText("architectural_-_annotation_scaling_and_multileaders.dwg", parser);
-        assertContains("ELEV. 11'-9\" TOP OF SECOND FLR.",output);
+        List<Metadata> metadataList = getRecursiveMetadata(
+                "architectural_-_annotation_scaling_and_multileaders.dwg", parser);
+        Metadata root = metadataList.get(0);
+
+        String content = root.get(TikaCoreProperties.TIKA_CONTENT);
+        assertContains("ELEV. 11'-9\" TOP OF SECOND FLR.", content);
+        // MULTILEADER ctx.content.txt.default_text
+        assertContains("EPDM ROOF CONSTRUCTION", content);
+        assertContains("O.S.B SHEATHING", content);
+        // ATTRIB tag / prompt
+        assertContains("Enter sheet number", content);
+
+        // AppInfo
+        assertEquals("AppInfoDataList", root.get(DWG.APPLICATION_NAME));
+        assertEquals("17.1.51.0", root.get(DWG.APPLICATION_VERSION));
+        assertNotNull(root.get(DWG.APPLICATION_COMMENT));
+        assertContains("AutoCAD", root.get(DWG.PRODUCT_INFO));
+
+        // Thumbnail embedded as INLINE
+        Metadata thumb = null;
+        for (int i = 1; i < metadataList.size(); i++) {
+            String type = metadataList.get(i).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (TikaCoreProperties.EmbeddedResourceType.INLINE.name().equals(type)) {
+                thumb = metadataList.get(i);
+                break;
+            }
+        }
+        assertNotNull(thumb, "Expected an INLINE thumbnail attachment");
+    }
+
+    @Test
+    public void testDWGReadSummaryInfoMapping() throws Exception {
+        DWGParser parser =
+                (DWGParser) ((CompositeParser) TikaLoader.load(
+                                getConfigPath(DWGParserTest.class, "tika-config-dwgRead.json"))
+                        .loadParsers())
+                        .getAllComponentParsers().get(0);
+        assumeTrue(canRun(parser), "Can't run DWGRead.exe");
+        Metadata metadata = getXML("testDWGmech2004.dwg", parser).metadata;
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Subject", metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("My Author", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("My keyword1, MyKeyword2", metadata.get(TikaCoreProperties.SUBJECT));
+        assertEquals("This is a comment", metadata.get(TikaCoreProperties.COMMENTS));
+        assertEquals("bejanpol", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("http://mycompany/drawings", metadata.get(TikaCoreProperties.RELATION));
     }
 
     @Test

Reply via email to