This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 07e08fbe99 clean up dwg parsing (#2770)
07e08fbe99 is described below
commit 07e08fbe997daba691149926dc0c316fbb6199ae
Author: Tim Allison <[email protected]>
AuthorDate: Fri Apr 17 08:54:57 2026 -0400
clean up dwg parsing (#2770)
---
.../main/java/org/apache/tika/metadata/DWG.java | 33 ++++++
.../org/apache/tika/parser/dwg/DWGReadParser.java | 132 +++++++++++++++++++--
.../org/apache/tika/parser/dwg/DWGParserTest.java | 94 +++++++++++++--
3 files changed, 239 insertions(+), 20 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DWG.java b/tika-core/src/main/java/org/apache/tika/metadata/DWG.java
new file mode 100644
index 0000000000..99f5946c70
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/DWG.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * DWG-specific properties surfaced by LibreDWG's dwgread JSON output.
+ */
+public interface DWG {
+
+ String DWG_PREFIX = "dwg" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+ Property APPLICATION_NAME = Property.externalText(DWG_PREFIX + "applicationName");
+
+ Property APPLICATION_VERSION = Property.externalText(DWG_PREFIX + "applicationVersion");
+
+ Property APPLICATION_COMMENT = Property.externalText(DWG_PREFIX + "applicationComment");
+
+ Property PRODUCT_INFO = Property.externalText(DWG_PREFIX + "productInfo");
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
index 098969c7c9..a232b4d8ce 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
@@ -28,6 +28,7 @@ import java.nio.file.Files;
import java.time.Instant;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HexFormat;
import java.util.List;
import java.util.Set;
import java.util.UUID;
@@ -48,11 +49,15 @@ import org.xml.sax.SAXException;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.DWG;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.FileProcessResult;
@@ -116,9 +121,9 @@ public class DWGReadParser extends AbstractDWGParser {
List<String> command = Arrays.asList(dwgc.getDwgReadExecutable(),
"-O", "JSON", "-o",
tmpFileOut.getCanonicalPath(),
tmpFileIn.getCanonicalPath());
ProcessBuilder pb = new ProcessBuilder().command(command);
- LOG.info("About to call DWGRead: " + command.toString());
+ LOG.debug("About to call DWGRead: {}", command);
FileProcessResult fpr = ProcessUtils.execute(pb,
dwgc.getDwgReadTimeout(), 10000, 10000);
- LOG.info("DWGRead Exit code is: " + fpr.getExitValue());
+ LOG.debug("DWGRead Exit code is: {}", fpr.getExitValue());
if (fpr.getExitValue() == 0) {
if (dwgc.isCleanDwgReadOutput()) {
// dwgread sometimes creates strings with invalid utf-8
sequences or invalid
@@ -216,6 +221,10 @@ public class DWGReadParser extends AbstractDWGParser {
parseHeader(jParser, metadata);
} else if ("SummaryInfo".equals(nextFieldName)) {
parseSummaryInfo(jParser, metadata);
+ } else if ("AppInfo".equals(nextFieldName)) {
+ parseAppInfo(jParser, metadata);
+ } else if ("THUMBNAILIMAGE".equals(nextFieldName)) {
+ parseThumbnail(jParser, xhtml, metadata, context);
} else {
jParser.skipChildren();
}
@@ -233,6 +242,17 @@ public class DWGReadParser extends AbstractDWGParser {
xhtml.endDocument();
}
+ private static final Set<String> OBJECT_TEXT_FIELDS = Set.of(
+ "text",
+ "text_value",
+ "default_value",
+ "user_text",
+ "tag",
+ "prompt",
+ "code",
+ "value_string",
+ "ctx.content.txt.default_text");
+
private void parseDwgObject(JsonParser jsonParser, Consumer<String> textConsumer) throws IOException {
JsonToken nextToken;
while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
@@ -242,18 +262,10 @@ public class DWGReadParser extends AbstractDWGParser {
if (nextToken.isStructStart()) {
jsonParser.skipChildren();
} else if (nextToken.isScalarValue()) {
- if ("text".equals(nextFieldName)) {
- String textVal = jsonParser.getText();
- if (StringUtils.isNotBlank(textVal)) {
-
- textConsumer.accept(textVal);
- }
- } else if ("text_value".equals(nextFieldName)) {
+ if (OBJECT_TEXT_FIELDS.contains(nextFieldName)) {
String textVal = jsonParser.getText();
- if (StringUtils.isNotBlank(textVal)) {
-
+ if (hasLetter(textVal)) {
textConsumer.accept(textVal);
-
}
}
}
@@ -261,6 +273,18 @@ public class DWGReadParser extends AbstractDWGParser {
}
}
+ private static boolean hasLetter(String s) {
+ if (s == null || s.isEmpty()) {
+ return false;
+ }
+ for (int i = 0; i < s.length(); i++) {
+ if (Character.isLetter(s.charAt(i))) {
+ return true;
+ }
+ }
+ return false;
+ }
+
private void parseHeader(JsonParser jsonParser, Metadata metadata) throws IOException {
JsonToken nextToken;
while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
@@ -312,6 +336,16 @@ public class DWGReadParser extends AbstractDWGParser {
metadata.set(TikaCoreProperties.TITLE, textVal);
} else if ("LASTSAVEDBY".equals(nextFieldName)) {
metadata.set(TikaCoreProperties.MODIFIER, textVal);
+ } else if ("AUTHOR".equals(nextFieldName)) {
+ metadata.set(TikaCoreProperties.CREATOR, textVal);
+ } else if ("SUBJECT".equals(nextFieldName)) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, textVal);
+ } else if ("KEYWORDS".equals(nextFieldName)) {
+ metadata.set(TikaCoreProperties.SUBJECT, textVal);
+ } else if ("COMMENTS".equals(nextFieldName)) {
+ metadata.set(TikaCoreProperties.COMMENTS, textVal);
+ } else if ("HYPERLINKBASE".equals(nextFieldName)) {
+ metadata.set(TikaCoreProperties.RELATION, textVal);
} else if (!Strings.CI.startsWith(nextFieldName, "unknown")) {
metadata.set(nextFieldName, textVal);
}
@@ -321,4 +355,78 @@ public class DWGReadParser extends AbstractDWGParser {
}
}
+ private void parseAppInfo(JsonParser jsonParser, Metadata metadata) throws IOException {
+ JsonToken nextToken;
+ while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
+ if (nextToken == JsonToken.FIELD_NAME) {
+ String nextFieldName = jsonParser.currentName();
+ nextToken = jsonParser.nextToken();
+ if (nextToken.isStructStart()) {
+ jsonParser.skipChildren();
+ } else if (nextToken.isScalarValue()) {
+ String textVal = jsonParser.getText();
+ if (StringUtils.isBlank(textVal)) {
+ continue;
+ }
+ if ("appinfo_name".equals(nextFieldName)) {
+ metadata.set(DWG.APPLICATION_NAME, textVal);
+ } else if ("version".equals(nextFieldName)) {
+ metadata.set(DWG.APPLICATION_VERSION, textVal);
+ } else if ("comment".equals(nextFieldName)) {
+ metadata.set(DWG.APPLICATION_COMMENT, textVal);
+ } else if ("product_info".equals(nextFieldName)) {
+ metadata.set(DWG.PRODUCT_INFO, textVal);
+ }
+ }
+ }
+ }
+ }
+
+ private void parseThumbnail(JsonParser jsonParser, XHTMLContentHandler xhtml,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException {
+ String hexChain = null;
+ JsonToken nextToken;
+ while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
+ if (nextToken == JsonToken.FIELD_NAME) {
+ String nextFieldName = jsonParser.currentName();
+ nextToken = jsonParser.nextToken();
+ if (nextToken.isStructStart()) {
+ jsonParser.skipChildren();
+ } else if (nextToken.isScalarValue() && "chain".equals(nextFieldName)) {
+ hexChain = jsonParser.getText();
+ }
+ }
+ }
+ if (StringUtils.isBlank(hexChain)) {
+ return;
+ }
+ byte[] bytes;
+ try {
+ bytes = HexFormat.of().parseHex(hexChain);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("Failed to decode DWG thumbnail hex chain", e);
+ return;
+ }
+ if (bytes.length == 0) {
+ return;
+ }
+
+ EmbeddedDocumentExtractor embeddedDocumentExtractor =
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+ Metadata embeddedMetadata = new Metadata();
+ embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "thumbnail");
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+ if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+ return;
+ }
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+ embeddedDocumentExtractor.parseEmbedded(tis,
+ new EmbeddedContentHandler(xhtml), embeddedMetadata, context, true);
+ } catch (IOException e) {
+ LOG.warn("Failed to parse DWG thumbnail", e);
+ }
+ }
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index e8c9c2d848..c652befed7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -19,14 +19,18 @@ package org.apache.tika.parser.dwg;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
+import java.io.File;
import java.nio.file.Files;
+import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
+import java.util.List;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
@@ -35,6 +39,7 @@ import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.DWG;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.CompositeParser;
@@ -45,16 +50,45 @@ import org.apache.tika.utils.StringUtils;
public class DWGParserTest extends TikaTest {
public boolean canRun(DWGParser parser) {
- String dwgRead = parser.getDefaultConfig().getDwgReadExecutable();
-
- if (!StringUtils.isBlank(dwgRead) && !Files.isRegularFile(Paths.get(dwgRead))) {
+ String resolved = resolveDwgRead(parser.getDefaultConfig().getDwgReadExecutable());
+ if (resolved == null) {
return false;
}
-
- // Try running DWGRead from there, and see if it exists + works
- String[] checkCmd = { dwgRead };
+ // Point the parser config at the resolved executable so tests "just work"
+ // on whichever machine has libredwg installed.
+ parser.getDefaultConfig().setDwgReadExecutable(resolved);
+ String[] checkCmd = {resolved};
return ProcessUtils.checkCommand(checkCmd);
+ }
+ /**
+ * Look for dwgread in (1) the DWGREAD_EXE env var, (2) the configured path,
+ * (3) on PATH. Returns null if none are found.
+ */
+ private static String resolveDwgRead(String configPath) {
+ String env = System.getenv("DWGREAD_EXE");
+ if (!StringUtils.isBlank(env) && Files.isRegularFile(Paths.get(env))) {
+ return env;
+ }
+ if (!StringUtils.isBlank(configPath) && Files.isRegularFile(Paths.get(configPath))) {
+ return configPath;
+ }
+ boolean windows = System.getProperty("os.name")
+ .toLowerCase(java.util.Locale.ROOT).contains("win");
+ String exeName = windows ? "dwgread.exe" : "dwgread";
+ String pathEnv = System.getenv("PATH");
+ if (pathEnv != null) {
+ for (String dir : pathEnv.split(File.pathSeparator)) {
+ if (dir.isEmpty()) {
+ continue;
+ }
+ Path candidate = Paths.get(dir, exeName);
+ if (Files.isRegularFile(candidate)) {
+ return candidate.toString();
+ }
+ }
+ }
+ return null;
}
@Test
public void testDWG2000Parser() throws Exception {
@@ -221,8 +255,52 @@ public class DWGParserTest extends TikaTest {
.loadParsers())
.getAllComponentParsers().get(0);
assumeTrue(canRun(parser), "Can't run DWGRead.exe");
- String output = getText("architectural_-_annotation_scaling_and_multileaders.dwg", parser);
- assertContains("ELEV. 11'-9\" TOP OF SECOND FLR.",output);
+ List<Metadata> metadataList = getRecursiveMetadata(
+ "architectural_-_annotation_scaling_and_multileaders.dwg", parser);
+ Metadata root = metadataList.get(0);
+
+ String content = root.get(TikaCoreProperties.TIKA_CONTENT);
+ assertContains("ELEV. 11'-9\" TOP OF SECOND FLR.", content);
+ // MULTILEADER ctx.content.txt.default_text
+ assertContains("EPDM ROOF CONSTRUCTION", content);
+ assertContains("O.S.B SHEATHING", content);
+ // ATTRIB tag / prompt
+ assertContains("Enter sheet number", content);
+
+ // AppInfo
+ assertEquals("AppInfoDataList", root.get(DWG.APPLICATION_NAME));
+ assertEquals("17.1.51.0", root.get(DWG.APPLICATION_VERSION));
+ assertNotNull(root.get(DWG.APPLICATION_COMMENT));
+ assertContains("AutoCAD", root.get(DWG.PRODUCT_INFO));
+
+ // Thumbnail embedded as INLINE
+ Metadata thumb = null;
+ for (int i = 1; i < metadataList.size(); i++) {
+ String type = metadataList.get(i).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+ if (TikaCoreProperties.EmbeddedResourceType.INLINE.name().equals(type)) {
+ thumb = metadataList.get(i);
+ break;
+ }
+ }
+ assertNotNull(thumb, "Expected an INLINE thumbnail attachment");
+ }
+
+ @Test
+ public void testDWGReadSummaryInfoMapping() throws Exception {
+ DWGParser parser =
+ (DWGParser) ((CompositeParser) TikaLoader.load(
+ getConfigPath(DWGParserTest.class, "tika-config-dwgRead.json"))
+ .loadParsers())
+ .getAllComponentParsers().get(0);
+ assumeTrue(canRun(parser), "Can't run DWGRead.exe");
+ Metadata metadata = getXML("testDWGmech2004.dwg", parser).metadata;
+ assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Subject", metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("My Author", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("My keyword1, MyKeyword2", metadata.get(TikaCoreProperties.SUBJECT));
+ assertEquals("This is a comment", metadata.get(TikaCoreProperties.COMMENTS));
+ assertEquals("bejanpol", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("http://mycompany/drawings", metadata.get(TikaCoreProperties.RELATION));
}
@Test