oscerd commented on code in PR #20979:
URL: https://github.com/apache/camel/pull/20979#discussion_r2716813146
##########
components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java:
##########
@@ -395,45 +399,16 @@ private DocumentMetadata extractMetadataUsingApi(String
inputPath) throws IOExce
}
}
- // Try to extract metadata from the JSON structure
- if (rootNode.has(DoclingMetadataFields.METADATA)) {
- JsonNode metadataNode =
rootNode.get(DoclingMetadataFields.METADATA);
- extractMetadataFieldsFromJson(metadata, metadataNode);
- }
-
- // Look for document-level information
- if (rootNode.has(DoclingMetadataFields.DOCUMENT)) {
- JsonNode docNode =
rootNode.get(DoclingMetadataFields.DOCUMENT);
- if (docNode.has(DoclingMetadataFields.NAME) &&
metadata.getTitle() == null) {
-
metadata.setTitle(docNode.get(DoclingMetadataFields.NAME).asText());
- }
- }
+ metadata.setPageCount(doclingDocument.getPages().size());
- // Extract main text to determine document type/format
- if (rootNode.has(DoclingMetadataFields.MAIN_TEXT)) {
- JsonNode mainTextNode =
rootNode.get(DoclingMetadataFields.MAIN_TEXT);
- if (mainTextNode.isArray() && mainTextNode.size() > 0) {
- // Document has text content
- metadata.setDocumentType("Text Document");
- }
- }
-
- // Count pages if available
- if (rootNode.has(DoclingMetadataFields.PAGES)) {
- if (rootNode.get(DoclingMetadataFields.PAGES).isArray()) {
-
metadata.setPageCount(rootNode.get(DoclingMetadataFields.PAGES).size());
- } else if (rootNode.get(DoclingMetadataFields.PAGES).isInt()) {
-
metadata.setPageCount(rootNode.get(DoclingMetadataFields.PAGES).asInt());
- }
- } else if (rootNode.has(DoclingMetadataFields.NUM_PAGES)) {
-
metadata.setPageCount(rootNode.get(DoclingMetadataFields.NUM_PAGES).asInt());
- } else if (rootNode.has(DoclingMetadataFields.PAGE_COUNT)) {
-
metadata.setPageCount(rootNode.get(DoclingMetadataFields.PAGE_COUNT).asInt());
+ if (doclingDocument.getOrigin() != null &&
doclingDocument.getOrigin().getMimetype() != null) {
+ metadata.setFormat(doclingDocument.getOrigin().getMimetype());
}
-
+
// Store raw metadata if requested
if (configuration.isIncludeRawMetadata()) {
@SuppressWarnings("unchecked")
+ JsonNode rootNode = objectMapper.readTree(jsonOutput);
Map<String, Object> rawMap =
objectMapper.convertValue(rootNode, Map.class);
metadata.setRawMetadata(rawMap);
}
Review Comment:
Yes, this would make sense and would also avoid all the JSON parsing.
##########
components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java:
##########
@@ -395,45 +399,16 @@ private DocumentMetadata extractMetadataUsingApi(String
inputPath) throws IOExce
}
}
- // Try to extract metadata from the JSON structure
- if (rootNode.has(DoclingMetadataFields.METADATA)) {
- JsonNode metadataNode =
rootNode.get(DoclingMetadataFields.METADATA);
- extractMetadataFieldsFromJson(metadata, metadataNode);
- }
-
- // Look for document-level information
- if (rootNode.has(DoclingMetadataFields.DOCUMENT)) {
- JsonNode docNode =
rootNode.get(DoclingMetadataFields.DOCUMENT);
- if (docNode.has(DoclingMetadataFields.NAME) &&
metadata.getTitle() == null) {
-
metadata.setTitle(docNode.get(DoclingMetadataFields.NAME).asText());
- }
- }
+ metadata.setPageCount(doclingDocument.getPages().size());
Review Comment:
yes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]