awelless commented on code in PR #9825:
URL: https://github.com/apache/nifi/pull/9825#discussion_r2024861351
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/ExtractStructuredBoxFileMetadata.java:
##########
@@ -58,17 +59,26 @@
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@Tags({"box", "storage", "metadata", "ai", "extract"})
-@CapabilityDescription("Extracts metadata from a Box file using Box AI and a
template. The extracted metadata is written to the FlowFile content as JSON.")
+@CapabilityDescription("""
+ Extracts metadata from a Box file using Box AI. The extraction can use
either a template or a list of fields.\s
+ The extracted metadata is written to the FlowFile content as JSON.
+ """)
@SeeAlso({ListBoxFileMetadataTemplates.class, ListBoxFile.class,
FetchBoxFile.class, UpdateBoxFileMetadataInstance.class})
@WritesAttributes({
@WritesAttribute(attribute = "box.id", description = "The ID of the
file from which metadata was extracted"),
- @WritesAttribute(attribute = "box.ai.template.key", description = "The
template key used for extraction"),
+ @WritesAttribute(attribute = "box.ai.template.key", description = "The
template key used for extraction (when using TEMPLATE extraction method)"),
+ @WritesAttribute(attribute = "box.ai.extraction.method", description =
"The extraction method used (TEMPLATE or FIELDS)"),
@WritesAttribute(attribute = "box.ai.completion.reason", description =
"The completion reason from the AI extraction"),
@WritesAttribute(attribute = "mime.type", description = "Set to
'application/json' for the JSON content"),
@WritesAttribute(attribute = ERROR_CODE, description =
ERROR_CODE_DESC),
@WritesAttribute(attribute = ERROR_MESSAGE, description =
ERROR_MESSAGE_DESC)
})
-public class ExtractBoxFileMetadataWithBoxAI extends AbstractProcessor {
+public class ExtractStructuredBoxFileMetadata extends AbstractProcessor {
+
+ public static class ExtractionMethod {
+ public static final String TEMPLATE = "TEMPLATE";
+ public static final String FIELDS = "FIELDS";
+ }
Review Comment:
We can use an enum extending `DescribedValue` which allows providing better
documentation. Using an enum has further benefits, comments to which I'll add
further below.
```java
public enum ExtractionMethod implements DescribedValue {
TEMPLATE,
FIELDS;
private final String displayName;
private final String description;
// the c-tor, getters
}
```
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/CreateBoxMetadataTemplate.java:
##########
@@ -270,77 +224,65 @@ public void onTrigger(final ProcessContext context, final
ProcessSession session
}
private void processRecord(final Record record,
- final RecordPath keyRecordPath,
- final RecordPath typeRecordPath,
- final RecordPath displayNameRecordPath,
final List<MetadataTemplate.Field> fields,
final Set<String> processedKeys,
final List<String> errors) {
- final RecordPathResult keyPathResult = keyRecordPath.evaluate(record);
- final List<FieldValue> keyValues =
keyPathResult.getSelectedFields().toList();
-
- if (keyValues.isEmpty()) {
- errors.add("Record is missing a key field");
- return;
- }
-
- final Object keyObj = keyValues.getFirst().getValue();
+ // Extract and validate key (required)
+ final Object keyObj = record.getValue("key");
if (keyObj == null) {
- errors.add("Record has a null key value");
+ errors.add("Record is missing a key field");
return;
}
-
final String key = keyObj.toString();
- // Skip if we've already processed this key
if (processedKeys.contains(key)) {
- getLogger().warn("Duplicate key '{}' found in record, skipping",
key);
+ errors.add("Duplicate key '" + key + "' found in record,
skipping");
Review Comment:
Technically we don't skip the key. We fail the whole template creation.
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/ExtractStructuredBoxFileMetadata.java:
##########
@@ -202,16 +267,47 @@ BoxAIExtractStructuredResponse
getBoxAIExtractStructuredResponse(final String te
);
}
- private static @NotNull Map<String, String> getAttributes(final
BoxAIExtractStructuredResponse result,
- final String
fileId) {
- final String completionReason = result.getCompletionReason();
+ /**
+ * Extracts metadata from a Box file using the specified fields.
+ * Visible for testing purposes.
+ *
+ * @param fieldsString A comma-separated list of field names to extract.
+ * @param fileId The ID of the Box file from which to extract
metadata.
+ * @return The response containing the extracted metadata.
+ */
+ BoxAIExtractStructuredResponse
getBoxAIExtractStructuredResponseWithFields(final String fieldsString,
+
final String fileId) {
+ final List<BoxAIExtractField> fields = parseFields(fieldsString);
+ final BoxAIItem fileItem = new BoxAIItem(fileId, BoxAIItem.Type.FILE);
- final Map<String, String> attributes = new HashMap<>();
- attributes.put("box.id", fileId);
+ return BoxAI.extractMetadataStructured(
+ boxAPIConnection,
+ Collections.singletonList(fileItem),
+ fields
+ );
+ }
- if (completionReason != null) {
- attributes.put("box.ai.completion.reason", completionReason);
+ /**
+ * Parses a comma-separated string of field names into a list of
BoxAIExtractField objects.
+ *
+ * @param fieldsString A comma-separated list of field names.
+ * @return A list of BoxAIExtractField objects.
+ */
+ private List<BoxAIExtractField> parseFields(final String fieldsString) {
+ if (fieldsString == null || fieldsString.trim().isEmpty()) {
+ return Collections.emptyList();
}
- return attributes;
+
+ final List<BoxAIExtractField> fields = new ArrayList<>();
+ final String[] fieldNames = fieldsString.split(",");
+
+ for (final String fieldName : fieldNames) {
+ final String trimmedField = fieldName.trim();
+ if (!trimmedField.isEmpty()) {
+ fields.add(new BoxAIExtractField(trimmedField));
Review Comment:
Providing field names won't be sufficient. Besides a filed key at least a
type should be provided. Otherwise it's hard to reason about the output schema.
Can we accept just a json `fields` definition as [in the
API](https://developer.box.com/reference/post-ai-extract-structured/#param-fields),
and pass it as is?
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/UpdateBoxFileMetadataInstance.java:
##########
@@ -180,66 +160,45 @@ public void onTrigger(final ProcessContext context, final
ProcessSession session
}
final String fileId =
context.getProperty(FILE_ID).evaluateAttributeExpressions(flowFile).getValue();
- final String templateName =
context.getProperty(TEMPLATE_NAME).evaluateAttributeExpressions(flowFile).getValue();
- final String templateScope =
context.getProperty(TEMPLATE_SCOPE).evaluateAttributeExpressions(flowFile).getValue();
+ final String templateKey =
context.getProperty(TEMPLATE_KEY).evaluateAttributeExpressions(flowFile).getValue();
final RecordReaderFactory recordReaderFactory =
context.getProperty(RECORD_READER).asControllerService(RecordReaderFactory.class);
- final String keyRecordPathStr =
context.getProperty(KEY_RECORD_PATH).evaluateAttributeExpressions(flowFile).getValue();
- final String valueRecordPathStr =
context.getProperty(VALUE_RECORD_PATH).evaluateAttributeExpressions(flowFile).getValue();
-
- try (final InputStream inputStream = session.read(flowFile);
- final RecordReader recordReader =
recordReaderFactory.createRecordReader(flowFile, inputStream, getLogger())) {
-
- final RecordPath keyRecordPath =
RecordPath.compile(keyRecordPathStr);
- final RecordPath valueRecordPath =
RecordPath.compile(valueRecordPathStr);
-
- // Create metadata object
- final Metadata metadata = new Metadata(templateScope,
templateName);
- final Set<String> updatedKeys = new HashSet<>();
- final List<String> errors = new ArrayList<>();
-
- Record record;
- try {
- while ((record = recordReader.nextRecord()) != null) {
- processRecord(record, keyRecordPath, valueRecordPath,
metadata, updatedKeys, errors);
- }
- } catch (final Exception e) {
- getLogger().error("Error processing record: {}",
e.getMessage(), e);
- errors.add("Error processing record: " + e.getMessage());
- }
- if (!errors.isEmpty()) {
- flowFile = session.putAttribute(flowFile, ERROR_MESSAGE,
String.join(", ", errors));
- session.transfer(flowFile, REL_FAILURE);
- return;
- }
+ try {
+ final BoxFile boxFile = getBoxFile(fileId);
+ final Map<String, Object> desiredState = readDesiredState(session,
flowFile, recordReaderFactory);
- if (updatedKeys.isEmpty()) {
+ if (desiredState.isEmpty()) {
flowFile = session.putAttribute(flowFile, ERROR_MESSAGE, "No
valid metadata key-value pairs found in the input");
session.transfer(flowFile, REL_FAILURE);
return;
}
- final BoxFile boxFile = getBoxFile(fileId);
- boxFile.updateMetadata(metadata);
+ final Metadata metadata = getOrCreateMetadata(boxFile,
templateKey, fileId);
+ final Set<String> processedKeys = updateMetadata(metadata,
desiredState);
Review Comment:
`processedKeys` can be removed entirely. `metadata.getOperations()` should
be used instead.
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/ExtractStructuredBoxFileMetadata.java:
##########
@@ -143,16 +180,38 @@ public void onTrigger(final ProcessContext context, final
ProcessSession session
}
final String fileId =
context.getProperty(FILE_ID).evaluateAttributeExpressions(flowFile).getValue();
- final String templateKey =
context.getProperty(TEMPLATE_KEY).evaluateAttributeExpressions(flowFile).getValue();
+ final String extractionMethod =
context.getProperty(EXTRACTION_METHOD).getValue();
try {
- final BoxAIExtractStructuredResponse result =
getBoxAIExtractStructuredResponse(templateKey, fileId);
- flowFile = session.putAllAttributes(flowFile,
getAttributes(result, fileId));
+ final BoxAIExtractStructuredResponse result;
+ final Map<String, String> attributes = new HashMap<>();
+ attributes.put("box.id", fileId);
+ attributes.put("box.ai.extraction.method", extractionMethod);
+
+ if (ExtractionMethod.TEMPLATE.equals(extractionMethod)) {
+ final String templateKey =
context.getProperty(TEMPLATE_KEY).evaluateAttributeExpressions(flowFile).getValue();
+ attributes.put("box.ai.template.key", templateKey);
+ result =
getBoxAIExtractStructuredResponseWithTemplate(templateKey, fileId);
+ } else {
+ final String fieldsString =
context.getProperty(FIELDS).evaluateAttributeExpressions(flowFile).getValue();
+ result =
getBoxAIExtractStructuredResponseWithFields(fieldsString, fileId);
+ }
Review Comment:
With enums java's switch expression conducts exhaustiveness checks. So when
doing this, you're never going to miss when a new enum value is added.
```java
final BoxAIExtractStructuredResponse result = switch
(extractionMethod) {
case TEMPLATE -> {
final String templateKey =
context.getProperty(TEMPLATE_KEY).evaluateAttributeExpressions(flowFile).getValue();
attributes.put("box.ai.template.key", templateKey);
yield
getBoxAIExtractStructuredResponseWithTemplate(templateKey, fileId);
}
case FIELDS -> {
final String fieldsString =
context.getProperty(FIELDS).evaluateAttributeExpressions(flowFile).getValue();
yield
getBoxAIExtractStructuredResponseWithFields(fieldsString, fileId);
}
};
```
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/test/java/org/apache/nifi/processors/box/UpdateBoxFileMetadataInstanceTest.java:
##########
@@ -122,82 +133,287 @@ void testEmptyInput() {
}
@Test
- void testFileNotFound() {
- // Simulate 404 Not Found response
- BoxAPIResponseException mockException = new
BoxAPIResponseException("API Error", 404, "Box File Not Found", null);
-
doThrow(mockException).when(mockBoxFile).updateMetadata(any(Metadata.class));
+ void testFileNotFound() throws Exception {
+ final UpdateBoxFileMetadataInstance testSubject = new
UpdateBoxFileMetadataInstance() {
+ @Override
+ BoxFile getBoxFile(final String fileId) {
+ throw new BoxAPIResponseException("API Error", 404, "Box File
Not Found", null);
Review Comment:
Same as in `ExtractStructuredBoxFileMetadataTest`. Let's do the runner setup
in `@BeforeEach` only. To override the behavior, we can use `Supplier` or
`Function`.
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/CreateBoxFileMetadataInstance.java:
##########
@@ -201,74 +177,65 @@ public void onTrigger(final ProcessContext context, final
ProcessSession session
return;
}
- if (createdKeys.isEmpty()) {
+ if (metadata.getOperations().isEmpty()) {
flowFile = session.putAttribute(flowFile, ERROR_MESSAGE, "No
valid metadata key-value pairs found in the input");
session.transfer(flowFile, REL_FAILURE);
return;
}
final BoxFile boxFile = getBoxFile(fileId);
- boxFile.createMetadata(templateName, metadata);
-
- // Update FlowFile attributes
- final Map<String, String> attributes = new HashMap<>();
- attributes.put("box.id", fileId);
- attributes.put("box.template.name", templateName);
- flowFile = session.putAllAttributes(flowFile, attributes);
-
- session.getProvenanceReporter().create(flowFile,
BoxFileUtils.BOX_URL + fileId);
- session.transfer(flowFile, REL_SUCCESS);
+ boxFile.createMetadata(templateKey, metadata);
} catch (final BoxAPIResponseException e) {
flowFile = session.putAttribute(flowFile, ERROR_CODE,
valueOf(e.getResponseCode()));
flowFile = session.putAttribute(flowFile, ERROR_MESSAGE,
e.getMessage());
if (e.getResponseCode() == 404) {
- getLogger().warn("Box file with ID {} was not found.", fileId);
- session.transfer(flowFile, REL_NOT_FOUND);
+ final String errorBody = e.getResponse();
+ if (errorBody != null &&
errorBody.toLowerCase().contains("specified metadata template not found")) {
+ getLogger().warn("Box metadata template with key {} was
not found.", templateKey);
+ session.transfer(flowFile, REL_TEMPLATE_NOT_FOUND);
+ } else {
+ getLogger().warn("Box file with ID {} was not found.",
fileId);
+ session.transfer(flowFile, REL_FILE_NOT_FOUND);
+ }
} else {
getLogger().error("Couldn't create metadata for file with id
[{}]", fileId, e);
session.transfer(flowFile, REL_FAILURE);
}
+ return;
} catch (Exception e) {
getLogger().error("Error processing metadata creation for Box file
[{}]", fileId, e);
flowFile = session.putAttribute(flowFile, ERROR_MESSAGE,
e.getMessage());
session.transfer(flowFile, REL_FAILURE);
+ return;
}
- }
- private void processRecord(Record record, RecordPath keyRecordPath,
RecordPath valueRecordPath,
- Metadata metadata, Set<String> createdKeys,
List<String> errors) {
- // Get the key from the record
- final RecordPathResult keyPathResult = keyRecordPath.evaluate(record);
- final List<FieldValue> keyValues =
keyPathResult.getSelectedFields().toList();
+ final Map<String, String> attributes = Map.of(
+ "box.id", fileId,
+ "box.template.key", templateKey);
+ flowFile = session.putAllAttributes(flowFile, attributes);
- if (keyValues.isEmpty()) {
- errors.add("Record is missing a key field");
- return;
- }
+ session.getProvenanceReporter().create(flowFile,
"%s%s/metadata/%s/%s".formatted(BoxFileUtils.BOX_URL, fileId, "enterprise",
templateKey));
+ session.transfer(flowFile, REL_SUCCESS);
+ }
- final Object keyObj = keyValues.getFirst().getValue();
- if (keyObj == null) {
- errors.add("Record has a null key value");
+ private void processRecord(Record record, Metadata metadata, List<String>
errors) {
+ if (record == null) {
+ errors.add("No record found in input");
return;
}
- final String key = keyObj.toString();
+ Set<String> fieldNames = new
HashSet<>(record.getSchema().getFieldNames());
Review Comment:
It seems we can avoid creating a set here. Is there any particular reason?
```suggestion
List<String> fieldNames = record.getSchema().getFieldNames();
```
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/CreateBoxFileMetadataInstance.java:
##########
@@ -201,74 +177,65 @@ public void onTrigger(final ProcessContext context, final
ProcessSession session
return;
}
- if (createdKeys.isEmpty()) {
+ if (metadata.getOperations().isEmpty()) {
flowFile = session.putAttribute(flowFile, ERROR_MESSAGE, "No
valid metadata key-value pairs found in the input");
session.transfer(flowFile, REL_FAILURE);
return;
}
final BoxFile boxFile = getBoxFile(fileId);
- boxFile.createMetadata(templateName, metadata);
-
- // Update FlowFile attributes
- final Map<String, String> attributes = new HashMap<>();
- attributes.put("box.id", fileId);
- attributes.put("box.template.name", templateName);
- flowFile = session.putAllAttributes(flowFile, attributes);
-
- session.getProvenanceReporter().create(flowFile,
BoxFileUtils.BOX_URL + fileId);
- session.transfer(flowFile, REL_SUCCESS);
+ boxFile.createMetadata(templateKey, metadata);
} catch (final BoxAPIResponseException e) {
flowFile = session.putAttribute(flowFile, ERROR_CODE,
valueOf(e.getResponseCode()));
flowFile = session.putAttribute(flowFile, ERROR_MESSAGE,
e.getMessage());
if (e.getResponseCode() == 404) {
- getLogger().warn("Box file with ID {} was not found.", fileId);
- session.transfer(flowFile, REL_NOT_FOUND);
+ final String errorBody = e.getResponse();
+ if (errorBody != null &&
errorBody.toLowerCase().contains("specified metadata template not found")) {
+ getLogger().warn("Box metadata template with key {} was
not found.", templateKey);
+ session.transfer(flowFile, REL_TEMPLATE_NOT_FOUND);
+ } else {
+ getLogger().warn("Box file with ID {} was not found.",
fileId);
+ session.transfer(flowFile, REL_FILE_NOT_FOUND);
+ }
} else {
getLogger().error("Couldn't create metadata for file with id
[{}]", fileId, e);
session.transfer(flowFile, REL_FAILURE);
}
+ return;
} catch (Exception e) {
getLogger().error("Error processing metadata creation for Box file
[{}]", fileId, e);
flowFile = session.putAttribute(flowFile, ERROR_MESSAGE,
e.getMessage());
session.transfer(flowFile, REL_FAILURE);
+ return;
}
- }
- private void processRecord(Record record, RecordPath keyRecordPath,
RecordPath valueRecordPath,
- Metadata metadata, Set<String> createdKeys,
List<String> errors) {
- // Get the key from the record
- final RecordPathResult keyPathResult = keyRecordPath.evaluate(record);
- final List<FieldValue> keyValues =
keyPathResult.getSelectedFields().toList();
+ final Map<String, String> attributes = Map.of(
+ "box.id", fileId,
+ "box.template.key", templateKey);
+ flowFile = session.putAllAttributes(flowFile, attributes);
- if (keyValues.isEmpty()) {
- errors.add("Record is missing a key field");
- return;
- }
+ session.getProvenanceReporter().create(flowFile,
"%s%s/metadata/%s/%s".formatted(BoxFileUtils.BOX_URL, fileId, "enterprise",
templateKey));
Review Comment:
Nitpicking:
```suggestion
session.getProvenanceReporter().create(flowFile,
"%s%s/metadata/enterprise/%s".formatted(BoxFileUtils.BOX_URL, fileId,
templateKey));
```
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/CreateBoxMetadataTemplate.java:
##########
@@ -270,77 +224,65 @@ public void onTrigger(final ProcessContext context, final
ProcessSession session
}
private void processRecord(final Record record,
- final RecordPath keyRecordPath,
- final RecordPath typeRecordPath,
- final RecordPath displayNameRecordPath,
final List<MetadataTemplate.Field> fields,
final Set<String> processedKeys,
final List<String> errors) {
- final RecordPathResult keyPathResult = keyRecordPath.evaluate(record);
- final List<FieldValue> keyValues =
keyPathResult.getSelectedFields().toList();
-
- if (keyValues.isEmpty()) {
- errors.add("Record is missing a key field");
- return;
- }
-
- final Object keyObj = keyValues.getFirst().getValue();
+ // Extract and validate key (required)
+ final Object keyObj = record.getValue("key");
if (keyObj == null) {
- errors.add("Record has a null key value");
+ errors.add("Record is missing a key field");
return;
}
-
final String key = keyObj.toString();
- // Skip if we've already processed this key
if (processedKeys.contains(key)) {
- getLogger().warn("Duplicate key '{}' found in record, skipping",
key);
+ errors.add("Duplicate key '" + key + "' found in record,
skipping");
return;
}
- final RecordPathResult typePathResult =
typeRecordPath.evaluate(record);
- final List<FieldValue> typeValues =
typePathResult.getSelectedFields().toList();
-
- if (typeValues.isEmpty()) {
+ // Extract and validate type (required)
+ final Object typeObj = record.getValue("type");
+ if (typeObj == null) {
errors.add("Record with key '" + key + "' is missing a type
field");
return;
}
+ final String normalizedType = typeObj.toString().toLowerCase();
- final Object typeObj = typeValues.getFirst().getValue();
- if (typeObj == null) {
- errors.add("Record with key '" + key + "' has a null type value");
+ if (!VALID_FIELD_TYPES.contains(normalizedType)) {
+ errors.add("Record with key '" + key + "' has an invalid type: '"
+ normalizedType +
+ "'. Valid types are: " + String.join(", ",
VALID_FIELD_TYPES));
return;
}
- final String type = typeObj.toString().toLowerCase();
+ final MetadataTemplate.Field metadataField = new
MetadataTemplate.Field();
+ metadataField.setKey(key);
+ metadataField.setType(normalizedType);
+
+ final Object displayNameObj = record.getValue("displayName");
+ if (displayNameObj != null) {
+ metadataField.setDisplayName(displayNameObj.toString());
+ }
- // Validate field type
- if (!VALID_FIELD_TYPES.contains(type)) {
- errors.add("Record with key '" + key + "' has an invalid type: '"
+ type + "'. Valid types are: " +
- String.join(", ", VALID_FIELD_TYPES));
- return;
+ final Object hiddenObj = record.getValue("hidden");
+ if (hiddenObj != null) {
+
metadataField.setIsHidden(Boolean.parseBoolean(hiddenObj.toString()));
}
- // Get the display name from the record (falls back to key if missing)
- String displayName = key;
- if (displayNameRecordPath != null) {
- final RecordPathResult displayNamePathResult =
displayNameRecordPath.evaluate(record);
- final List<FieldValue> displayNameValues =
displayNamePathResult.getSelectedFields().toList();
+ final Object descriptionObj = record.getValue("description");
+ if (descriptionObj != null) {
+ metadataField.setDescription(descriptionObj.toString());
+ }
- if (!displayNameValues.isEmpty()) {
- final Object displayNameObj =
displayNameValues.getFirst().getValue();
- if (displayNameObj != null) {
- displayName = displayNameObj.toString();
- }
+ if ("enum".equals(normalizedType) ||
"multiSelect".equals(normalizedType)) {
+ final Object optionsObj = record.getValue("options");
+ if (optionsObj instanceof List<?> optionsList) {
+ final List<String> options = optionsList.stream()
+ .map(obj -> obj != null ? obj.toString() : "")
Review Comment:
Shall we just fail the processing if `null` option value is found? That
isn't an expected scenario, right?
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/UpdateBoxFileMetadataInstance.java:
##########
@@ -248,40 +207,110 @@ public void onTrigger(final ProcessContext context,
final ProcessSession session
}
}
- private void processRecord(Record record, RecordPath keyRecordPath,
RecordPath valueRecordPath,
- Metadata metadata, Set<String> updatedKeys,
List<String> errors) {
- // Get the key from the record
- final RecordPathResult keyPathResult = keyRecordPath.evaluate(record);
- final List<FieldValue> keyValues =
keyPathResult.getSelectedFields().toList();
+ private Map<String, Object> readDesiredState(final ProcessSession session,
+ final FlowFile flowFile,
+ final RecordReaderFactory
recordReaderFactory) throws Exception {
+ final Map<String, Object> desiredState = new HashMap<>();
- if (keyValues.isEmpty()) {
- errors.add("Record is missing a key field");
- return;
+ try (final InputStream inputStream = session.read(flowFile);
+ final RecordReader recordReader =
recordReaderFactory.createRecordReader(flowFile, inputStream, getLogger())) {
+
+ final Record record = recordReader.nextRecord();
+ if (record != null) {
+ for (String fieldName : record.getSchema().getFieldNames()) {
+ desiredState.put(fieldName, record.getValue(fieldName));
+ }
+ }
}
- final Object keyObj = keyValues.getFirst().getValue();
- if (keyObj == null) {
- errors.add("Record has a null key value");
- return;
+ return desiredState;
+ }
+
+ Metadata getOrCreateMetadata(final BoxFile boxFile,
Review Comment:
Now it just gets metadata. No creation.
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/ExtractStructuredBoxFileMetadata.java:
##########
@@ -58,17 +59,26 @@
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@Tags({"box", "storage", "metadata", "ai", "extract"})
-@CapabilityDescription("Extracts metadata from a Box file using Box AI and a
template. The extracted metadata is written to the FlowFile content as JSON.")
+@CapabilityDescription("""
+ Extracts metadata from a Box file using Box AI. The extraction can use
either a template or a list of fields.\s
+ The extracted metadata is written to the FlowFile content as JSON.
+ """)
@SeeAlso({ListBoxFileMetadataTemplates.class, ListBoxFile.class,
FetchBoxFile.class, UpdateBoxFileMetadataInstance.class})
@WritesAttributes({
@WritesAttribute(attribute = "box.id", description = "The ID of the
file from which metadata was extracted"),
- @WritesAttribute(attribute = "box.ai.template.key", description = "The
template key used for extraction"),
+ @WritesAttribute(attribute = "box.ai.template.key", description = "The
template key used for extraction (when using TEMPLATE extraction method)"),
+ @WritesAttribute(attribute = "box.ai.extraction.method", description =
"The extraction method used (TEMPLATE or FIELDS)"),
@WritesAttribute(attribute = "box.ai.completion.reason", description =
"The completion reason from the AI extraction"),
@WritesAttribute(attribute = "mime.type", description = "Set to
'application/json' for the JSON content"),
@WritesAttribute(attribute = ERROR_CODE, description =
ERROR_CODE_DESC),
@WritesAttribute(attribute = ERROR_MESSAGE, description =
ERROR_MESSAGE_DESC)
})
-public class ExtractBoxFileMetadataWithBoxAI extends AbstractProcessor {
+public class ExtractStructuredBoxFileMetadata extends AbstractProcessor {
+
+ public static class ExtractionMethod {
+ public static final String TEMPLATE = "TEMPLATE";
+ public static final String FIELDS = "FIELDS";
Review Comment:
We should add better description what these options mean.
Using `DescribedValue` should make it more convenient.
##########
nifi-extension-bundles/nifi-box-bundle/nifi-box-processors/src/main/java/org/apache/nifi/processors/box/UpdateBoxFileMetadataInstance.java:
##########
@@ -248,40 +207,110 @@ public void onTrigger(final ProcessContext context,
final ProcessSession session
}
}
- private void processRecord(Record record, RecordPath keyRecordPath,
RecordPath valueRecordPath,
- Metadata metadata, Set<String> updatedKeys,
List<String> errors) {
- // Get the key from the record
- final RecordPathResult keyPathResult = keyRecordPath.evaluate(record);
- final List<FieldValue> keyValues =
keyPathResult.getSelectedFields().toList();
+ private Map<String, Object> readDesiredState(final ProcessSession session,
+ final FlowFile flowFile,
+ final RecordReaderFactory
recordReaderFactory) throws Exception {
+ final Map<String, Object> desiredState = new HashMap<>();
- if (keyValues.isEmpty()) {
- errors.add("Record is missing a key field");
- return;
+ try (final InputStream inputStream = session.read(flowFile);
+ final RecordReader recordReader =
recordReaderFactory.createRecordReader(flowFile, inputStream, getLogger())) {
+
+ final Record record = recordReader.nextRecord();
+ if (record != null) {
+ for (String fieldName : record.getSchema().getFieldNames()) {
+ desiredState.put(fieldName, record.getValue(fieldName));
+ }
+ }
}
- final Object keyObj = keyValues.getFirst().getValue();
- if (keyObj == null) {
- errors.add("Record has a null key value");
- return;
+ return desiredState;
+ }
+
+ Metadata getOrCreateMetadata(final BoxFile boxFile,
+ final String templateKey,
+ final String fileId) {
+ try {
+ final Metadata metadata = boxFile.getMetadata(templateKey);
+ getLogger().debug("Retrieved existing metadata for file {}",
fileId);
+ return metadata;
+ } catch (final BoxAPIResponseException e) {
+ if (e.getResponseCode() == 404) {
+ getLogger().warn("No existing metadata found for file {} with
template key {}.", fileId, templateKey);
+ throw e;
+ }
+ throw e;
}
+ }
- final String key = keyObj.toString();
+ private Set<String> updateMetadata(final Metadata metadata,
+ final Map<String, Object> desiredState)
{
+ final Set<String> processedKeys = new HashSet<>();
+ final List<String> currentKeys = metadata.getPropertyPaths();
- // Get the value from the record
- final RecordPathResult valuePathResult =
valueRecordPath.evaluate(record);
- final List<FieldValue> valueValues =
valuePathResult.getSelectedFields().toList();
+ // Remove fields not in desired state
+ for (final String propertyPath : currentKeys) {
+ final String fieldName = propertyPath.substring(1); // Remove
leading '/'
- if (valueValues.isEmpty()) {
- errors.add("Record with key '" + key + "' is missing a value
field");
- return;
+ if (!desiredState.containsKey(fieldName)) {
+ metadata.remove(propertyPath);
+ processedKeys.add(fieldName);
+ getLogger().debug("Removing metadata field: {}", fieldName);
+ }
+ }
+
+ // Add or update fields
+ for (final Map.Entry<String, Object> entry : desiredState.entrySet()) {
+ final String fieldName = entry.getKey();
+ final Object value = entry.getValue();
+ final String propertyPath = "/" + fieldName;
+
+ if (updateField(metadata, propertyPath, value,
currentKeys.contains(propertyPath))) {
+ processedKeys.add(fieldName);
+ }
+ }
+
+ return processedKeys;
+ }
+
+ private boolean updateField(final Metadata metadata,
+ final String propertyPath,
+ final Object value,
+ final boolean exists) {
+ if (value == null) {
+ return false;
}
- final Object valueObj = valueValues.getFirst().getValue();
- final String value = valueObj != null ? valueObj.toString() : null;
+ if (exists) {
+ final Object currentValue = metadata.getValue(propertyPath);
- // Add the key-value pair to the metadata update
- metadata.add("/" + key, value);
- updatedKeys.add(key);
+ // Only update if values are different
+ if (Objects.equals(currentValue, value)) {
+ return false;
+ }
+
+ // Update
+ switch (value) {
+ case Number n -> metadata.replace(propertyPath,
n.doubleValue());
+ case List<?> l -> metadata.replace(propertyPath,
convertListToStringList(l));
+ default -> metadata.replace(propertyPath, value.toString());
+ }
+ } else {
+ // Add new field
+ switch (value) {
+ case Number n -> metadata.add(propertyPath, n.doubleValue());
+ case List<?> l -> metadata.add(propertyPath,
convertListToStringList(l));
+ default -> metadata.add(propertyPath, value.toString());
+ }
+ }
+ return true;
+ }
+
+ private List<String> convertListToStringList(final List<?> list) {
+ // Box doesn't support null values in metadata lists, so we filter
them out
Review Comment:
Does it make sense to fail instead of silently ignoring null values?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]