This is an automated email from the ASF dual-hosted git repository.
tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 7ddb7a46f6 In SchemaConformingTransformer, Flatten array into multiple
entries each with a key and array value. (#13890)
7ddb7a46f6 is described below
commit 7ddb7a46f6520e66247810e487e263dd9a7ba9c1
Author: Ting Chen <[email protected]>
AuthorDate: Thu Sep 5 15:46:00 2024 -0700
In SchemaConformingTransformer, Flatten array into multiple entries each
with a key and array value. (#13890)
* Flatten array into multiple entries each with a key and array value.
* Fix lint issues.
* Address the comments
---
.../SchemaConformingTransformerV2.java | 28 +++++++---
.../SchemaConformingTransformerV2Test.java | 59 +++++++++++++---------
2 files changed, 57 insertions(+), 30 deletions(-)
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java
index 583003b3f5..47b629f522 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java
@@ -430,7 +430,7 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
}
/**
- * Generate an Lucene document based on the provided key-value pair.
+ * Generate a Lucene document based on the provided key-value pair.
* The index document follows this format: "val:key".
* @param kv used to generate text index
documents
* @param indexDocuments a list to store the generated
index documents
@@ -439,18 +439,34 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
public void generateTextIndexLuceneDocument(Map.Entry<String, Object> kv,
List<String> indexDocuments,
Integer mergedTextIndexDocumentMaxLength) {
String key = kv.getKey();
- String val;
// To avoid redundant leading and trailing '"', only convert to JSON string
if the value is a list or an array
if (kv.getValue() instanceof Collection || kv.getValue() instanceof
Object[]) {
+ // Add the entire array or collection as one string to the Lucene doc.
try {
- val = JsonUtils.objectToString(kv.getValue());
+ addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength, key,
JsonUtils.objectToString(kv.getValue()));
+ // To enable array-contains search, we also add each array element
as its own key-value pair to the Lucene doc.
+ // Currently only one level of flattening is supported; any element deeper
than one level stays nested.
+ if (kv.getValue() instanceof Collection) {
+ for (Object o : (Collection) kv.getValue()) {
+ addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength,
key, JsonUtils.objectToString(o));
+ }
+ } else if (kv.getValue() instanceof Object[]) {
+ for (Object o : (Object[]) kv.getValue()) {
+ addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength,
key, JsonUtils.objectToString(o));
+ }
+ }
} catch (JsonProcessingException e) {
- val = kv.getValue().toString();
+ addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength, key,
kv.getValue().toString());
}
- } else {
- val = kv.getValue().toString();
+ return;
}
+ // If the value is a single value
+ addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength, key,
kv.getValue().toString());
+ }
+
+ private void addLuceneDoc(List<String> indexDocuments, Integer
mergedTextIndexDocumentMaxLength, String key,
+ String val) {
// TODO: theoretically, the key length + 1 could cause integer overflow.
But in reality, upstream message size
// limit usually could not reach that high. We should revisit this if we
see any issue.
if (key.length() + 1 > MAXIMUM_LUCENE_DOCUMENT_SIZE) {
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java
index 6189f14d42..cd1d85dc1d 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java
@@ -326,7 +326,7 @@ public class SchemaConformingTransformerV2Test {
// No schema
schemaBuilder = createDefaultSchemaBuilder();
- /*
+ /* Expected output
{
"indexableExtras":{
"arrayField":[0, 1, 2, 3],
@@ -370,10 +370,7 @@ public class SchemaConformingTransformerV2Test {
}
},
__mergedTextIndex: [
- "[0, 1, 2, 3]:arrayField", "a:stringField",
- "[0, 1, 2, 3]:mapField.arrayField", "a:mapField.stringField",
- "[0, 1, 2, 3]:nestedFields.arrayField", "a:nestedFields.stringField",
- "[0, 1, 2, 3]:nestedFields.mapField.arrayField",
"a:nestedFields.mapField.stringField",
+ // See the value of expectedJsonNodeWithMergedTextIndex
]
}
*/
@@ -397,9 +394,15 @@ public class SchemaConformingTransformerV2Test {
transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(),
inputJsonNode, expectedJsonNode);
expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME,
-
N.arrayNode().add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField")
-
.add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("a:nestedFields.stringField")
-
.add("[0,1,2,3]:nestedFields.mapField.arrayField").add("a:nestedFields.mapField.stringField"));
+
N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField")
+
.add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField")
+
.add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField")
+
.add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField")
+
.add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField")
+
.add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField")
+
.add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField")
+
.add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField")
+ .add("a:nestedFields.mapField.stringField"));
transformWithUnIndexableFieldsAndMergedTextIndex(
schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME,
DataType.STRING).build(), inputJsonNode,
expectedJsonNodeWithMergedTextIndex);
@@ -453,10 +456,7 @@ public class SchemaConformingTransformerV2Test {
}
},
__mergedTextIndex: [
- "[0, 1, 2, 3]:arrayField", "a:stringField",
- "[0, 1, 2, 3]:mapField.arrayField", "a:mapField.stringField",
- "[0, 1, 2, 3]:nestedFields.arrayField", "a:nestedFields.stringField",
- "[0, 1, 2, 3]:nestedFields.mapField.arrayField",
"a:nestedFields.mapField.stringField",
+ // See the value of expectedJsonNodeWithMergedTextIndex
]
}
*/
@@ -480,9 +480,15 @@ public class SchemaConformingTransformerV2Test {
transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(),
inputJsonNode, expectedJsonNode);
expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME,
-
N.arrayNode().add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField")
-
.add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("a:nestedFields.stringField")
-
.add("[0,1,2,3]:nestedFields.mapField.arrayField").add("a:nestedFields.mapField.stringField"));
+
N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField")
+
.add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField")
+
.add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField")
+
.add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField")
+
.add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField")
+
.add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField")
+
.add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField")
+
.add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField")
+ .add("a:nestedFields.mapField.stringField"));
transformWithUnIndexableFieldsAndMergedTextIndex(
schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME,
DataType.STRING).build(), inputJsonNode,
expectedJsonNodeWithMergedTextIndex);
@@ -540,10 +546,7 @@ public class SchemaConformingTransformerV2Test {
}
},
__mergedTextIndex: [
- "[0, 1, 2, 3]:arrayField", "a:stringField",
- "[0, 1, 2, 3]:mapField.arrayField", "a:mapField.stringField",
- "[0, 1, 2, 3]:nestedFields.arrayField", "a:nestedFields.stringField",
- "[0, 1, 2, 3]:nestedFields.mapField.arrayField",
"a:nestedFields.mapField.stringField",
+ // See the value of expectedJsonNodeWithMergedTextIndex
]
}
*/
@@ -567,9 +570,15 @@ public class SchemaConformingTransformerV2Test {
.set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NO_IDX_NODE)));
transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(),
inputJsonNode, expectedJsonNode);
expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME,
-
N.arrayNode().add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField")
-
.add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("a:nestedFields.stringField")
-
.add("[0,1,2,3]:nestedFields.mapField.arrayField").add("a:nestedFields.mapField.stringField"));
+
N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField")
+
.add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField")
+
.add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField")
+
.add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField")
+
.add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField")
+
.add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField")
+
.add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField")
+
.add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField")
+ .add("a:nestedFields.mapField.stringField"));
transformWithUnIndexableFieldsAndMergedTextIndex(
schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME,
DataType.STRING).build(), inputJsonNode,
expectedJsonNodeWithMergedTextIndex);
@@ -713,8 +722,10 @@ public class SchemaConformingTransformerV2Test {
.set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)));
expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME,
-
N.arrayNode().add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:nestedFields.arrayField").add(
- "a:nestedFields.stringField"));
+
N.arrayNode().add("0:arrayField").add("1:arrayField").add("2:arrayField").add("3:arrayField").
+
add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:nestedFields.arrayField").
+
add("0:nestedFields.arrayField").add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").
+
add("3:nestedFields.arrayField").add("a:nestedFields.stringField"));
transformKeyValueTransformation(
schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME,
DataType.STRING).build(), keyMapping,
pathToDrop, pathToPreserve, inputJsonNode,
expectedJsonNodeWithMergedTextIndex);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]