This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 31a4634c feat: Add ArrowIpcDictionaryMapping to track dictionary
identifiers when parsing schema message (#856)
31a4634c is described below
commit 31a4634c8a041ae32b814919c76108a77a47a139
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Mar 23 08:48:52 2026 -0500
feat: Add ArrowIpcDictionaryMapping to track dictionary identifiers when
parsing schema message (#856)
This PR adds `struct ArrowIpcDictionaryEncoding` and `struct
ArrowIpcDictionaryEncodings` that accompany an `ArrowSchema` to map
fields to dictionary IDs. The primary purpose of this is to assign
dictionary IDs to the (internal) `struct ArrowIpcField` so that the next
PR can make use of that to actually decode them. There is now an
`ArrowIpcDecoderDecodeSchemaWithDictionaries()` and
`ArrowIpcDecoderSetSchemaWithDictionaries()` to match
`ArrowIpcDecoderDecodeSchema()` and `ArrowIpcDecoderSetSchema()`
(respectively).
This is largely an implementation detail that simply transports
information from the Flatbuffers schema to the decoder internals.
Closes #844.
---------
Co-authored-by: Copilot Autofix powered by AI
<[email protected]>
---
src/nanoarrow/ipc/decoder.c | 208 ++++++++++++++++++++++++++++++--------
src/nanoarrow/ipc/decoder_test.cc | 32 +++++-
src/nanoarrow/ipc/encoder.c | 2 +
src/nanoarrow/ipc/ipc_hpp_test.cc | 15 +++
src/nanoarrow/ipc/reader.c | 11 +-
src/nanoarrow/nanoarrow_ipc.h | 115 ++++++++++++++++++++-
src/nanoarrow/nanoarrow_ipc.hpp | 21 ++++
7 files changed, 355 insertions(+), 49 deletions(-)
diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c
index f3b4b443..400d483e 100644
--- a/src/nanoarrow/ipc/decoder.c
+++ b/src/nanoarrow/ipc/decoder.c
@@ -17,6 +17,7 @@
#include <errno.h>
#include <inttypes.h>
+#include <stdint.h>
#include <stdio.h>
#include <string.h>
@@ -54,6 +55,8 @@
#define NANOARROW_IPC_MAGIC "ARROW1"
+#define NANOARROW_IPC_NO_DICTIONARY_ID INT64_MIN
+
// Internal representation of a parsed "Field" from flatbuffers. This
// represents a field in a depth-first walk of column arrays and their
// children.
@@ -66,6 +69,9 @@ struct ArrowIpcField {
struct ArrowArray* array;
// The cumulative number of buffers preceding this node.
int64_t buffer_offset;
+ // Dictionary identifier (or NANOARROW_IPC_NO_DICTIONARY_ID if this is not a
+ // dictionary-encoded field)
+ int64_t dictionary_id;
};
// Internal data specific to the read/decode process
@@ -272,6 +278,46 @@ static int ArrowIpcDecoderNeedsSwapEndian(struct
ArrowIpcDecoder* decoder) {
}
}
+void ArrowIpcDictionaryEncodingsInit(
+ struct ArrowIpcDictionaryEncodings* dictionary_encodings) {
+ NANOARROW_DCHECK(dictionary_encodings != NULL);
+ ArrowBufferInit(&dictionary_encodings->encodings);
+}
+
+ArrowErrorCode ArrowIpcDictionaryEncodingsAppend(
+ struct ArrowIpcDictionaryEncodings* dictionaries,
+ struct ArrowIpcDictionaryEncoding encoding) {
+ NANOARROW_DCHECK(dictionaries != NULL);
+ NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(&dictionaries->encodings,
&encoding,
+ sizeof(struct
ArrowIpcDictionaryEncoding)));
+ return NANOARROW_OK;
+}
+
+const struct ArrowIpcDictionaryEncoding* ArrowIpcDictionaryEncodingsFind(
+ const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+ const struct ArrowSchema* schema) {
+ NANOARROW_DCHECK(dictionary_encodings != NULL);
+ int64_t length = dictionary_encodings->encodings.size_bytes /
+ sizeof(struct ArrowIpcDictionaryEncoding);
+ const struct ArrowIpcDictionaryEncoding* data =
+ (const struct
ArrowIpcDictionaryEncoding*)dictionary_encodings->encodings.data;
+
+ for (int64_t i = 0; i < length; i++) {
+ const struct ArrowIpcDictionaryEncoding* encoding = data + i;
+ if (encoding->schema == schema) {
+ return encoding;
+ }
+ }
+
+ return NULL;
+}
+
+void ArrowIpcDictionaryEncodingsReset(
+ struct ArrowIpcDictionaryEncodings* dictionary_encodings) {
+ NANOARROW_DCHECK(dictionary_encodings != NULL);
+ ArrowBufferReset(&dictionary_encodings->encodings);
+}
+
ArrowErrorCode ArrowIpcDecoderInit(struct ArrowIpcDecoder* decoder) {
memset(decoder, 0, sizeof(struct ArrowIpcDecoder));
struct ArrowIpcDecoderPrivate* private_data =
@@ -943,7 +989,7 @@ static int
ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded(
static int ArrowIpcSetDictionaryEncoding(
struct ArrowSchema* schema, ns(DictionaryEncoding_table_t
dictionary_encoding),
- struct ArrowError* error) {
+ struct ArrowIpcDictionaryEncodings* dictionaries, struct ArrowError*
error) {
switch (
org_apache_arrow_flatbuf_DictionaryEncoding_dictionaryKind(dictionary_encoding))
{
case ns(DictionaryKind_DenseArray):
@@ -982,16 +1028,25 @@ static int ArrowIpcSetDictionaryEncoding(
NANOARROW_RETURN_NOT_OK_WITH_ERROR(
ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded(schema), error);
- // TODO: Track the dictionary
- // https://github.com/apache/arrow-nanoarrow/issues/844
+ // Track the identifier if we have a dictionaries object in which to track it
+ if (dictionaries != NULL) {
+ int64_t id = ns(DictionaryEncoding_id(dictionary_encoding));
+ struct ArrowIpcDictionaryEncoding encoding = {
+ schema, id, NANOARROW_IPC_DICTIONARY_KIND_DENSE_ARRAY};
+
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+ ArrowIpcDictionaryEncodingsAppend(dictionaries, encoding), error);
+ }
return NANOARROW_OK;
}
static int ArrowIpcDecoderSetChildren(struct ArrowSchema* schema,
ns(Field_vec_t) fields,
+ struct ArrowIpcDictionaryEncodings*
dictionaries,
struct ArrowError* error);
static int ArrowIpcDecoderSetField(struct ArrowSchema* schema,
ns(Field_table_t) field,
+ struct ArrowIpcDictionaryEncodings*
dictionaries,
struct ArrowError* error) {
int result;
if (ns(Field_name_is_present(field))) {
@@ -1032,26 +1087,29 @@ static int ArrowIpcDecoderSetField(struct ArrowSchema*
schema, ns(Field_table_t)
ArrowSchemaInit(schema->children[i]);
}
- NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderSetChildren(schema, children, error));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowIpcDecoderSetChildren(schema, children, dictionaries, error));
NANOARROW_RETURN_NOT_OK(
ArrowIpcDecoderSetMetadata(schema, ns(Field_custom_metadata(field)),
error));
// If this is a dictionary encoded field, set the dictionary encoding
if (ns(Field_dictionary_is_present(field))) {
- NANOARROW_RETURN_NOT_OK(
- ArrowIpcSetDictionaryEncoding(schema, ns(Field_dictionary(field)),
error));
+ NANOARROW_RETURN_NOT_OK(ArrowIpcSetDictionaryEncoding(
+ schema, ns(Field_dictionary(field)), dictionaries, error));
}
return NANOARROW_OK;
}
static int ArrowIpcDecoderSetChildren(struct ArrowSchema* schema,
ns(Field_vec_t) fields,
+ struct ArrowIpcDictionaryEncodings*
dictionaries,
struct ArrowError* error) {
int64_t n_fields = ns(Schema_vec_len(fields));
for (int64_t i = 0; i < n_fields; i++) {
ns(Field_table_t) field = ns(Field_vec_at(fields, i));
- NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderSetField(schema->children[i],
field, error));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowIpcDecoderSetField(schema->children[i], field, dictionaries,
error));
}
return NANOARROW_OK;
@@ -1447,9 +1505,10 @@ ArrowErrorCode ArrowIpcDecoderDecodeHeader(struct
ArrowIpcDecoder* decoder,
return NANOARROW_OK;
}
-static ArrowErrorCode ArrowIpcDecoderDecodeSchemaImpl(ns(Schema_table_t)
schema,
- struct ArrowSchema* out,
- struct ArrowError*
error) {
+static ArrowErrorCode ArrowIpcDecoderDecodeSchemaImpl(
+ ns(Schema_table_t) schema, struct ArrowSchema* out,
+ struct ArrowIpcDictionaryEncodings* dictionary_encodings_out,
+ struct ArrowError* error) {
ArrowSchemaInit(out);
// Top-level batch schema is typically non-nullable
out->flags = 0;
@@ -1464,15 +1523,17 @@ static ArrowErrorCode
ArrowIpcDecoderDecodeSchemaImpl(ns(Schema_table_t) schema,
return result;
}
- NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderSetChildren(out, fields, error));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowIpcDecoderSetChildren(out, fields, dictionary_encodings_out,
error));
NANOARROW_RETURN_NOT_OK(
ArrowIpcDecoderSetMetadata(out, ns(Schema_custom_metadata(schema)),
error));
return NANOARROW_OK;
}
-ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct ArrowIpcDecoder* decoder,
- struct ArrowSchema* out,
- struct ArrowError* error) {
+ArrowErrorCode ArrowIpcDecoderDecodeSchemaWithDictionaries(
+ struct ArrowIpcDecoder* decoder, struct ArrowSchema* out,
+ struct ArrowIpcDictionaryEncodings* dictionary_encodings_out,
+ struct ArrowError* error) {
struct ArrowIpcDecoderPrivate* private_data =
(struct ArrowIpcDecoderPrivate*)decoder->private_data;
@@ -1482,18 +1543,34 @@ ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct
ArrowIpcDecoder* decoder,
return EINVAL;
}
+ if (dictionary_encodings_out != NULL) {
+ ArrowIpcDictionaryEncodingsInit(dictionary_encodings_out);
+ }
+
struct ArrowSchema tmp;
- ArrowErrorCode result = ArrowIpcDecoderDecodeSchemaImpl(
- (ns(Schema_table_t))private_data->last_message, &tmp, error);
+ ArrowErrorCode result =
+
ArrowIpcDecoderDecodeSchemaImpl((ns(Schema_table_t))private_data->last_message,
+ &tmp, dictionary_encodings_out, error);
if (result != NANOARROW_OK) {
ArrowSchemaRelease(&tmp);
+
+ if (dictionary_encodings_out != NULL) {
+ ArrowIpcDictionaryEncodingsReset(dictionary_encodings_out);
+ }
+
return result;
}
ArrowSchemaMove(&tmp, out);
return NANOARROW_OK;
}
+ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct ArrowIpcDecoder* decoder,
+ struct ArrowSchema* out,
+ struct ArrowError* error) {
+ return ArrowIpcDecoderDecodeSchemaWithDictionaries(decoder, out, NULL,
error);
+}
+
ArrowErrorCode ArrowIpcDecoderDecodeFooter(struct ArrowIpcDecoder* decoder,
struct ArrowBufferView data,
struct ArrowError* error) {
@@ -1510,7 +1587,8 @@ ArrowErrorCode ArrowIpcDecoderDecodeFooter(struct
ArrowIpcDecoder* decoder,
ArrowIpcDecoderDecodeSchemaHeader(decoder, ns(Footer_schema(footer)),
error));
NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeSchemaImpl(
- ns(Footer_schema(footer)), &private_data->footer.schema, error));
+ ns(Footer_schema(footer)), &private_data->footer.schema,
+ &private_data->footer.dictionaries, error));
ns(Block_vec_t) blocks = ns(Footer_recordBatches(footer));
int64_t n = ns(Block_vec_len(blocks));
@@ -1529,21 +1607,53 @@ ArrowErrorCode ArrowIpcDecoderDecodeFooter(struct
ArrowIpcDecoder* decoder,
return NANOARROW_OK;
}
-static void ArrowIpcDecoderCountFields(struct ArrowSchema* schema, int64_t*
n_fields) {
+static void ArrowIpcDecoderCountFields(const struct ArrowSchema* schema,
+ int64_t* n_fields) {
*n_fields += 1;
for (int64_t i = 0; i < schema->n_children; i++) {
ArrowIpcDecoderCountFields(schema->children[i], n_fields);
}
}
-static void ArrowIpcDecoderInitFields(struct ArrowIpcField* fields,
- struct ArrowArrayView* array_view,
- struct ArrowArray* array, int64_t*
n_fields,
- int64_t* n_buffers, int64_t*
n_union_fields) {
+static ArrowErrorCode ArrowIpcDecoderInitFields(
+ struct ArrowIpcField* fields, const struct ArrowSchema* schema,
+ const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+ struct ArrowArrayView* array_view, struct ArrowArray* array, int64_t*
n_fields,
+ int64_t* n_buffers, int64_t* n_union_fields, struct ArrowError* error) {
struct ArrowIpcField* field = fields + (*n_fields);
field->array_view = array_view;
field->array = array;
field->buffer_offset = *n_buffers;
+ field->dictionary_id = NANOARROW_IPC_NO_DICTIONARY_ID;
+
+ if (schema->dictionary != NULL) {
+ if (dictionary_encodings == NULL) {
+ const char* name = schema->name;
+ if (name == NULL) {
+ name = "<unnamed field>";
+ }
+
+ ArrowErrorSet(error,
+ "Can't resolve dictionary ID for field '%s' (dictionary
encodings "
+ "not provided)",
+ name);
+ return EINVAL;
+ }
+
+ const struct ArrowIpcDictionaryEncoding* dictionary_encoding =
+ ArrowIpcDictionaryEncodingsFind(dictionary_encodings, schema);
+ if (dictionary_encoding == NULL) {
+ const char* name = schema->name;
+ if (name == NULL) {
+ name = "<unnamed field>";
+ }
+
+ ArrowErrorSet(error, "Can't resolve dictionary ID for field '%s'", name);
+ return EINVAL;
+ }
+
+ field->dictionary_id = dictionary_encoding->id;
+ }
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
*n_buffers += array_view->layout.buffer_type[i] !=
NANOARROW_BUFFER_TYPE_NONE;
@@ -1554,14 +1664,18 @@ static void ArrowIpcDecoderInitFields(struct
ArrowIpcField* fields,
*n_fields += 1;
for (int64_t i = 0; i < array_view->n_children; i++) {
- ArrowIpcDecoderInitFields(fields, array_view->children[i],
array->children[i],
- n_fields, n_buffers, n_union_fields);
+ NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderInitFields(
+ fields, schema->children[i], dictionary_encodings,
array_view->children[i],
+ array->children[i], n_fields, n_buffers, n_union_fields, error));
}
+
+ return NANOARROW_OK;
}
-ArrowErrorCode ArrowIpcDecoderSetSchema(struct ArrowIpcDecoder* decoder,
- struct ArrowSchema* schema,
- struct ArrowError* error) {
+ArrowErrorCode ArrowIpcDecoderSetSchemaWithDictionaries(
+ struct ArrowIpcDecoder* decoder, const struct ArrowSchema* schema,
+ const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+ struct ArrowError* error) {
struct ArrowIpcDecoderPrivate* private_data =
(struct ArrowIpcDecoderPrivate*)decoder->private_data;
@@ -1602,13 +1716,20 @@ ArrowErrorCode ArrowIpcDecoderSetSchema(struct
ArrowIpcDecoder* decoder,
// Init field information and calculate starting buffer offset for each
int64_t field_i = 0;
- ArrowIpcDecoderInitFields(private_data->fields, &private_data->array_view,
- &private_data->array, &field_i,
&private_data->n_buffers,
- &private_data->n_union_fields);
+ NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderInitFields(
+ private_data->fields, schema, dictionary_encodings,
&private_data->array_view,
+ &private_data->array, &field_i, &private_data->n_buffers,
+ &private_data->n_union_fields, error));
return NANOARROW_OK;
}
+ArrowErrorCode ArrowIpcDecoderSetSchema(struct ArrowIpcDecoder* decoder,
+ const struct ArrowSchema* schema,
+ struct ArrowError* error) {
+ return ArrowIpcDecoderSetSchemaWithDictionaries(decoder, schema, NULL,
error);
+}
+
ArrowErrorCode ArrowIpcDecoderSetEndianness(struct ArrowIpcDecoder* decoder,
enum ArrowIpcEndianness
endianness) {
struct ArrowIpcDecoderPrivate* private_data =
@@ -2001,18 +2122,22 @@ static int ArrowIpcDecoderWalkGetArray(struct
ArrowArrayView* array_view,
array_view->children[i], array->children[i], out->children[i], error));
}
- if (array->dictionary != NULL) {
- ArrowErrorSet(error, "Decode of dictionary array is not yet supported");
- return ENOTSUP;
- }
-
return NANOARROW_OK;
}
-static int ArrowIpcDecoderWalkSetArrayView(struct ArrowIpcArraySetter* setter,
+static int ArrowIpcDecoderWalkSetArrayView(struct ArrowIpcDecoder* decoder,
+ struct ArrowIpcArraySetter* setter,
struct ArrowArrayView* array_view,
struct ArrowArray* array,
struct ArrowError* error) {
+ struct ArrowIpcDecoderPrivate* private_data =
+ (struct ArrowIpcDecoderPrivate*)decoder->private_data;
+ struct ArrowIpcField* ipc_field = private_data->fields + setter->field_i;
+ if (ipc_field->dictionary_id != NANOARROW_IPC_NO_DICTIONARY_ID) {
+ ArrowErrorSet(error, "Decoding a dictionary-encoding field is not
supported");
+ return ENOTSUP;
+ }
+
ns(FieldNode_struct_t) field =
ns(FieldNode_vec_at(setter->fields, (size_t)setter->field_i));
array_view->length = ns(FieldNode_length(field));
@@ -2066,7 +2191,7 @@ static int ArrowIpcDecoderWalkSetArrayView(struct
ArrowIpcArraySetter* setter,
for (int64_t i = 0; i < array_view->n_children; i++) {
NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderWalkSetArrayView(
- setter, array_view->children[i], array->children[i], error));
+ decoder, setter, array_view->children[i], array->children[i], error));
}
return NANOARROW_OK;
@@ -2158,12 +2283,13 @@ static ArrowErrorCode
ArrowIpcDecoderDecodeArrayViewInternal(
setter.buffer_i++;
for (int64_t i = 0; i < root->array_view->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderWalkSetArrayView(
- &setter, root->array_view->children[i], root->array->children[i],
error));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowIpcDecoderWalkSetArrayView(decoder, &setter,
root->array_view->children[i],
+ root->array->children[i], error));
}
} else {
- NANOARROW_RETURN_NOT_OK(
- ArrowIpcDecoderWalkSetArrayView(&setter, root->array_view,
root->array, error));
+ NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderWalkSetArrayView(
+ decoder, &setter, root->array_view, root->array, error));
}
// If we decoded a compressed message, wait for any pending decompression
tasks to
diff --git a/src/nanoarrow/ipc/decoder_test.cc
b/src/nanoarrow/ipc/decoder_test.cc
index 40322908..5f958367 100644
--- a/src/nanoarrow/ipc/decoder_test.cc
+++ b/src/nanoarrow/ipc/decoder_test.cc
@@ -44,6 +44,7 @@ struct ArrowIpcField {
struct ArrowArrayView* array_view;
struct ArrowArray* array;
int64_t buffer_offset;
+ int64_t dictionary_id;
};
struct ArrowIpcDecoderPrivate {
@@ -589,6 +590,7 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionarySchema) {
struct ArrowIpcDecoder decoder;
struct ArrowError error;
struct ArrowSchema schema;
+ struct ArrowIpcDictionaryEncodings dictionary_encodings;
struct ArrowBufferView data;
data.data.as_uint8 = kDictionarySchema;
@@ -599,7 +601,9 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionarySchema) {
EXPECT_EQ(ArrowIpcDecoderDecodeHeader(&decoder, data, &error), NANOARROW_OK);
ASSERT_EQ(decoder.message_type, NANOARROW_IPC_MESSAGE_TYPE_SCHEMA);
- ASSERT_EQ(ArrowIpcDecoderDecodeSchema(&decoder, &schema, &error),
NANOARROW_OK);
+ ASSERT_EQ(ArrowIpcDecoderDecodeSchemaWithDictionaries(&decoder, &schema,
+ &dictionary_encodings,
&error),
+ NANOARROW_OK);
ASSERT_EQ(schema.n_children, 1);
EXPECT_STREQ(schema.children[0]->name, "some_col");
EXPECT_EQ(schema.children[0]->flags, ARROW_FLAG_NULLABLE);
@@ -608,7 +612,33 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionarySchema)
{
ASSERT_NE(schema.children[0]->dictionary, nullptr);
EXPECT_STREQ(schema.children[0]->dictionary->format, "u");
+ // The dictionary encodings should fail to locate anything except the
dictionary-encoded
+ // field
+ ASSERT_EQ(ArrowIpcDictionaryEncodingsFind(&dictionary_encodings, nullptr),
nullptr);
+ ASSERT_EQ(ArrowIpcDictionaryEncodingsFind(&dictionary_encodings, &schema),
nullptr);
+ const struct ArrowIpcDictionaryEncoding* encoding =
+ ArrowIpcDictionaryEncodingsFind(&dictionary_encodings,
schema.children[0]);
+ ASSERT_NE(encoding, nullptr);
+ ASSERT_EQ(encoding->schema, schema.children[0]);
+ ASSERT_EQ(encoding->id, 0);
+ ASSERT_EQ(encoding->kind, NANOARROW_IPC_DICTIONARY_KIND_DENSE_ARRAY);
+
+ // If we try to set the schema without the dictionaries, we should get an
error
+ ASSERT_EQ(ArrowIpcDecoderSetSchema(&decoder, &schema, &error), EINVAL);
+ ASSERT_STREQ(error.message,
+ "Can't resolve dictionary ID for field 'some_col' (dictionary
encodings "
+ "not provided)");
+
+ // When we do set the schema, the ID should propagate into the fields
+ ASSERT_EQ(ArrowIpcDecoderSetSchemaWithDictionaries(&decoder, &schema,
+ &dictionary_encodings,
&error),
+ NANOARROW_OK);
+ auto decoder_private =
+ reinterpret_cast<struct ArrowIpcDecoderPrivate*>(decoder.private_data);
+ ASSERT_EQ(decoder_private->fields[1].dictionary_id, 0);
+
ArrowSchemaRelease(&schema);
+ ArrowIpcDictionaryEncodingsReset(&dictionary_encodings);
ArrowIpcDecoderReset(&decoder);
}
diff --git a/src/nanoarrow/ipc/encoder.c b/src/nanoarrow/ipc/encoder.c
index d5876143..8b3fc489 100644
--- a/src/nanoarrow/ipc/encoder.c
+++ b/src/nanoarrow/ipc/encoder.c
@@ -629,6 +629,7 @@ ArrowErrorCode ArrowIpcEncoderEncodeSimpleRecordBatch(
void ArrowIpcFooterInit(struct ArrowIpcFooter* footer) {
footer->schema.release = NULL;
ArrowBufferInit(&footer->record_batch_blocks);
+ ArrowIpcDictionaryEncodingsInit(&footer->dictionaries);
}
void ArrowIpcFooterReset(struct ArrowIpcFooter* footer) {
@@ -636,6 +637,7 @@ void ArrowIpcFooterReset(struct ArrowIpcFooter* footer) {
ArrowSchemaRelease(&footer->schema);
}
ArrowBufferReset(&footer->record_batch_blocks);
+ ArrowIpcDictionaryEncodingsReset(&footer->dictionaries);
}
ArrowErrorCode ArrowIpcEncoderEncodeFooter(struct ArrowIpcEncoder* encoder,
diff --git a/src/nanoarrow/ipc/ipc_hpp_test.cc
b/src/nanoarrow/ipc/ipc_hpp_test.cc
index e5867395..b0a84090 100644
--- a/src/nanoarrow/ipc/ipc_hpp_test.cc
+++ b/src/nanoarrow/ipc/ipc_hpp_test.cc
@@ -86,3 +86,18 @@ TEST(NanoarrowIpcHppTest,
NanoarrowIpcHppTestUniqueOutputStream) {
EXPECT_NE(output2->release, nullptr);
EXPECT_EQ(output->release, nullptr); //
NOLINT(clang-analyzer-cplusplus.Move)
}
+
+TEST(NanoarrowIpcHppTest, NanoarrowIpcHppTestUniqueDictionaryEncodings) {
+ nanoarrow::ipc::UniqueDictionaryEncodings dictionary_encodings;
+ ASSERT_EQ(ArrowIpcDictionaryEncodingsAppend(
+ dictionary_encodings.get(),
+ {nullptr, 1, NANOARROW_IPC_DICTIONARY_KIND_DENSE_ARRAY}),
+ NANOARROW_OK);
+
+ nanoarrow::ipc::UniqueDictionaryEncodings dictionary_encodings2 =
+ std::move(dictionary_encodings);
+ EXPECT_NE(dictionary_encodings2->encodings.data, nullptr);
+ EXPECT_EQ(
+ dictionary_encodings->encodings.data, //
NOLINT(clang-analyzer-cplusplus.Move)
+ nullptr);
+}
diff --git a/src/nanoarrow/ipc/reader.c b/src/nanoarrow/ipc/reader.c
index 7ecc4ccd..c70b1448 100644
--- a/src/nanoarrow/ipc/reader.c
+++ b/src/nanoarrow/ipc/reader.c
@@ -403,19 +403,22 @@ static int ArrowIpcArrayStreamReaderReadSchemaIfNeeded(
&private_data->error);
struct ArrowSchema tmp;
- NANOARROW_RETURN_NOT_OK(
- ArrowIpcDecoderDecodeSchema(&private_data->decoder, &tmp,
&private_data->error));
+ struct ArrowIpcDictionaryEncodings dictionary_encodings;
+ NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeSchemaWithDictionaries(
+ &private_data->decoder, &tmp, &dictionary_encodings,
&private_data->error));
// Only support "read the whole thing" for now
if (private_data->field_index != -1) {
ArrowSchemaRelease(&tmp);
+ ArrowIpcDictionaryEncodingsReset(&dictionary_encodings);
ArrowErrorSet(&private_data->error, "Field index != -1 is not yet
supported");
return ENOTSUP;
}
// Notify the decoder of the schema for forthcoming messages
- int result =
- ArrowIpcDecoderSetSchema(&private_data->decoder, &tmp,
&private_data->error);
+ int result = ArrowIpcDecoderSetSchemaWithDictionaries(
+ &private_data->decoder, &tmp, &dictionary_encodings,
&private_data->error);
+ ArrowIpcDictionaryEncodingsReset(&dictionary_encodings);
if (result != NANOARROW_OK) {
ArrowSchemaRelease(&tmp);
return result;
diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h
index a1a172cb..ba94e5b5 100644
--- a/src/nanoarrow/nanoarrow_ipc.h
+++ b/src/nanoarrow/nanoarrow_ipc.h
@@ -49,6 +49,8 @@
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeHeader)
#define ArrowIpcDecoderDecodeSchema \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeSchema)
+#define ArrowIpcDecoderDecodeSchemaWithDictionaries \
+ NANOARROW_SYMBOL(NANOARROW_NAMESPACE,
ArrowIpcDecoderDecodeSchemaWithDictionaries)
#define ArrowIpcDecoderDecodeArrayView \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayView)
#define ArrowIpcDecoderDecodeArray \
@@ -57,6 +59,8 @@
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayFromShared)
#define ArrowIpcDecoderSetSchema \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderSetSchema)
+#define ArrowIpcDecoderSetSchemaWithDictionaries \
+ NANOARROW_SYMBOL(NANOARROW_NAMESPACE,
ArrowIpcDecoderSetSchemaWithDictionaries)
#define ArrowIpcDecoderSetEndianness \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderSetEndianness)
#define ArrowIpcDecoderPeekFooter \
@@ -105,6 +109,14 @@
#define ArrowIpcFooterReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE,
ArrowIpcFooterReset)
#define ArrowIpcEncoderEncodeFooter \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcEncoderEncodeFooter)
+#define ArrowIpcDictionaryEncodingsInit \
+ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsInit)
+#define ArrowIpcDictionaryEncodingsAppend \
+ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsAppend)
+#define ArrowIpcDictionaryEncodingsFind \
+ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsFind)
+#define ArrowIpcDictionaryEncodingsReset \
+ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsReset)
#endif
@@ -156,6 +168,12 @@ enum ArrowIpcCompressionType {
NANOARROW_IPC_COMPRESSION_TYPE_ZSTD
};
+/// \brief Dictionary kind enumerator
+enum ArrowIpcDictionaryKind {
+ NANOARROW_IPC_DICTIONARY_KIND_UNINITIALIZED,
+ NANOARROW_IPC_DICTIONARY_KIND_DENSE_ARRAY
+};
+
/// \brief Feature flag for a stream that uses dictionary replacement
#define NANOARROW_IPC_FEATURE_DICTIONARY_REPLACEMENT 1
@@ -171,6 +189,60 @@ struct ArrowIpcDictionaryBatch {
int is_delta;
};
+/// \brief Description of a dictionary-encoded field
+///
+/// This struct is intended to be passed by value; however, its data is
invalidated
+/// if the underlying ArrowSchema that contains the dictionary-encoded field is
+/// released.
+struct ArrowIpcDictionaryEncoding {
+ /// \brief A pointer to the ArrowSchema node of the dictionary-encoded field
+ ///
+ /// This is a reference into another object and care must be taken to ensure
+ /// that if that object is copied that the schema pointers are updated
+ /// appropriately.
+ const struct ArrowSchema* schema;
+
+ /// \brief The identifier used that will appear in dictionary batch messages
+ int64_t id;
+
+ /// \brief The dictionary kind
+ ///
+ /// Currently only one dictionary kind is permitted by the Arrow
specification
+ /// (DenseArray).
+ enum ArrowIpcDictionaryKind kind;
+};
+
+/// \brief List of ArrowIpcDictionaryEncoding structs
+///
+/// This structure provides a list of dictionary encoded fields extracted
+/// from an ArrowSchema during decoding. Its members refer to pointers
+/// within a specific schema, so care must be taken to keep the schema
+/// containing the pointed-to ArrowSchema fields valid.
+struct ArrowIpcDictionaryEncodings {
+ struct ArrowBuffer encodings;
+};
+
+/// \brief Initialize an ArrowIpcDictionaryEncodings list
+NANOARROW_DLL void ArrowIpcDictionaryEncodingsInit(
+ struct ArrowIpcDictionaryEncodings* dictionaries);
+
+/// \brief Append a given ArrowIpcDictionaryEncoding to this list
+NANOARROW_DLL ArrowErrorCode
+ArrowIpcDictionaryEncodingsAppend(struct ArrowIpcDictionaryEncodings*
dictionaries,
+ struct ArrowIpcDictionaryEncoding encoding);
+
+/// \brief Resolve a ArrowIpcDictionaryEncoding for a given dictionary encoded
field
+///
+/// Returns NULL if the pointed to schema does not match any of the pointed to
+/// schemas contained in this list.
+NANOARROW_DLL const struct ArrowIpcDictionaryEncoding*
ArrowIpcDictionaryEncodingsFind(
+ const struct ArrowIpcDictionaryEncodings* dictionaries,
+ const struct ArrowSchema* schema);
+
+/// \brief Release an encodings list and associated resources
+NANOARROW_DLL void ArrowIpcDictionaryEncodingsReset(
+ struct ArrowIpcDictionaryEncodings* dictionaries);
+
/// \brief Checks the nanoarrow runtime to make sure the run/build versions
match
NANOARROW_DLL ArrowErrorCode ArrowIpcCheckRuntime(struct ArrowError* error);
@@ -392,6 +464,8 @@ NANOARROW_DLL ArrowErrorCode
ArrowIpcDecoderDecodeHeader(struct ArrowIpcDecoder*
///
/// After a successful call to ArrowIpcDecoderDecodeHeader(), retrieve an
ArrowSchema.
/// The caller is responsible for releasing the schema if NANOARROW_OK is
returned.
+/// This is equivalent to calling
ArrowIpcDecoderDecodeSchemaWithDictionaries() with
+/// dictionaries_out = NULL.
///
/// Returns EINVAL if the decoder did not just decode a schema message or
/// NANOARROW_OK otherwise.
@@ -399,19 +473,52 @@ NANOARROW_DLL ArrowErrorCode
ArrowIpcDecoderDecodeSchema(struct ArrowIpcDecoder*
struct ArrowSchema*
out,
struct ArrowError*
error);
+/// \brief Decode an ArrowSchema with dictionary encoding information
+///
+/// After a successful call to ArrowIpcDecoderDecodeHeader(), retrieve an
ArrowSchema.
+/// The caller is responsible for releasing the schema if NANOARROW_OK is
returned.
+/// Neither out nor dictionaries_out should be initialized; dictionaries_out
may be
+/// null to omit exporting dictionary identifiers.
+///
+/// Returns EINVAL if the decoder did not just decode a schema message or
+/// NANOARROW_OK otherwise.
+NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeSchemaWithDictionaries(
+ struct ArrowIpcDecoder* decoder, struct ArrowSchema* out,
+ struct ArrowIpcDictionaryEncodings* dictionaries_out, struct ArrowError*
error);
+
/// \brief Set the ArrowSchema used to decode future record batch messages
///
/// Prepares the decoder for future record batch messages
-/// of this type. The decoder takes ownership of schema if NANOARROW_OK is
returned.
+/// of this type. The decoder does not take ownership of schema.
/// Note that you must call this explicitly after decoding a
/// Schema message (i.e., the decoder does not assume that the last-decoded
/// schema message applies to future record batch messages).
///
+/// This is equivalent to calling ArrowIpcDecoderSetSchemaWithDictionaries()
with
+/// dictionary_encodings = NULL.
+///
/// Returns EINVAL if schema validation fails or NANOARROW_OK otherwise.
NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderSetSchema(struct ArrowIpcDecoder*
decoder,
- struct ArrowSchema*
schema,
+ const struct
ArrowSchema* schema,
struct ArrowError*
error);
+/// \brief Set the ArrowSchema and dictionary encodings used to decode future
record batch
+/// messages
+///
+/// Prepares the decoder for future record batch messages
+/// of this type. The decoder does not take ownership of schema.
+/// Note that you must call this explicitly after decoding a
+/// Schema message (i.e., the decoder does not assume that the last-decoded
+/// schema message applies to future record batch messages).
+///
+/// Returns EINVAL if schema validation fails or if the schema contains
+/// dictionary encodings that could not be resolved in the provided
+/// ArrowIpcDictionaryEncodings object, or NANOARROW_OK otherwise.
+NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderSetSchemaWithDictionaries(
+ struct ArrowIpcDecoder* decoder, const struct ArrowSchema* schema,
+ const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+ struct ArrowError* error);
+
/// \brief Set the endianness used to decode future record batch messages
///
/// Prepares the decoder for future record batch messages with the specified
@@ -708,8 +815,10 @@ struct ArrowIpcFileBlock {
/// This structure is intended to be allocated by the caller, initialized using
/// ArrowIpcFooterInit(), and released with ArrowIpcFooterReset().
struct ArrowIpcFooter {
- /// \brief the Footer's embedded Schema
+ /// \brief The Footer's embedded Schema
struct ArrowSchema schema;
+ /// \brief Dictionaries present in the footer Schema
+ struct ArrowIpcDictionaryEncodings dictionaries;
/// \brief all blocks containing RecordBatch Messages
struct ArrowBuffer record_batch_blocks;
};
diff --git a/src/nanoarrow/nanoarrow_ipc.hpp b/src/nanoarrow/nanoarrow_ipc.hpp
index a9fadf1f..84f1d90b 100644
--- a/src/nanoarrow/nanoarrow_ipc.hpp
+++ b/src/nanoarrow/nanoarrow_ipc.hpp
@@ -57,6 +57,23 @@ inline void release_pointer(struct ArrowIpcDecoder* data) {
ArrowIpcDecoderReset(data);
}
+template <>
+inline void init_pointer(struct ArrowIpcDictionaryEncodings* data) {
+ ArrowIpcDictionaryEncodingsInit(data);
+}
+
+template <>
+inline void move_pointer(struct ArrowIpcDictionaryEncodings* src,
+ struct ArrowIpcDictionaryEncodings* dst) {
+ memcpy(dst, src, sizeof(struct ArrowIpcDictionaryEncodings));
+ ArrowIpcDictionaryEncodingsInit(src);
+}
+
+template <>
+inline void release_pointer(struct ArrowIpcDictionaryEncodings* data) {
+ ArrowIpcDictionaryEncodingsReset(data);
+}
+
template <>
inline void init_pointer(struct ArrowIpcFooter* data) {
ArrowIpcFooterInit(data);
@@ -66,6 +83,7 @@ template <>
inline void move_pointer(struct ArrowIpcFooter* src, struct ArrowIpcFooter*
dst) {
ArrowSchemaMove(&src->schema, &dst->schema);
ArrowBufferMove(&src->record_batch_blocks, &dst->record_batch_blocks);
+ move_pointer(&src->dictionaries, &dst->dictionaries);
}
template <>
@@ -185,6 +203,9 @@ using UniqueDecoder = internal::Unique<struct
ArrowIpcDecoder>;
/// \brief Class wrapping a unique struct ArrowIpcFooter
using UniqueFooter = internal::Unique<struct ArrowIpcFooter>;
+/// \brief Class wrapping a unique struct ArrowIpcDictionaryEncodings
+using UniqueDictionaryEncodings = internal::Unique<struct
ArrowIpcDictionaryEncodings>;
+
/// \brief Class wrapping a unique struct ArrowIpcEncoder
using UniqueEncoder = internal::Unique<struct ArrowIpcEncoder>;