This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 31a4634c feat: Add ArrowIpcDictionaryMapping to track dictionary 
identifiers when parsing schema message (#856)
31a4634c is described below

commit 31a4634c8a041ae32b814919c76108a77a47a139
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Mar 23 08:48:52 2026 -0500

    feat: Add ArrowIpcDictionaryMapping to track dictionary identifiers when 
parsing schema message (#856)
    
    This PR adds `struct ArrowIpcDictionaryEncoding` and `struct
    ArrowIpcDictionaryEncodings` that accompany an `ArrowSchema` to map
    fields to dictionary IDs. The primary purpose of this is to assign
    dictionary IDs to the (internal) `struct ArrowIpcField` so that the next
    PR can make use of that to actually decode them. There are now
    `ArrowIpcDecoderDecodeSchemaWithDictionaries()` and
    `ArrowIpcDecoderSetSchemaWithDictionaries()` functions to match
    `ArrowIpcDecoderDecodeSchema()` and `ArrowIpcDecoderSetSchema()`
    (respectively).
    
    This is largely an implementation detail that simply transports
    information from the Flatbuffers schema to the decoder internals.
    
    Closes #844.
    
    ---------
    
    Co-authored-by: Copilot Autofix powered by AI 
<[email protected]>
---
 src/nanoarrow/ipc/decoder.c       | 208 ++++++++++++++++++++++++++++++--------
 src/nanoarrow/ipc/decoder_test.cc |  32 +++++-
 src/nanoarrow/ipc/encoder.c       |   2 +
 src/nanoarrow/ipc/ipc_hpp_test.cc |  15 +++
 src/nanoarrow/ipc/reader.c        |  11 +-
 src/nanoarrow/nanoarrow_ipc.h     | 115 ++++++++++++++++++++-
 src/nanoarrow/nanoarrow_ipc.hpp   |  21 ++++
 7 files changed, 355 insertions(+), 49 deletions(-)

diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c
index f3b4b443..400d483e 100644
--- a/src/nanoarrow/ipc/decoder.c
+++ b/src/nanoarrow/ipc/decoder.c
@@ -17,6 +17,7 @@
 
 #include <errno.h>
 #include <inttypes.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 
@@ -54,6 +55,8 @@
 
 #define NANOARROW_IPC_MAGIC "ARROW1"
 
+#define NANOARROW_IPC_NO_DICTIONARY_ID INT64_MIN
+
 // Internal representation of a parsed "Field" from flatbuffers. This
 // represents a field in a depth-first walk of column arrays and their
 // children.
@@ -66,6 +69,9 @@ struct ArrowIpcField {
   struct ArrowArray* array;
   // The cumulative number of buffers preceding this node.
   int64_t buffer_offset;
+  // Dictionary identifier (or NANOARROW_IPC_NO_DICTIONARY_ID if this is not a
+  // dictionary-encoded field)
+  int64_t dictionary_id;
 };
 
 // Internal data specific to the read/decode process
@@ -272,6 +278,46 @@ static int ArrowIpcDecoderNeedsSwapEndian(struct 
ArrowIpcDecoder* decoder) {
   }
 }
 
+void ArrowIpcDictionaryEncodingsInit(
+    struct ArrowIpcDictionaryEncodings* dictionary_encodings) {
+  NANOARROW_DCHECK(dictionary_encodings != NULL);
+  ArrowBufferInit(&dictionary_encodings->encodings);
+}
+
+ArrowErrorCode ArrowIpcDictionaryEncodingsAppend(
+    struct ArrowIpcDictionaryEncodings* dictionaries,
+    struct ArrowIpcDictionaryEncoding encoding) {
+  NANOARROW_DCHECK(dictionaries != NULL);
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(&dictionaries->encodings, 
&encoding,
+                                            sizeof(struct 
ArrowIpcDictionaryEncoding)));
+  return NANOARROW_OK;
+}
+
+const struct ArrowIpcDictionaryEncoding* ArrowIpcDictionaryEncodingsFind(
+    const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+    const struct ArrowSchema* schema) {
+  NANOARROW_DCHECK(dictionary_encodings != NULL);
+  int64_t length = dictionary_encodings->encodings.size_bytes /
+                   sizeof(struct ArrowIpcDictionaryEncoding);
+  const struct ArrowIpcDictionaryEncoding* data =
+      (const struct 
ArrowIpcDictionaryEncoding*)dictionary_encodings->encodings.data;
+
+  for (int64_t i = 0; i < length; i++) {
+    const struct ArrowIpcDictionaryEncoding* encoding = data + i;
+    if (encoding->schema == schema) {
+      return encoding;
+    }
+  }
+
+  return NULL;
+}
+
+void ArrowIpcDictionaryEncodingsReset(
+    struct ArrowIpcDictionaryEncodings* dictionary_encodings) {
+  NANOARROW_DCHECK(dictionary_encodings != NULL);
+  ArrowBufferReset(&dictionary_encodings->encodings);
+}
+
 ArrowErrorCode ArrowIpcDecoderInit(struct ArrowIpcDecoder* decoder) {
   memset(decoder, 0, sizeof(struct ArrowIpcDecoder));
   struct ArrowIpcDecoderPrivate* private_data =
@@ -943,7 +989,7 @@ static int 
ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded(
 
 static int ArrowIpcSetDictionaryEncoding(
     struct ArrowSchema* schema, ns(DictionaryEncoding_table_t 
dictionary_encoding),
-    struct ArrowError* error) {
+    struct ArrowIpcDictionaryEncodings* dictionaries, struct ArrowError* 
error) {
   switch (
       
org_apache_arrow_flatbuf_DictionaryEncoding_dictionaryKind(dictionary_encoding))
 {
     case ns(DictionaryKind_DenseArray):
@@ -982,16 +1028,25 @@ static int ArrowIpcSetDictionaryEncoding(
   NANOARROW_RETURN_NOT_OK_WITH_ERROR(
       ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded(schema), error);
 
-  // TODO: Track the dictionary
-  // https://github.com/apache/arrow-nanoarrow/issues/844
+  // Track the identifier if we have a dictionaries object in which to track it
+  if (dictionaries != NULL) {
+    int64_t id = ns(DictionaryEncoding_id(dictionary_encoding));
+    struct ArrowIpcDictionaryEncoding encoding = {
+        schema, id, NANOARROW_IPC_DICTIONARY_KIND_DENSE_ARRAY};
+
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+        ArrowIpcDictionaryEncodingsAppend(dictionaries, encoding), error);
+  }
 
   return NANOARROW_OK;
 }
 
 static int ArrowIpcDecoderSetChildren(struct ArrowSchema* schema, 
ns(Field_vec_t) fields,
+                                      struct ArrowIpcDictionaryEncodings* 
dictionaries,
                                       struct ArrowError* error);
 
 static int ArrowIpcDecoderSetField(struct ArrowSchema* schema, 
ns(Field_table_t) field,
+                                   struct ArrowIpcDictionaryEncodings* 
dictionaries,
                                    struct ArrowError* error) {
   int result;
   if (ns(Field_name_is_present(field))) {
@@ -1032,26 +1087,29 @@ static int ArrowIpcDecoderSetField(struct ArrowSchema* 
schema, ns(Field_table_t)
     ArrowSchemaInit(schema->children[i]);
   }
 
-  NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderSetChildren(schema, children, error));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowIpcDecoderSetChildren(schema, children, dictionaries, error));
   NANOARROW_RETURN_NOT_OK(
       ArrowIpcDecoderSetMetadata(schema, ns(Field_custom_metadata(field)), 
error));
 
   // If this is a dictionary encoded field, set the dictionary encoding
   if (ns(Field_dictionary_is_present(field))) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowIpcSetDictionaryEncoding(schema, ns(Field_dictionary(field)), 
error));
+    NANOARROW_RETURN_NOT_OK(ArrowIpcSetDictionaryEncoding(
+        schema, ns(Field_dictionary(field)), dictionaries, error));
   }
 
   return NANOARROW_OK;
 }
 
 static int ArrowIpcDecoderSetChildren(struct ArrowSchema* schema, 
ns(Field_vec_t) fields,
+                                      struct ArrowIpcDictionaryEncodings* 
dictionaries,
                                       struct ArrowError* error) {
   int64_t n_fields = ns(Schema_vec_len(fields));
 
   for (int64_t i = 0; i < n_fields; i++) {
     ns(Field_table_t) field = ns(Field_vec_at(fields, i));
-    NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderSetField(schema->children[i], 
field, error));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowIpcDecoderSetField(schema->children[i], field, dictionaries, 
error));
   }
 
   return NANOARROW_OK;
@@ -1447,9 +1505,10 @@ ArrowErrorCode ArrowIpcDecoderDecodeHeader(struct 
ArrowIpcDecoder* decoder,
   return NANOARROW_OK;
 }
 
-static ArrowErrorCode ArrowIpcDecoderDecodeSchemaImpl(ns(Schema_table_t) 
schema,
-                                                      struct ArrowSchema* out,
-                                                      struct ArrowError* 
error) {
+static ArrowErrorCode ArrowIpcDecoderDecodeSchemaImpl(
+    ns(Schema_table_t) schema, struct ArrowSchema* out,
+    struct ArrowIpcDictionaryEncodings* dictionary_encodings_out,
+    struct ArrowError* error) {
   ArrowSchemaInit(out);
   // Top-level batch schema is typically non-nullable
   out->flags = 0;
@@ -1464,15 +1523,17 @@ static ArrowErrorCode 
ArrowIpcDecoderDecodeSchemaImpl(ns(Schema_table_t) schema,
     return result;
   }
 
-  NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderSetChildren(out, fields, error));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowIpcDecoderSetChildren(out, fields, dictionary_encodings_out, 
error));
   NANOARROW_RETURN_NOT_OK(
       ArrowIpcDecoderSetMetadata(out, ns(Schema_custom_metadata(schema)), 
error));
   return NANOARROW_OK;
 }
 
-ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct ArrowIpcDecoder* decoder,
-                                           struct ArrowSchema* out,
-                                           struct ArrowError* error) {
+ArrowErrorCode ArrowIpcDecoderDecodeSchemaWithDictionaries(
+    struct ArrowIpcDecoder* decoder, struct ArrowSchema* out,
+    struct ArrowIpcDictionaryEncodings* dictionary_encodings_out,
+    struct ArrowError* error) {
   struct ArrowIpcDecoderPrivate* private_data =
       (struct ArrowIpcDecoderPrivate*)decoder->private_data;
 
@@ -1482,18 +1543,34 @@ ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct 
ArrowIpcDecoder* decoder,
     return EINVAL;
   }
 
+  if (dictionary_encodings_out != NULL) {
+    ArrowIpcDictionaryEncodingsInit(dictionary_encodings_out);
+  }
+
   struct ArrowSchema tmp;
-  ArrowErrorCode result = ArrowIpcDecoderDecodeSchemaImpl(
-      (ns(Schema_table_t))private_data->last_message, &tmp, error);
+  ArrowErrorCode result =
+      
ArrowIpcDecoderDecodeSchemaImpl((ns(Schema_table_t))private_data->last_message,
+                                      &tmp, dictionary_encodings_out, error);
 
   if (result != NANOARROW_OK) {
     ArrowSchemaRelease(&tmp);
+
+    if (dictionary_encodings_out != NULL) {
+      ArrowIpcDictionaryEncodingsReset(dictionary_encodings_out);
+    }
+
     return result;
   }
   ArrowSchemaMove(&tmp, out);
   return NANOARROW_OK;
 }
 
+ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct ArrowIpcDecoder* decoder,
+                                           struct ArrowSchema* out,
+                                           struct ArrowError* error) {
+  return ArrowIpcDecoderDecodeSchemaWithDictionaries(decoder, out, NULL, 
error);
+}
+
 ArrowErrorCode ArrowIpcDecoderDecodeFooter(struct ArrowIpcDecoder* decoder,
                                            struct ArrowBufferView data,
                                            struct ArrowError* error) {
@@ -1510,7 +1587,8 @@ ArrowErrorCode ArrowIpcDecoderDecodeFooter(struct 
ArrowIpcDecoder* decoder,
       ArrowIpcDecoderDecodeSchemaHeader(decoder, ns(Footer_schema(footer)), 
error));
 
   NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeSchemaImpl(
-      ns(Footer_schema(footer)), &private_data->footer.schema, error));
+      ns(Footer_schema(footer)), &private_data->footer.schema,
+      &private_data->footer.dictionaries, error));
 
   ns(Block_vec_t) blocks = ns(Footer_recordBatches(footer));
   int64_t n = ns(Block_vec_len(blocks));
@@ -1529,21 +1607,53 @@ ArrowErrorCode ArrowIpcDecoderDecodeFooter(struct 
ArrowIpcDecoder* decoder,
   return NANOARROW_OK;
 }
 
-static void ArrowIpcDecoderCountFields(struct ArrowSchema* schema, int64_t* 
n_fields) {
+static void ArrowIpcDecoderCountFields(const struct ArrowSchema* schema,
+                                       int64_t* n_fields) {
   *n_fields += 1;
   for (int64_t i = 0; i < schema->n_children; i++) {
     ArrowIpcDecoderCountFields(schema->children[i], n_fields);
   }
 }
 
-static void ArrowIpcDecoderInitFields(struct ArrowIpcField* fields,
-                                      struct ArrowArrayView* array_view,
-                                      struct ArrowArray* array, int64_t* 
n_fields,
-                                      int64_t* n_buffers, int64_t* 
n_union_fields) {
+static ArrowErrorCode ArrowIpcDecoderInitFields(
+    struct ArrowIpcField* fields, const struct ArrowSchema* schema,
+    const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+    struct ArrowArrayView* array_view, struct ArrowArray* array, int64_t* 
n_fields,
+    int64_t* n_buffers, int64_t* n_union_fields, struct ArrowError* error) {
   struct ArrowIpcField* field = fields + (*n_fields);
   field->array_view = array_view;
   field->array = array;
   field->buffer_offset = *n_buffers;
+  field->dictionary_id = NANOARROW_IPC_NO_DICTIONARY_ID;
+
+  if (schema->dictionary != NULL) {
+    if (dictionary_encodings == NULL) {
+      const char* name = schema->name;
+      if (name == NULL) {
+        name = "<unnamed field>";
+      }
+
+      ArrowErrorSet(error,
+                    "Can't resolve dictionary ID for field '%s' (dictionary 
encodings "
+                    "not provided)",
+                    name);
+      return EINVAL;
+    }
+
+    const struct ArrowIpcDictionaryEncoding* dictionary_encoding =
+        ArrowIpcDictionaryEncodingsFind(dictionary_encodings, schema);
+    if (dictionary_encoding == NULL) {
+      const char* name = schema->name;
+      if (name == NULL) {
+        name = "<unnamed field>";
+      }
+
+      ArrowErrorSet(error, "Can't resolve dictionary ID for field '%s'", name);
+      return EINVAL;
+    }
+
+    field->dictionary_id = dictionary_encoding->id;
+  }
 
   for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
     *n_buffers += array_view->layout.buffer_type[i] != 
NANOARROW_BUFFER_TYPE_NONE;
@@ -1554,14 +1664,18 @@ static void ArrowIpcDecoderInitFields(struct 
ArrowIpcField* fields,
   *n_fields += 1;
 
   for (int64_t i = 0; i < array_view->n_children; i++) {
-    ArrowIpcDecoderInitFields(fields, array_view->children[i], 
array->children[i],
-                              n_fields, n_buffers, n_union_fields);
+    NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderInitFields(
+        fields, schema->children[i], dictionary_encodings, 
array_view->children[i],
+        array->children[i], n_fields, n_buffers, n_union_fields, error));
   }
+
+  return NANOARROW_OK;
 }
 
-ArrowErrorCode ArrowIpcDecoderSetSchema(struct ArrowIpcDecoder* decoder,
-                                        struct ArrowSchema* schema,
-                                        struct ArrowError* error) {
+ArrowErrorCode ArrowIpcDecoderSetSchemaWithDictionaries(
+    struct ArrowIpcDecoder* decoder, const struct ArrowSchema* schema,
+    const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+    struct ArrowError* error) {
   struct ArrowIpcDecoderPrivate* private_data =
       (struct ArrowIpcDecoderPrivate*)decoder->private_data;
 
@@ -1602,13 +1716,20 @@ ArrowErrorCode ArrowIpcDecoderSetSchema(struct 
ArrowIpcDecoder* decoder,
 
   // Init field information and calculate starting buffer offset for each
   int64_t field_i = 0;
-  ArrowIpcDecoderInitFields(private_data->fields, &private_data->array_view,
-                            &private_data->array, &field_i, 
&private_data->n_buffers,
-                            &private_data->n_union_fields);
+  NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderInitFields(
+      private_data->fields, schema, dictionary_encodings, 
&private_data->array_view,
+      &private_data->array, &field_i, &private_data->n_buffers,
+      &private_data->n_union_fields, error));
 
   return NANOARROW_OK;
 }
 
+ArrowErrorCode ArrowIpcDecoderSetSchema(struct ArrowIpcDecoder* decoder,
+                                        const struct ArrowSchema* schema,
+                                        struct ArrowError* error) {
+  return ArrowIpcDecoderSetSchemaWithDictionaries(decoder, schema, NULL, 
error);
+}
+
 ArrowErrorCode ArrowIpcDecoderSetEndianness(struct ArrowIpcDecoder* decoder,
                                             enum ArrowIpcEndianness 
endianness) {
   struct ArrowIpcDecoderPrivate* private_data =
@@ -2001,18 +2122,22 @@ static int ArrowIpcDecoderWalkGetArray(struct 
ArrowArrayView* array_view,
         array_view->children[i], array->children[i], out->children[i], error));
   }
 
-  if (array->dictionary != NULL) {
-    ArrowErrorSet(error, "Decode of dictionary array is not yet supported");
-    return ENOTSUP;
-  }
-
   return NANOARROW_OK;
 }
 
-static int ArrowIpcDecoderWalkSetArrayView(struct ArrowIpcArraySetter* setter,
+static int ArrowIpcDecoderWalkSetArrayView(struct ArrowIpcDecoder* decoder,
+                                           struct ArrowIpcArraySetter* setter,
                                            struct ArrowArrayView* array_view,
                                            struct ArrowArray* array,
                                            struct ArrowError* error) {
+  struct ArrowIpcDecoderPrivate* private_data =
+      (struct ArrowIpcDecoderPrivate*)decoder->private_data;
+  struct ArrowIpcField* ipc_field = private_data->fields + setter->field_i;
+  if (ipc_field->dictionary_id != NANOARROW_IPC_NO_DICTIONARY_ID) {
+    ArrowErrorSet(error, "Decoding a dictionary-encoding field is not 
supported");
+    return ENOTSUP;
+  }
+
   ns(FieldNode_struct_t) field =
       ns(FieldNode_vec_at(setter->fields, (size_t)setter->field_i));
   array_view->length = ns(FieldNode_length(field));
@@ -2066,7 +2191,7 @@ static int ArrowIpcDecoderWalkSetArrayView(struct 
ArrowIpcArraySetter* setter,
 
   for (int64_t i = 0; i < array_view->n_children; i++) {
     NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderWalkSetArrayView(
-        setter, array_view->children[i], array->children[i], error));
+        decoder, setter, array_view->children[i], array->children[i], error));
   }
 
   return NANOARROW_OK;
@@ -2158,12 +2283,13 @@ static ArrowErrorCode 
ArrowIpcDecoderDecodeArrayViewInternal(
     setter.buffer_i++;
 
     for (int64_t i = 0; i < root->array_view->n_children; i++) {
-      NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderWalkSetArrayView(
-          &setter, root->array_view->children[i], root->array->children[i], 
error));
+      NANOARROW_RETURN_NOT_OK(
+          ArrowIpcDecoderWalkSetArrayView(decoder, &setter, 
root->array_view->children[i],
+                                          root->array->children[i], error));
     }
   } else {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowIpcDecoderWalkSetArrayView(&setter, root->array_view, 
root->array, error));
+    NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderWalkSetArrayView(
+        decoder, &setter, root->array_view, root->array, error));
   }
 
   // If we decoded a compressed message, wait for any pending decompression 
tasks to
diff --git a/src/nanoarrow/ipc/decoder_test.cc 
b/src/nanoarrow/ipc/decoder_test.cc
index 40322908..5f958367 100644
--- a/src/nanoarrow/ipc/decoder_test.cc
+++ b/src/nanoarrow/ipc/decoder_test.cc
@@ -44,6 +44,7 @@ struct ArrowIpcField {
   struct ArrowArrayView* array_view;
   struct ArrowArray* array;
   int64_t buffer_offset;
+  int64_t dictionary_id;
 };
 
 struct ArrowIpcDecoderPrivate {
@@ -589,6 +590,7 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionarySchema) {
   struct ArrowIpcDecoder decoder;
   struct ArrowError error;
   struct ArrowSchema schema;
+  struct ArrowIpcDictionaryEncodings dictionary_encodings;
 
   struct ArrowBufferView data;
   data.data.as_uint8 = kDictionarySchema;
@@ -599,7 +601,9 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionarySchema) {
   EXPECT_EQ(ArrowIpcDecoderDecodeHeader(&decoder, data, &error), NANOARROW_OK);
   ASSERT_EQ(decoder.message_type, NANOARROW_IPC_MESSAGE_TYPE_SCHEMA);
 
-  ASSERT_EQ(ArrowIpcDecoderDecodeSchema(&decoder, &schema, &error), 
NANOARROW_OK);
+  ASSERT_EQ(ArrowIpcDecoderDecodeSchemaWithDictionaries(&decoder, &schema,
+                                                        &dictionary_encodings, 
&error),
+            NANOARROW_OK);
   ASSERT_EQ(schema.n_children, 1);
   EXPECT_STREQ(schema.children[0]->name, "some_col");
   EXPECT_EQ(schema.children[0]->flags, ARROW_FLAG_NULLABLE);
@@ -608,7 +612,33 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionarySchema) 
{
   ASSERT_NE(schema.children[0]->dictionary, nullptr);
   EXPECT_STREQ(schema.children[0]->dictionary->format, "u");
 
+  // The dictionary encodings should fail to locate anything except the 
dictionary-encoded
+  // field
+  ASSERT_EQ(ArrowIpcDictionaryEncodingsFind(&dictionary_encodings, nullptr), 
nullptr);
+  ASSERT_EQ(ArrowIpcDictionaryEncodingsFind(&dictionary_encodings, &schema), 
nullptr);
+  const struct ArrowIpcDictionaryEncoding* encoding =
+      ArrowIpcDictionaryEncodingsFind(&dictionary_encodings, 
schema.children[0]);
+  ASSERT_NE(encoding, nullptr);
+  ASSERT_EQ(encoding->schema, schema.children[0]);
+  ASSERT_EQ(encoding->id, 0);
+  ASSERT_EQ(encoding->kind, NANOARROW_IPC_DICTIONARY_KIND_DENSE_ARRAY);
+
+  // If we try to set the schema without the dictionaries, we should get an 
error
+  ASSERT_EQ(ArrowIpcDecoderSetSchema(&decoder, &schema, &error), EINVAL);
+  ASSERT_STREQ(error.message,
+               "Can't resolve dictionary ID for field 'some_col' (dictionary 
encodings "
+               "not provided)");
+
+  // When we do set the schema, the ID should propagate into the fields
+  ASSERT_EQ(ArrowIpcDecoderSetSchemaWithDictionaries(&decoder, &schema,
+                                                     &dictionary_encodings, 
&error),
+            NANOARROW_OK);
+  auto decoder_private =
+      reinterpret_cast<struct ArrowIpcDecoderPrivate*>(decoder.private_data);
+  ASSERT_EQ(decoder_private->fields[1].dictionary_id, 0);
+
   ArrowSchemaRelease(&schema);
+  ArrowIpcDictionaryEncodingsReset(&dictionary_encodings);
   ArrowIpcDecoderReset(&decoder);
 }
 
diff --git a/src/nanoarrow/ipc/encoder.c b/src/nanoarrow/ipc/encoder.c
index d5876143..8b3fc489 100644
--- a/src/nanoarrow/ipc/encoder.c
+++ b/src/nanoarrow/ipc/encoder.c
@@ -629,6 +629,7 @@ ArrowErrorCode ArrowIpcEncoderEncodeSimpleRecordBatch(
 void ArrowIpcFooterInit(struct ArrowIpcFooter* footer) {
   footer->schema.release = NULL;
   ArrowBufferInit(&footer->record_batch_blocks);
+  ArrowIpcDictionaryEncodingsInit(&footer->dictionaries);
 }
 
 void ArrowIpcFooterReset(struct ArrowIpcFooter* footer) {
@@ -636,6 +637,7 @@ void ArrowIpcFooterReset(struct ArrowIpcFooter* footer) {
     ArrowSchemaRelease(&footer->schema);
   }
   ArrowBufferReset(&footer->record_batch_blocks);
+  ArrowIpcDictionaryEncodingsReset(&footer->dictionaries);
 }
 
 ArrowErrorCode ArrowIpcEncoderEncodeFooter(struct ArrowIpcEncoder* encoder,
diff --git a/src/nanoarrow/ipc/ipc_hpp_test.cc 
b/src/nanoarrow/ipc/ipc_hpp_test.cc
index e5867395..b0a84090 100644
--- a/src/nanoarrow/ipc/ipc_hpp_test.cc
+++ b/src/nanoarrow/ipc/ipc_hpp_test.cc
@@ -86,3 +86,18 @@ TEST(NanoarrowIpcHppTest, 
NanoarrowIpcHppTestUniqueOutputStream) {
   EXPECT_NE(output2->release, nullptr);
   EXPECT_EQ(output->release, nullptr);  // 
NOLINT(clang-analyzer-cplusplus.Move)
 }
+
+TEST(NanoarrowIpcHppTest, NanoarrowIpcHppTestUniqueDictionaryEncodings) {
+  nanoarrow::ipc::UniqueDictionaryEncodings dictionary_encodings;
+  ASSERT_EQ(ArrowIpcDictionaryEncodingsAppend(
+                dictionary_encodings.get(),
+                {nullptr, 1, NANOARROW_IPC_DICTIONARY_KIND_DENSE_ARRAY}),
+            NANOARROW_OK);
+
+  nanoarrow::ipc::UniqueDictionaryEncodings dictionary_encodings2 =
+      std::move(dictionary_encodings);
+  EXPECT_NE(dictionary_encodings2->encodings.data, nullptr);
+  EXPECT_EQ(
+      dictionary_encodings->encodings.data,  // 
NOLINT(clang-analyzer-cplusplus.Move)
+      nullptr);
+}
diff --git a/src/nanoarrow/ipc/reader.c b/src/nanoarrow/ipc/reader.c
index 7ecc4ccd..c70b1448 100644
--- a/src/nanoarrow/ipc/reader.c
+++ b/src/nanoarrow/ipc/reader.c
@@ -403,19 +403,22 @@ static int ArrowIpcArrayStreamReaderReadSchemaIfNeeded(
       &private_data->error);
 
   struct ArrowSchema tmp;
-  NANOARROW_RETURN_NOT_OK(
-      ArrowIpcDecoderDecodeSchema(&private_data->decoder, &tmp, 
&private_data->error));
+  struct ArrowIpcDictionaryEncodings dictionary_encodings;
+  NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeSchemaWithDictionaries(
+      &private_data->decoder, &tmp, &dictionary_encodings, 
&private_data->error));
 
   // Only support "read the whole thing" for now
   if (private_data->field_index != -1) {
     ArrowSchemaRelease(&tmp);
+    ArrowIpcDictionaryEncodingsReset(&dictionary_encodings);
     ArrowErrorSet(&private_data->error, "Field index != -1 is not yet 
supported");
     return ENOTSUP;
   }
 
   // Notify the decoder of the schema for forthcoming messages
-  int result =
-      ArrowIpcDecoderSetSchema(&private_data->decoder, &tmp, 
&private_data->error);
+  int result = ArrowIpcDecoderSetSchemaWithDictionaries(
+      &private_data->decoder, &tmp, &dictionary_encodings, 
&private_data->error);
+  ArrowIpcDictionaryEncodingsReset(&dictionary_encodings);
   if (result != NANOARROW_OK) {
     ArrowSchemaRelease(&tmp);
     return result;
diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h
index a1a172cb..ba94e5b5 100644
--- a/src/nanoarrow/nanoarrow_ipc.h
+++ b/src/nanoarrow/nanoarrow_ipc.h
@@ -49,6 +49,8 @@
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeHeader)
 #define ArrowIpcDecoderDecodeSchema \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeSchema)
+#define ArrowIpcDecoderDecodeSchemaWithDictionaries \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, 
ArrowIpcDecoderDecodeSchemaWithDictionaries)
 #define ArrowIpcDecoderDecodeArrayView \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayView)
 #define ArrowIpcDecoderDecodeArray \
@@ -57,6 +59,8 @@
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayFromShared)
 #define ArrowIpcDecoderSetSchema \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderSetSchema)
+#define ArrowIpcDecoderSetSchemaWithDictionaries \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, 
ArrowIpcDecoderSetSchemaWithDictionaries)
 #define ArrowIpcDecoderSetEndianness \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderSetEndianness)
 #define ArrowIpcDecoderPeekFooter \
@@ -105,6 +109,14 @@
 #define ArrowIpcFooterReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, 
ArrowIpcFooterReset)
 #define ArrowIpcEncoderEncodeFooter \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcEncoderEncodeFooter)
+#define ArrowIpcDictionaryEncodingsInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsInit)
+#define ArrowIpcDictionaryEncodingsAppend \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsAppend)
+#define ArrowIpcDictionaryEncodingsFind \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsFind)
+#define ArrowIpcDictionaryEncodingsReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsReset)
 
 #endif
 
@@ -156,6 +168,12 @@ enum ArrowIpcCompressionType {
   NANOARROW_IPC_COMPRESSION_TYPE_ZSTD
 };
 
+/// \brief Dictionary kind enumerator
+enum ArrowIpcDictionaryKind {
+  NANOARROW_IPC_DICTIONARY_KIND_UNINITIALIZED,
+  NANOARROW_IPC_DICTIONARY_KIND_DENSE_ARRAY
+};
+
 /// \brief Feature flag for a stream that uses dictionary replacement
 #define NANOARROW_IPC_FEATURE_DICTIONARY_REPLACEMENT 1
 
@@ -171,6 +189,60 @@ struct ArrowIpcDictionaryBatch {
   int is_delta;
 };
 
+/// \brief Description of a dictionary-encoded field
+///
+/// This struct is intended to be passed by value; however, its data is 
invalidated
+/// if the underlying ArrowSchema that contains the dictionary-encoded field is
+/// released.
+struct ArrowIpcDictionaryEncoding {
+  /// \brief A pointer to the ArrowSchema node of the dictionary-encoded field
+  ///
+  /// This is a reference into another object and care must be taken to ensure
+  /// that if that object is copied that the schema pointers are updated
+  /// appropriately.
+  const struct ArrowSchema* schema;
+
+  /// \brief The identifier that will appear in dictionary batch messages
+  int64_t id;
+
+  /// \brief The dictionary kind
+  ///
+  /// Currently only one dictionary kind is permitted by the Arrow 
specification
+  /// (DenseArray).
+  enum ArrowIpcDictionaryKind kind;
+};
+
+/// \brief List of ArrowIpcDictionaryEncoding structs
+///
+/// This structure provides a list of dictionary encoded fields extracted
+/// from an ArrowSchema during decoding. Its members refer to pointers
+/// within a specific schema, so care must be taken to keep the schema
+/// containing the pointed-to ArrowSchema fields valid.
+struct ArrowIpcDictionaryEncodings {
+  struct ArrowBuffer encodings;
+};
+
+/// \brief Initialize an ArrowIpcDictionaryEncodings list
+NANOARROW_DLL void ArrowIpcDictionaryEncodingsInit(
+    struct ArrowIpcDictionaryEncodings* dictionaries);
+
+/// \brief Append a given ArrowIpcDictionaryEncoding to this list
+NANOARROW_DLL ArrowErrorCode
+ArrowIpcDictionaryEncodingsAppend(struct ArrowIpcDictionaryEncodings* 
dictionaries,
+                                  struct ArrowIpcDictionaryEncoding encoding);
+
+/// \brief Resolve an ArrowIpcDictionaryEncoding for a given dictionary encoded 
field
+///
+/// Returns NULL if the pointed to schema does not match any of the pointed to
+/// schemas contained in this list.
+NANOARROW_DLL const struct ArrowIpcDictionaryEncoding* 
ArrowIpcDictionaryEncodingsFind(
+    const struct ArrowIpcDictionaryEncodings* dictionaries,
+    const struct ArrowSchema* schema);
+
+/// \brief Release an encodings list and associated resources
+NANOARROW_DLL void ArrowIpcDictionaryEncodingsReset(
+    struct ArrowIpcDictionaryEncodings* dictionaries);
+
 /// \brief Checks the nanoarrow runtime to make sure the run/build versions 
match
 NANOARROW_DLL ArrowErrorCode ArrowIpcCheckRuntime(struct ArrowError* error);
 
@@ -392,6 +464,8 @@ NANOARROW_DLL ArrowErrorCode 
ArrowIpcDecoderDecodeHeader(struct ArrowIpcDecoder*
 ///
 /// After a successful call to ArrowIpcDecoderDecodeHeader(), retrieve an 
ArrowSchema.
 /// The caller is responsible for releasing the schema if NANOARROW_OK is 
returned.
+/// This is equivalent to calling 
ArrowIpcDecoderDecodeSchemaWithDictionaries() with
+/// dictionaries_out = NULL.
 ///
 /// Returns EINVAL if the decoder did not just decode a schema message or
 /// NANOARROW_OK otherwise.
@@ -399,19 +473,52 @@ NANOARROW_DLL ArrowErrorCode 
ArrowIpcDecoderDecodeSchema(struct ArrowIpcDecoder*
                                                          struct ArrowSchema* 
out,
                                                          struct ArrowError* 
error);
 
+/// \brief Decode an ArrowSchema with dictionary encoding information
+///
+/// After a successful call to ArrowIpcDecoderDecodeHeader(), retrieve an 
ArrowSchema.
+/// The caller is responsible for releasing the schema if NANOARROW_OK is 
returned.
+/// Neither out nor dictionaries_out should be initialized; dictionaries_out 
may be
+/// null to omit exporting dictionary identifiers.
+///
+/// Returns EINVAL if the decoder did not just decode a schema message or
+/// NANOARROW_OK otherwise.
+NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeSchemaWithDictionaries(
+    struct ArrowIpcDecoder* decoder, struct ArrowSchema* out,
+    struct ArrowIpcDictionaryEncodings* dictionaries_out, struct ArrowError* 
error);
+
 /// \brief Set the ArrowSchema used to decode future record batch messages
 ///
 /// Prepares the decoder for future record batch messages
-/// of this type. The decoder takes ownership of schema if NANOARROW_OK is 
returned.
+/// of this type. The decoder does not take ownership of schema.
 /// Note that you must call this explicitly after decoding a
 /// Schema message (i.e., the decoder does not assume that the last-decoded
 /// schema message applies to future record batch messages).
 ///
+/// This is equivalent to calling ArrowIpcDecoderSetSchemaWithDictionaries() 
with
+/// dictionary_encodings = NULL.
+///
 /// Returns EINVAL if schema validation fails or NANOARROW_OK otherwise.
 NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderSetSchema(struct ArrowIpcDecoder* 
decoder,
-                                                      struct ArrowSchema* 
schema,
+                                                      const struct 
ArrowSchema* schema,
                                                       struct ArrowError* 
error);
 
+/// \brief Set the ArrowSchema and dictionary encodings used to decode future 
record batch
+/// messages
+///
+/// Prepares the decoder for future record batch messages
+/// of this type. The decoder does not take ownership of schema.
+/// Note that you must call this explicitly after decoding a
+/// Schema message (i.e., the decoder does not assume that the last-decoded
+/// schema message applies to future record batch messages).
+///
+/// Returns EINVAL if schema validation fails or if the schema contains
+/// dictionary encodings that could not be resolved in the provided
+/// ArrowIpcDictionaryEncodings object, or NANOARROW_OK otherwise.
+NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderSetSchemaWithDictionaries(
+    struct ArrowIpcDecoder* decoder, const struct ArrowSchema* schema,
+    const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+    struct ArrowError* error);
+
 /// \brief Set the endianness used to decode future record batch messages
 ///
 /// Prepares the decoder for future record batch messages with the specified
@@ -708,8 +815,10 @@ struct ArrowIpcFileBlock {
 /// This structure is intended to be allocated by the caller, initialized using
 /// ArrowIpcFooterInit(), and released with ArrowIpcFooterReset().
 struct ArrowIpcFooter {
-  /// \brief the Footer's embedded Schema
+  /// \brief The Footer's embedded Schema
   struct ArrowSchema schema;
+  /// \brief Dictionaries present in the footer Schema
+  struct ArrowIpcDictionaryEncodings dictionaries;
   /// \brief all blocks containing RecordBatch Messages
   struct ArrowBuffer record_batch_blocks;
 };
diff --git a/src/nanoarrow/nanoarrow_ipc.hpp b/src/nanoarrow/nanoarrow_ipc.hpp
index a9fadf1f..84f1d90b 100644
--- a/src/nanoarrow/nanoarrow_ipc.hpp
+++ b/src/nanoarrow/nanoarrow_ipc.hpp
@@ -57,6 +57,23 @@ inline void release_pointer(struct ArrowIpcDecoder* data) {
   ArrowIpcDecoderReset(data);
 }
 
+template <>
+inline void init_pointer(struct ArrowIpcDictionaryEncodings* data) {
+  ArrowIpcDictionaryEncodingsInit(data);
+}
+
+template <>
+inline void move_pointer(struct ArrowIpcDictionaryEncodings* src,
+                         struct ArrowIpcDictionaryEncodings* dst) {
+  memcpy(dst, src, sizeof(struct ArrowIpcDictionaryEncodings));
+  ArrowIpcDictionaryEncodingsInit(src);
+}
+
+template <>
+inline void release_pointer(struct ArrowIpcDictionaryEncodings* data) {
+  ArrowIpcDictionaryEncodingsReset(data);
+}
+
 template <>
 inline void init_pointer(struct ArrowIpcFooter* data) {
   ArrowIpcFooterInit(data);
@@ -66,6 +83,7 @@ template <>
 inline void move_pointer(struct ArrowIpcFooter* src, struct ArrowIpcFooter* 
dst) {
   ArrowSchemaMove(&src->schema, &dst->schema);
   ArrowBufferMove(&src->record_batch_blocks, &dst->record_batch_blocks);
+  move_pointer(&src->dictionaries, &dst->dictionaries);
 }
 
 template <>
@@ -185,6 +203,9 @@ using UniqueDecoder = internal::Unique<struct 
ArrowIpcDecoder>;
 /// \brief Class wrapping a unique struct ArrowIpcFooter
 using UniqueFooter = internal::Unique<struct ArrowIpcFooter>;
 
+/// \brief Class wrapping a unique struct ArrowIpcDictionaryEncodings
+using UniqueDictionaryEncodings = internal::Unique<struct 
ArrowIpcDictionaryEncodings>;
+
 /// \brief Class wrapping a unique struct ArrowIpcEncoder
 using UniqueEncoder = internal::Unique<struct ArrowIpcEncoder>;
 


Reply via email to