I can confirm that fixes that issue in the simple case. But if result is
either of these, we get an error:
result = compiler.compile(t.select("b"))
# leads to:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "pyarrow/_substrait.pyx", line 140, in pyarrow._substrait.run_query
c_reader = GetResultValue(c_res_reader)
File "pyarrow/error.pxi", line 144, in
pyarrow.lib.pyarrow_internal_check_status
return check_status(status)
File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
raise ArrowInvalid(message)
pyarrow.lib.ArrowInvalid: ConsumingSinkNode with mismatched number of names
/Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:84
plan_->StartProducing()
/Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:131
executor.Execute()
result = compiler.compile(t.group_by("a").mutate(z = t.b.sum()))
# leads to:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "pyarrow/_substrait.pyx", line 140, in pyarrow._substrait.run_query
c_reader = GetResultValue(c_res_reader)
File "pyarrow/error.pxi", line 144, in
pyarrow.lib.pyarrow_internal_check_status
return check_status(status)
File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
raise ArrowInvalid(message)
pyarrow.lib.ArrowInvalid: Invalid column index to add field.
/Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/relation_internal.cc:338
project_schema->AddField( num_columns +
static_cast<int>(project.expressions().size()) - 1,
std::move(project_field))
/Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/serde.cc:156
FromProto(plan_rel.has_root() ? plan_rel.root().input() : plan_rel.rel(),
ext_set, conversion_options)
/Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:106
engine::DeserializePlans(substrait_buffer, consumer_factory, registry,
nullptr, conversion_options_)
/Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:130
executor.Init(substrait_buffer, registry)
On Wed, Oct 5, 2022 at 11:58 AM Li Jin <[email protected]> wrote:
> Ok I think I got a working version now:
>
> t = ibis.table([("a", "int64"), ("b", "int64")], name="table0")
>
> test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5,
> 6]})
>
>
>
> result = self.compiler.compile(t)
>
>
>
> def table_provider(names):
>
> if not names:
>
> raise Exception("No names provided")
>
> elif names[0] == 'table0':
>
> return test_table_0
>
> else:
>
> raise Exception(f"Unknown table name {names}")
>
>
>
> reader =
> pa.substrait.run_query(pa.py_buffer(result.SerializeToString()),
> table_provider)
>
> result_table = reader.read_all()
>
>
>
> self.assertTrue(result_table == test_table_0)
>
> First successful run with ibis/substrait/acero - Hooray
>
> On Wed, Oct 5, 2022 at 2:33 PM Li Jin <[email protected]> wrote:
>
> > Hmm. Thanks for the update - Now that I have searched the code more, it seems
> > perhaps I should be using "compile" rather than "translate";
> >
> >
> >
> https://github.com/ibis-project/ibis-substrait/blob/main/ibis_substrait/compiler/core.py#L82
> >
> > Let me try some more
> >
> > On Wed, Oct 5, 2022 at 1:42 PM Will Jones <[email protected]>
> wrote:
> >
> >> Hi Li Jin,
> >>
> >> The original segfault seems to occur because you are passing a Python
> >> bytes
> >> object and not a PyArrow Buffer object. You can wrap the bytes object
> >> using
> >> pa.py_buffer():
> >>
> >> pa.substrait.run_query(pa.py_buffer(result_bytes), table_provider)
> >>
> >>
> >> That being said, when I run your full example with that, we now get a
> >> different error similar to what you get when you pass it in through JSON:
> >>
> >> Traceback (most recent call last):
> >> File "<stdin>", line 1, in <module>
> >> File "pyarrow/_substrait.pyx", line 140, in
> pyarrow._substrait.run_query
> >> c_reader = GetResultValue(c_res_reader)
> >> File "pyarrow/error.pxi", line 144, in
> >> pyarrow.lib.pyarrow_internal_check_status
> >> return check_status(status)
> >> File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
> >> raise ArrowInvalid(message)
> >> pyarrow.lib.ArrowInvalid: ExecPlan has no node
> >>
> >>
> /Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:82
> >> plan_->Validate()
> >>
> >>
> /Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:131
> >> executor.Execute()
> >>
> >>
> >> We get the same error even if I add operations onto the plan:
> >>
> >> result = translate(t.group_by("a").mutate(z = t.b.sum()), compiler)
> >> print(result)
> >>
> >>
> >> project {
> >> input {
> >> read {
> >> base_schema {
> >> names: "a"
> >> names: "b"
> >> struct {
> >> types {
> >> i64 {
> >> nullability: NULLABILITY_NULLABLE
> >> }
> >> }
> >> types {
> >> i64 {
> >> nullability: NULLABILITY_NULLABLE
> >> }
> >> }
> >> nullability: NULLABILITY_REQUIRED
> >> }
> >> }
> >> named_table {
> >> names: "table0"
> >> }
> >> }
> >> }
> >> expressions {
> >> selection {
> >> direct_reference {
> >> struct_field {
> >> }
> >> }
> >> root_reference {
> >> }
> >> }
> >> }
> >> expressions {
> >> selection {
> >> direct_reference {
> >> struct_field {
> >> field: 1
> >> }
> >> }
> >> root_reference {
> >> }
> >> }
> >> }
> >> expressions {
> >> window_function {
> >> function_reference: 1
> >> partitions {
> >> selection {
> >> direct_reference {
> >> struct_field {
> >> }
> >> }
> >> root_reference {
> >> }
> >> }
> >> }
> >> upper_bound {
> >> unbounded {
> >> }
> >> }
> >> lower_bound {
> >> unbounded {
> >> }
> >> }
> >> phase: AGGREGATION_PHASE_INITIAL_TO_RESULT
> >> output_type {
> >> i64 {
> >> nullability: NULLABILITY_NULLABLE
> >> }
> >> }
> >> arguments {
> >> value {
> >> selection {
> >> direct_reference {
> >> struct_field {
> >> field: 1
> >> }
> >> }
> >> root_reference {
> >> }
> >> }
> >> }
> >> }
> >> }
> >> }
> >> }
> >>
> >>
> >> Full reproduction:
> >>
> >> import pyarrow as pa
> >> import pyarrow.substrait
> >> import ibis
> >> from ibis_substrait.compiler.core import SubstraitCompiler
> >> from ibis_substrait.compiler.translate import translate
> >>
> >>
> >> compiler = SubstraitCompiler()
> >>
> >>
> >> t = ibis.table([("a", "int64"), ("b", "int64")], name="table0")
> >> result = translate(t.group_by("a").mutate(z = t.b.sum()), compiler)
> >>
> >> def table_provider(names):
> >> if not names:
> >> raise Exception("No names provided")
> >> elif names[0] == 'table0':
> >> return test_table_0
> >> else:
> >> raise Exception(f"Unknown table name {names}")
> >>
> >>
> >> test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
> >>
> >> result_bytes = result.SerializeToString()
> >>
> >> pa.substrait.run_query(pa.py_buffer(result_bytes), table_provider)
> >>
> >> Best,
> >>
> >> Will Jones
> >>
> >> On Tue, Oct 4, 2022 at 12:30 PM Li Jin <[email protected]> wrote:
> >>
> >> > For reference, this is the "relations" entry that I was referring to:
> >> >
> >> >
> >>
> https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_substrait.py#L186
> >> >
> >> > On Tue, Oct 4, 2022 at 3:28 PM Li Jin <[email protected]> wrote:
> >> >
> >> > > So I made some progress with updated code:
> >> > >
> >> > > t = ibis.table([("a", "int64"), ("b", "int64")],
> >> name="table0")
> >> > >
> >> > > test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b":
> [4,
> >> 5,
> >> > > 6]})
> >> > >
> >> > >
> >> > >
> >> > > result = translate(t, self.compiler)
> >> > >
> >> > >
> >> > >
> >> > > def table_provider(names):
> >> > >
> >> > > if not names:
> >> > >
> >> > > raise Exception("No names provided")
> >> > >
> >> > > elif names[0] == 'table0':
> >> > >
> >> > > return test_table_0
> >> > >
> >> > > else:
> >> > >
> >> > > raise Exception(f"Unknown table name {names}")
> >> > >
> >> > >
> >> > >
> >> > > print(result)
> >> > >
> >> > > result_buf =
> >> > > pa._substrait._parse_json_plan(tobytes(MessageToJson(result)))
> >> > >
> >> > >
> >> > >
> >> > > pa.substrait.run_query(result_buf, table_provider)
> >> > >
> >> > > I think now the plan is passed properly and I got an "ArrowInvalid:
> >> Empty
> >> > > substrait plan is passed"
> >> > >
> >> > >
> >> > > Looking at the plan produced by ibis-substrait, it looks like it doesn't
> >> > match
> >> > > the expected format of the Acero consumer. In particular, it looks like
> >> the
> >> > > plan produced by ibis-substrait doesn't have a "relations" entry -
> any
> >> > > thoughts on how this can be fixed? (I don't know if I am using the
> API
> >> > > wrong or if there is some format inconsistency between the two)
> >> > >
> >> > > On Tue, Oct 4, 2022 at 1:54 PM Li Jin <[email protected]>
> wrote:
> >> > >
> >> > >> Hi,
> >> > >>
> >> > >> I am testing integration between ibis-substrait and Acero but hit a
> >> > >> segmentation fault. I think this might be because the way I am
> >> > >> integrating these two libraries is wrong; here is my code:
> >> > >>
> >> > >> Li Jin
> >> > >> 1:51 PM (1 minute ago)
> >> > >> to me
> >> > >>
> >> > >> class BasicTests(unittest.TestCase):
> >> > >>
> >> > >> """Test basic features"""
> >> > >>
> >> > >>
> >> > >>
> >> > >>
> >> > >>
> >> > >> @classmethod
> >> > >>
> >> > >> def setUpClass(cls):
> >> > >>
> >> > >> cls.compiler = SubstraitCompiler()
> >> > >>
> >> > >>
> >> > >>
> >> > >> def test_named_table(self):
> >> > >>
> >> > >> """Test basic"""
> >> > >>
> >> > >> t = ibis.table([("a", "int64"), ("b", "int64")],
> >> name="table0")
> >> > >>
> >> > >> result = translate(t, self.compiler)
> >> > >>
> >> > >>
> >> > >>
> >> > >> def table_provider(names):
> >> > >>
> >> > >> if not names:
> >> > >>
> >> > >> raise Exception("No names provided")
> >> > >>
> >> > >> elif names[0] == 'table0':
> >> > >>
> >> > >> return test_table_0
> >> > >>
> >> > >> else:
> >> > >>
> >> > >> raise Exception(f"Unknown table name {names}")
> >> > >>
> >> > >>
> >> > >>
> >> > >> test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b":
> >> [4, 5,
> >> > >> 6]})
> >> > >>
> >> > >>
> >> > >>
> >> > >> print(type(result))
> >> > >>
> >> > >> print(result)
> >> > >>
> >> > >> result_bytes = result.SerializeToString()
> >> > >>
> >> > >>
> >> > >>
> >> > >> pa.substrait.run_query(result_bytes, table_provider)
> >> > >>
> >> > >>
> >> > >> I wonder if someone has tried integration between these two before
> >> and
> >> > >> can share some working code?
> >> > >>
> >> > >
> >> >
> >>
> >
>