Ok I think I got a working version now:
t = ibis.table([("a", "int64"), ("b", "int64")], name="table0")
test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
result = self.compiler.compile(t)

def table_provider(names):
    if not names:
        raise Exception("No names provided")
    elif names[0] == 'table0':
        return test_table_0
    else:
        raise Exception(f"Unknown table name {names}")

reader = pa.substrait.run_query(
    pa.py_buffer(result.SerializeToString()), table_provider)
result_table = reader.read_all()
self.assertTrue(result_table == test_table_0)
First successful run with ibis/substrait/acero - Hooray
On Wed, Oct 5, 2022 at 2:33 PM Li Jin <[email protected]> wrote:
> Hmm. Thanks for the update - now that I have searched the code more, it
> seems I should perhaps be using "compile" rather than "translate":
>
>
> https://github.com/ibis-project/ibis-substrait/blob/main/ibis_substrait/compiler/core.py#L82
>
> Let me try some more
>
> On Wed, Oct 5, 2022 at 1:42 PM Will Jones <[email protected]> wrote:
>
>> Hi Li Jin,
>>
>> The original segfault seems to occur because you are passing a Python
>> bytes
>> object and not a PyArrow Buffer object. You can wrap the bytes object
>> using
>> pa.py_buffer():
>>
>> pa.substrait.run_query(pa.py_buffer(result_bytes), table_provider)
>>
>>
>> That being said, when I run your full example with that, we now get a
>> different error similar to what you get when you pass in through JSON:
>>
>> Traceback (most recent call last):
>> File "<stdin>", line 1, in <module>
>> File "pyarrow/_substrait.pyx", line 140, in pyarrow._substrait.run_query
>> c_reader = GetResultValue(c_res_reader)
>> File "pyarrow/error.pxi", line 144, in
>> pyarrow.lib.pyarrow_internal_check_status
>> return check_status(status)
>> File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
>> raise ArrowInvalid(message)
>> pyarrow.lib.ArrowInvalid: ExecPlan has no node
>>
>> /Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:82
>> plan_->Validate()
>>
>> /Users/willjones/Documents/arrows/arrow/cpp/src/arrow/engine/substrait/util.cc:131
>> executor.Execute()
>>
>>
>> We get the same error even if I add operations onto the plan:
>>
>> result = translate(t.group_by("a").mutate(z = t.b.sum()), compiler)
>> print(result)
>>
>>
>> project {
>> input {
>> read {
>> base_schema {
>> names: "a"
>> names: "b"
>> struct {
>> types {
>> i64 {
>> nullability: NULLABILITY_NULLABLE
>> }
>> }
>> types {
>> i64 {
>> nullability: NULLABILITY_NULLABLE
>> }
>> }
>> nullability: NULLABILITY_REQUIRED
>> }
>> }
>> named_table {
>> names: "table0"
>> }
>> }
>> }
>> expressions {
>> selection {
>> direct_reference {
>> struct_field {
>> }
>> }
>> root_reference {
>> }
>> }
>> }
>> expressions {
>> selection {
>> direct_reference {
>> struct_field {
>> field: 1
>> }
>> }
>> root_reference {
>> }
>> }
>> }
>> expressions {
>> window_function {
>> function_reference: 1
>> partitions {
>> selection {
>> direct_reference {
>> struct_field {
>> }
>> }
>> root_reference {
>> }
>> }
>> }
>> upper_bound {
>> unbounded {
>> }
>> }
>> lower_bound {
>> unbounded {
>> }
>> }
>> phase: AGGREGATION_PHASE_INITIAL_TO_RESULT
>> output_type {
>> i64 {
>> nullability: NULLABILITY_NULLABLE
>> }
>> }
>> arguments {
>> value {
>> selection {
>> direct_reference {
>> struct_field {
>> field: 1
>> }
>> }
>> root_reference {
>> }
>> }
>> }
>> }
>> }
>> }
>> }
>>
>>
>> Full reproduction:
>>
>> import pyarrow as pa
>> import pyarrow.substrait
>> import ibis
>> from ibis_substrait.compiler.core import SubstraitCompiler
>> from ibis_substrait.compiler.translate import translate
>>
>>
>> compiler = SubstraitCompiler()
>>
>>
>> t = ibis.table([("a", "int64"), ("b", "int64")], name="table0")
>> result = translate(t.group_by("a").mutate(z = t.b.sum()), compiler)
>>
>> def table_provider(names):
>> if not names:
>> raise Exception("No names provided")
>> elif names[0] == 'table0':
>> return test_table_0
>> else:
>> raise Exception(f"Unknown table name {names}")
>>
>>
>> test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
>>
>> result_bytes = result.SerializeToString()
>>
>> pa.substrait.run_query(pa.py_buffer(result_bytes), table_provider)
>>
>> Best,
>>
>> Will Jones
>>
>> On Tue, Oct 4, 2022 at 12:30 PM Li Jin <[email protected]> wrote:
>>
>> > For reference, this is the "relations" entry that I was referring to:
>> >
>> >
>> https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_substrait.py#L186
>> >
>> > On Tue, Oct 4, 2022 at 3:28 PM Li Jin <[email protected]> wrote:
>> >
>> > > So I made some progress with updated code:
>> > >
>> > > t = ibis.table([("a", "int64"), ("b", "int64")],
>> name="table0")
>> > >
>> > > test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4,
>> 5,
>> > > 6]})
>> > >
>> > >
>> > >
>> > > result = translate(t, self.compiler)
>> > >
>> > >
>> > >
>> > > def table_provider(names):
>> > >
>> > > if not names:
>> > >
>> > > raise Exception("No names provided")
>> > >
>> > > elif names[0] == 'table0':
>> > >
>> > > return test_table_0
>> > >
>> > > else:
>> > >
>> > > raise Exception(f"Unknown table name {names}")
>> > >
>> > >
>> > >
>> > > print(result)
>> > >
>> > > result_buf =
>> > > pa._substrait._parse_json_plan(tobytes(MessageToJson(result)))
>> > >
>> > >
>> > >
>> > > pa.substrait.run_query(result_buf, table_provider)
>> > >
>> > > I think now the plan is passed properly and I got an "ArrowInvalid:
>> > > Empty substrait plan is passed" error.
>> > >
>> > >
>> > > Looking at the plan produced by ibis-substrait, it looks like it
>> > > doesn't match the expected format of the Acero consumer. In particular,
>> > > it looks like the plan produced by ibis-substrait doesn't have a
>> > > "relations" entry - any thoughts on how this can be fixed? (I don't
>> > > know if I am using the API wrong or if there is some format
>> > > inconsistency between the two.)
>> > >
>> > > On Tue, Oct 4, 2022 at 1:54 PM Li Jin <[email protected]> wrote:
>> > >
>> > >> Hi,
>> > >>
>> > >> I am testing integration between ibis-substrait and Acero but hit a
>> > >> segmentation fault. I think this might be because the way I am
>> > >> integrating these two libraries is wrong; here is my code:
>> > >>
>> > >> Li Jin
>> > >> 1:51 PM (1 minute ago)
>> > >> to me
>> > >>
>> > >> class BasicTests(unittest.TestCase):
>> > >>
>> > >> """Test basic features"""
>> > >>
>> > >>
>> > >>
>> > >>
>> > >>
>> > >> @classmethod
>> > >>
>> > >> def setUpClass(cls):
>> > >>
>> > >> cls.compiler = SubstraitCompiler()
>> > >>
>> > >>
>> > >>
>> > >> def test_named_table(self):
>> > >>
>> > >> """Test basic"""
>> > >>
>> > >> t = ibis.table([("a", "int64"), ("b", "int64")],
>> name="table0")
>> > >>
>> > >> result = translate(t, self.compiler)
>> > >>
>> > >>
>> > >>
>> > >> def table_provider(names):
>> > >>
>> > >> if not names:
>> > >>
>> > >> raise Exception("No names provided")
>> > >>
>> > >> elif names[0] == 'table0':
>> > >>
>> > >> return test_table_0
>> > >>
>> > >> else:
>> > >>
>> > >> raise Exception(f"Unknown table name {names}")
>> > >>
>> > >>
>> > >>
>> > >> test_table_0 = pa.Table.from_pydict({"a": [1, 2, 3], "b":
>> [4, 5,
>> > >> 6]})
>> > >>
>> > >>
>> > >>
>> > >> print(type(result))
>> > >>
>> > >> print(result)
>> > >>
>> > >> result_bytes = result.SerializeToString()
>> > >>
>> > >>
>> > >>
>> > >> pa.substrait.run_query(result_bytes, table_provider)
>> > >>
>> > >>
>> > >> I wonder if someone has tried integration between these two before
>> > >> and can share some working code?
>> > >>
>> > >
>> >
>>
>