Hi all,

I suspect that I've run into a bug (or two).

On Cassandra 4.1.1, when `cdc_enabled` in the cassandra.yaml file is set to `false` on at least one node in the cluster, and the `ALTER TABLE ... WITH cdc=...` statement is then run against that node, the cluster ends up in a schema disagreement state. At this stage, a rolling restart will bring the schema back in sync, but the changes made to the `cdc` table property will be lost.

On Cassandra 4.1.6, the same procedure doesn't cause any visible schema disagreement in the output of the `nodetool describecluster` command, but the `ALTER TABLE` statement has only a cosmetic effect on the node it is run on. The node with `cdc_enabled` set to `false` will show that the `cdc` table property has changed, but this does not affect its behaviour in any way. At the same time, the other nodes do not see that table property change at all. This is perhaps even worse than on 4.1.1, because the `ALTER TABLE` statement is failing silently.

A shell script for reproducing the behaviours described above, and its output on both 4.1.1 and 4.1.6, are attached.

(as a good security practice, please always read and understand the full script you downloaded from untrusted sources before attempting to run it)

So, are these bugs? Or is this some kind of behaviour that is documented, but whose documentation I failed to find?

Cheers,
Bowen
#!/bin/sh
set -eu

# pick a version: pass it as the first argument to this script (for example
# 4.1.1); when no argument is given, the default below is used
# ver="4.1.1"
ver="${1:-4.1.6}"

# a helper function to wait for the DB container to become ready
#
# Arguments: $1 - name of the Docker container running the Cassandra node
# Polls once per second until `nodetool netstats` reports "Mode: NORMAL" and
# a trivial CQL query against system.local succeeds.
wait_ready() {
    # wait for the node to fully start and join the cluster
    until [ "$(docker exec "$1" nodetool netstats 2>/dev/null |
        grep '^Mode:' | cut -d' ' -f2)" = 'NORMAL' ]; do
        sleep 1
    done

    # wait for the DB to be ready for queries
    # NOTE: both redirections must stay on the cqlsh command itself. If
    # `2>&1` ends up on a line of its own it becomes a separate (always
    # successful) command, and the loop stops after a single attempt.
    until docker exec "$1" cqlsh -e "select key from system.local;" \
        >/dev/null 2>&1; do
        sleep 1
    done
}

# Print the schema versions reported by `nodetool describecluster` (run on
# c1), followed by the lines mentioning `cdc` in the `desc table ks1.tbl1;`
# output of each of the nodes c1, c2 and c3.
print_schema() {
    echo "nodetool describecluster:"
    # keep the section from "Schema versions:" up to "Stats for all nodes:",
    # then drop that trailing header line; `sed '$d'` is used instead of
    # `head -n-1` because negative line counts are a GNU extension
    docker exec c1 nodetool describecluster |
        sed -n '/Schema versions:/,/Stats for all nodes:/p' |
        sed '$d'

    echo "table cdc property on each node:"
    for node in c1 c2 c3; do
        echo "node: $node"
        docker exec "$node" cqlsh -e "desc table ks1.tbl1;" | grep cdc
    done
}

# Main reproduction flow: start a 3-node cluster, create a keyspace and a
# table, enable cdc_enabled on c3 only, run ALTER TABLE ... WITH cdc=true on
# c1, and print the schema state before the ALTER, after it, and after a
# rolling restart. Containers and the network are removed at the end.
echo "version: $ver"

docker network create cassnet >/dev/null
for node in c1 c2 c3; do
    echo "starting node: $node"
    # all options must be part of one `docker run` command line
    docker run --network cassnet --name "$node" \
        -e MAX_HEAP_SIZE=1G -e HEAP_NEWSIZE=200M -e CASSANDRA_SEEDS=c1 \
        -d "cassandra:$ver" >/dev/null
    wait_ready "$node"
done

echo "creating keyspace and table"
# create tables
docker exec c1 cqlsh -e "create keyspace ks1 WITH replication = {'class': 'NetworkTopologyStrategy', 'datacenter1': '3'};"
docker exec c1 cqlsh -e "create table ks1.tbl1 (id int primary key);"

# print the schema, they should be consistent
print_schema

# enable cdc on one of the nodes
echo "enabling cdc on one node"
docker exec c3 sed -i 's/^cdc_enabled:.*$/cdc_enabled: true/' \
    /etc/cassandra/cassandra.yaml
docker restart c3 >/dev/null
wait_ready c3

# enable cdc on the table
# on 4.1.1: will show timeout and schema disagreement warnings
# on 4.1.6: no error
# (`|| :` keeps `set -e` from aborting the script when cqlsh reports the
# timeout, so the schema state can still be printed afterwards)
echo "alter table"
docker exec c1 cqlsh -e "alter table ks1.tbl1 WITH cdc=true;" || :

# print the schema, the disagreement/inconsistency will show up
print_schema

# rolling restart
echo "rolling restart"
for node in c1 c2 c3; do
    echo "node: $node"
    docker restart "$node" >/dev/null
    wait_ready "$node"
done

# print the schema again, the disagreement/inconsistency is gone, but the
# effect of the ALTER TABLE statement is also voided
print_schema

# clean up
echo "clean up"
docker stop c1 c2 c3 >/dev/null
docker rm c1 c2 c3 >/dev/null
docker network rm cassnet >/dev/null
version: 4.1.1
starting node: c1
starting node: c2
starting node: c3
creating keyspace and table
nodetool describecluster:
        Schema versions:
                32699367-6dda-3c41-b63e-26f770adb14d: [172.19.0.3, 172.19.0.2, 
172.19.0.4]

table cdc property on each node:
node: c1
    AND cdc = false
node: c2
    AND cdc = false
node: c3
    AND cdc = false
enabling cdc on one node
alter table
<stdin>:1:OperationTimedOut: errors={'Connection defunct by heartbeat': 'Client 
request timeout. See Session.execute[_async](timeout)'}, 
last_host=127.0.0.1:9042
<stdin>:1:Warning: schema version mismatch detected; check the schema versions 
of your nodes in system.local and system.peers.
nodetool describecluster:
        Schema versions:
                53789740-2180-3161-a22d-fb2c6a4d32a7: [172.19.0.2]

                32699367-6dda-3c41-b63e-26f770adb14d: [172.19.0.3, 172.19.0.4]

table cdc property on each node:
node: c1
    AND cdc = true
node: c2
    AND cdc = false
node: c3
    AND cdc = false
rolling restart
node: c1
node: c2
node: c3
nodetool describecluster:
        Schema versions:
                53789740-2180-3161-a22d-fb2c6a4d32a7: [172.19.0.3, 172.19.0.2, 
172.19.0.4]

table cdc property on each node:
node: c1
    AND cdc = false
node: c2
    AND cdc = false
node: c3
    AND cdc = false
cleanup
version: 4.1.6
starting node: c1
starting node: c2
starting node: c3
creating keyspace and table
nodetool describecluster:
        Schema versions:
                b595fb53-19d3-3d45-9856-0ac042fdd05b: [172.19.0.3, 172.19.0.2, 
172.19.0.4]

table cdc property on each node:
node: c1
    AND cdc = false
node: c2
    AND cdc = false
node: c3
    AND cdc = false
enabling cdc on one node
alter table
nodetool describecluster:
        Schema versions:
                8691b0f6-965e-31d4-89cf-b10d61a38215: [172.19.0.3, 172.19.0.2, 
172.19.0.4]

table cdc property on each node:
node: c1
    AND cdc = true
node: c2
    AND cdc = false
node: c3
    AND cdc = false
rolling restart
node: c1
node: c2
node: c3
nodetool describecluster:
        Schema versions:
                8691b0f6-965e-31d4-89cf-b10d61a38215: [172.19.0.3, 172.19.0.2, 
172.19.0.4]

table cdc property on each node:
node: c1
    AND cdc = false
node: c2
    AND cdc = false
node: c3
    AND cdc = false
clean up

Reply via email to