This is an automated email from the ASF dual-hosted git repository. jli pushed a commit to branch fix-examples-not-loading in repository https://gitbox.apache.org/repos/asf/superset.git
commit 2a9292e4adad8c51ab3f19a4617969efe40e3205 Author: Joe Li <[email protected]> AuthorDate: Thu Jan 29 14:39:53 2026 -0800 fix(examples): add schema to _find_dataset lookup to prevent cross-schema collisions Adds schema parameter to _find_dataset() fallback lookup so that two datasets with the same table_name in different schemas don't collide during UUID backfill. Adds test to verify schema-based lookup distinguishes same-name tables. Co-Authored-By: Claude Opus 4.5 <[email protected]> --- superset/examples/generic_loader.py | 10 ++++-- tests/unit_tests/examples/generic_loader_test.py | 42 ++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/superset/examples/generic_loader.py b/superset/examples/generic_loader.py index dea2432998a..9b6df23f742 100644 --- a/superset/examples/generic_loader.py +++ b/superset/examples/generic_loader.py @@ -38,15 +38,19 @@ def _find_dataset( table_name: str, database_id: int, uuid: Optional[str] = None, + schema: Optional[str] = None, ) -> tuple[Optional[SqlaTable], bool]: """Find a dataset by UUID first, then fall back to table_name + database_id. + Includes schema in the fallback lookup to prevent cross-schema collisions. + This avoids unique constraint violations when a duplicate row exists. Args: table_name: The table name to look up database_id: The database ID uuid: Optional UUID to look up first + schema: Optional schema to include in fallback lookup Returns: A tuple of (dataset or None, found_by_uuid bool) @@ -62,7 +66,7 @@ def _find_dataset( if not tbl: tbl = ( db.session.query(SqlaTable) - .filter_by(table_name=table_name, database_id=database_id) + .filter_by(table_name=table_name, database_id=database_id, schema=schema) .first() ) @@ -127,7 +131,7 @@ def load_parquet_table( # noqa: C901 table_exists = database.has_table(Table(table_name, schema=schema)) if table_exists and not force: logger.info("Table %s already exists, skipping data load", table_name) - tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid) + tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid, schema) if tbl: # Backfill UUID if found by table_name (not UUID) and UUID not set if uuid and not tbl.uuid and not found_by_uuid: @@ -202,7 +206,7 @@ def load_parquet_table( # noqa: C901 logger.info("Loaded %d rows into %s", len(pdf), table_name) # Create or update SqlaTable metadata using UUID-first lookup - tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid) + tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid, schema) if not tbl: tbl = SqlaTable(table_name=table_name, database_id=database.id) diff --git a/tests/unit_tests/examples/generic_loader_test.py b/tests/unit_tests/examples/generic_loader_test.py index 74c3ddaa6db..c50b5781493 100644 --- a/tests/unit_tests/examples/generic_loader_test.py +++ b/tests/unit_tests/examples/generic_loader_test.py @@ -307,7 +307,7 @@ def test_find_dataset_returns_uuid_match_first(mock_db: MagicMock) -> None: uuid_row ) - result, found_by_uuid = _find_dataset("test_table", 1, "target-uuid") + result, found_by_uuid = _find_dataset("test_table", 1, "target-uuid", "public") assert result is uuid_row assert found_by_uuid is True @@ -333,7 +333,7 @@ def test_find_dataset_falls_back_to_table_name(mock_db: MagicMock) -> None: mock_db.session.query.return_value.filter_by.side_effect = filter_by_side_effect - result, found_by_uuid = _find_dataset("test_table", 1, "nonexistent-uuid") + result, found_by_uuid = _find_dataset("test_table", 1, "nonexistent-uuid", "public") assert result is tablename_row assert found_by_uuid is False @@ -392,3 +392,41 @@ def test_load_parquet_table_duplicate_rows_table_missing( # Should return the UUID row, not try to backfill (which would collide) assert result is uuid_row assert result.uuid == "target-uuid" + + +@patch("superset.examples.generic_loader.db") +def test_find_dataset_distinguishes_schemas(mock_db: MagicMock) -> None: + """Test that _find_dataset uses schema to distinguish same-name tables. + + Scenario: + - Row A: table_name="users", schema="schema_a", uuid=None + - Row B: table_name="users", schema="schema_b", uuid=None + + Looking up "users" in "schema_b" should find Row B, not Row A. + """ + from superset.examples.generic_loader import _find_dataset + + # Row in schema_b (should be found) + schema_b_row = MagicMock() + schema_b_row.uuid = None + schema_b_row.table_name = "users" + schema_b_row.schema = "schema_b" + + # No UUID lookup (uuid not provided), table_name lookup returns schema_b row + def filter_by_side_effect(**kwargs): + mock_result = MagicMock() + if "uuid" in kwargs: + mock_result.first.return_value = None + elif kwargs.get("schema") == "schema_b": + mock_result.first.return_value = schema_b_row + else: + mock_result.first.return_value = None # schema_a not requested + return mock_result + + mock_db.session.query.return_value.filter_by.side_effect = filter_by_side_effect + + result, found_by_uuid = _find_dataset("users", 1, None, "schema_b") + + assert result is schema_b_row + assert found_by_uuid is False + assert result.schema == "schema_b"
