(superset) 03/05: fix(examples): add schema to _find_dataset lookup to prevent cross-schema collisions

jli Thu, 29 Jan 2026 17:08:11 -0800

This is an automated email from the ASF dual-hosted git repository.

jli pushed a commit to branch fix-examples-not-loading
in repository https://gitbox.apache.org/repos/asf/superset.git


commit 2a9292e4adad8c51ab3f19a4617969efe40e3205
Author: Joe Li <[email protected]>
AuthorDate: Thu Jan 29 14:39:53 2026 -0800

    fix(examples): add schema to _find_dataset lookup to prevent cross-schema collisions
    
    Adds schema parameter to _find_dataset() fallback lookup so that two datasets
    with the same table_name in different schemas don't collide during UUID backfill.
    
    Adds test to verify schema-based lookup distinguishes same-name tables.
    
    Co-Authored-By: Claude Opus 4.5 <[email protected]>
---
 superset/examples/generic_loader.py              | 10 ++++--
 tests/unit_tests/examples/generic_loader_test.py | 42 ++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/superset/examples/generic_loader.py b/superset/examples/generic_loader.py
index dea2432998a..9b6df23f742 100644
--- a/superset/examples/generic_loader.py
+++ b/superset/examples/generic_loader.py
@@ -38,15 +38,19 @@ def _find_dataset(
     table_name: str,
     database_id: int,
     uuid: Optional[str] = None,
+    schema: Optional[str] = None,
 ) -> tuple[Optional[SqlaTable], bool]:
     """Find a dataset by UUID first, then fall back to table_name + 
database_id.
 
+    Includes schema in the fallback lookup to prevent cross-schema collisions.
+
     This avoids unique constraint violations when a duplicate row exists.
 
     Args:
         table_name: The table name to look up
         database_id: The database ID
         uuid: Optional UUID to look up first
+        schema: Optional schema to include in fallback lookup
 
     Returns:
         A tuple of (dataset or None, found_by_uuid bool)
@@ -62,7 +66,7 @@ def _find_dataset(
     if not tbl:
         tbl = (
             db.session.query(SqlaTable)
-            .filter_by(table_name=table_name, database_id=database_id)
+            .filter_by(table_name=table_name, database_id=database_id, schema=schema)
             .first()
         )
 
@@ -127,7 +131,7 @@ def load_parquet_table(  # noqa: C901
     table_exists = database.has_table(Table(table_name, schema=schema))
     if table_exists and not force:
         logger.info("Table %s already exists, skipping data load", table_name)
-        tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid)
+        tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid, schema)
         if tbl:
             # Backfill UUID if found by table_name (not UUID) and UUID not set
             if uuid and not tbl.uuid and not found_by_uuid:
@@ -202,7 +206,7 @@ def load_parquet_table(  # noqa: C901
         logger.info("Loaded %d rows into %s", len(pdf), table_name)
 
     # Create or update SqlaTable metadata using UUID-first lookup
-    tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid)
+    tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid, schema)
 
     if not tbl:
         tbl = SqlaTable(table_name=table_name, database_id=database.id)
diff --git a/tests/unit_tests/examples/generic_loader_test.py b/tests/unit_tests/examples/generic_loader_test.py
index 74c3ddaa6db..c50b5781493 100644
--- a/tests/unit_tests/examples/generic_loader_test.py
+++ b/tests/unit_tests/examples/generic_loader_test.py
@@ -307,7 +307,7 @@ def test_find_dataset_returns_uuid_match_first(mock_db: MagicMock) -> None:
         uuid_row
     )
 
-    result, found_by_uuid = _find_dataset("test_table", 1, "target-uuid")
+    result, found_by_uuid = _find_dataset("test_table", 1, "target-uuid", "public")
 
     assert result is uuid_row
     assert found_by_uuid is True
@@ -333,7 +333,7 @@ def test_find_dataset_falls_back_to_table_name(mock_db: MagicMock) -> None:
 
     mock_db.session.query.return_value.filter_by.side_effect = filter_by_side_effect
 
-    result, found_by_uuid = _find_dataset("test_table", 1, "nonexistent-uuid")
+    result, found_by_uuid = _find_dataset("test_table", 1, "nonexistent-uuid", "public")
 
     assert result is tablename_row
     assert found_by_uuid is False
@@ -392,3 +392,41 @@ def test_load_parquet_table_duplicate_rows_table_missing(
         # Should return the UUID row, not try to backfill (which would collide)
         assert result is uuid_row
         assert result.uuid == "target-uuid"
+
+
+@patch("superset.examples.generic_loader.db")
+def test_find_dataset_distinguishes_schemas(mock_db: MagicMock) -> None:
+    """Test that _find_dataset uses schema to distinguish same-name tables.
+
+    Scenario:
+    - Row A: table_name="users", schema="schema_a", uuid=None
+    - Row B: table_name="users", schema="schema_b", uuid=None
+
+    Looking up "users" in "schema_b" should find Row B, not Row A.
+    """
+    from superset.examples.generic_loader import _find_dataset
+
+    # Row in schema_b (should be found)
+    schema_b_row = MagicMock()
+    schema_b_row.uuid = None
+    schema_b_row.table_name = "users"
+    schema_b_row.schema = "schema_b"
+
+    # No UUID lookup (uuid not provided), table_name lookup returns schema_b row
+    def filter_by_side_effect(**kwargs):
+        mock_result = MagicMock()
+        if "uuid" in kwargs:
+            mock_result.first.return_value = None
+        elif kwargs.get("schema") == "schema_b":
+            mock_result.first.return_value = schema_b_row
+        else:
+            mock_result.first.return_value = None  # schema_a not requested
+        return mock_result
+
+    mock_db.session.query.return_value.filter_by.side_effect = filter_by_side_effect
+
+    result, found_by_uuid = _find_dataset("users", 1, None, "schema_b")
+
+    assert result is schema_b_row
+    assert found_by_uuid is False
+    assert result.schema == "schema_b"

(superset) 03/05: fix(examples): add schema to _find_dataset lookup to prevent cross-schema collisions

Reply via email to