This is an automated email from the ASF dual-hosted git repository.
msyavuz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/superset.git
The following commit(s) were added to refs/heads/master by this push:
new fadab21493e fix(api): nan is not properly handled for athena connections (#37071)
fadab21493e is described below
commit fadab21493e1e40639f0e434bbcbef0bb2130c60
Author: Ramiro Aquino Romero <[email protected]>
AuthorDate: Thu Jan 22 11:29:09 2026 -0400
fix(api): nan is not properly handled for athena connections (#37071)
---
superset/dataframe.py | 7 +-
tests/unit_tests/dataframe_test.py | 196 +++++++++++++++++++++++++++++++++----
2 files changed, 184 insertions(+), 19 deletions(-)
diff --git a/superset/dataframe.py b/superset/dataframe.py
index 5f3c0dc7798..0e7cba0bc3c 100644
--- a/superset/dataframe.py
+++ b/superset/dataframe.py
@@ -41,6 +41,9 @@ def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
"""
Convert a DataFrame to a set of records.
+ NaN values are converted to None for JSON compatibility.
+ This handles division by zero and other operations that produce NaN.
+
:param dframe: the DataFrame to convert
:returns: a list of dictionaries reflecting each single row of the DataFrame
"""
@@ -52,6 +55,8 @@ def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
for record in records:
for key in record:
- record[key] = _convert_big_integers(record[key])
+ record[key] = (
+ None if pd.isna(record[key]) else _convert_big_integers(record[key])
+ )
return records
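
For context before the test-file diff: a minimal standalone sketch of the failure mode this hunk fixes. pandas keeps NaN through to_dict(orient="records"), which df_to_records builds on, and the pd.isna() guard added above is what maps it to None. The sketch uses only numpy/pandas and does not import df_to_records itself:

    import numpy as np
    import pandas as pd

    # A result like Athena's `SELECT 0.00 / 0.00` lands in the DataFrame as NaN,
    # and DataFrame.to_dict() preserves it as-is.
    df = pd.DataFrame({"test": [np.nan, 5.0]})
    records = df.to_dict(orient="records")
    print(records)  # [{'test': nan}, {'test': 5.0}]

    # The pd.isna() check added in the hunk above maps NaN to None:
    cleaned = [
        {key: (None if pd.isna(value) else value) for key, value in record.items()}
        for record in records
    ]
    print(cleaned)  # [{'test': None}, {'test': 5.0}]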
diff --git a/tests/unit_tests/dataframe_test.py b/tests/unit_tests/dataframe_test.py
index 0443bc1461c..934edea2047 100644
--- a/tests/unit_tests/dataframe_test.py
+++ b/tests/unit_tests/dataframe_test.py
@@ -17,18 +17,19 @@
# pylint: disable=unused-argument, import-outside-toplevel
from datetime import datetime
+import numpy as np
import pytest
from pandas import Timestamp
from pandas._libs.tslibs import NaT
from superset.dataframe import df_to_records
+from superset.db_engine_specs import BaseEngineSpec
+from superset.result_set import SupersetResultSet
from superset.superset_typing import DbapiDescription
+from superset.utils import json as superset_json
def test_df_to_records() -> None:
- from superset.db_engine_specs import BaseEngineSpec
- from superset.result_set import SupersetResultSet
-
data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
cursor_descr: DbapiDescription = [
(column, "string", None, None, None, None, False) for column in ("a",
"b", "c")
@@ -43,9 +44,6 @@ def test_df_to_records() -> None:
def test_df_to_records_NaT_type() -> None: # noqa: N802
- from superset.db_engine_specs import BaseEngineSpec
- from superset.result_set import SupersetResultSet
-
data = [(NaT,), (Timestamp("2023-01-06 20:50:31.749000+0000", tz="UTC"),)]
cursor_descr: DbapiDescription = [
("date", "timestamp with time zone", None, None, None, None, False)
@@ -60,9 +58,6 @@ def test_df_to_records_NaT_type() -> None: # noqa: N802
def test_df_to_records_mixed_emoji_type() -> None:
- from superset.db_engine_specs import BaseEngineSpec
- from superset.result_set import SupersetResultSet
-
data = [
("What's up?", "This is a string text", 1),
("What's up?", "This is a string with an 😍 added", 2),
@@ -100,9 +95,6 @@ def test_df_to_records_mixed_emoji_type() -> None:
def test_df_to_records_mixed_accent_type() -> None:
- from superset.db_engine_specs import BaseEngineSpec
- from superset.result_set import SupersetResultSet
-
data = [
("What's up?", "This is a string text", 1),
("What's up?", "This is a string with áccent", 2),
@@ -140,9 +132,6 @@ def test_df_to_records_mixed_accent_type() -> None:
def test_js_max_int() -> None:
- from superset.db_engine_specs import BaseEngineSpec
- from superset.result_set import SupersetResultSet
-
data = [(1, 1239162456494753670, "c1"), (2, 100, "c2")]
cursor_descr: DbapiDescription = [
("a", "int", None, None, None, None, False),
@@ -192,9 +181,6 @@ def test_js_max_int() -> None:
],
)
def test_max_pandas_timestamp(input_, expected) -> None:
- from superset.db_engine_specs import BaseEngineSpec
- from superset.result_set import SupersetResultSet
-
cursor_descr: DbapiDescription = [
("a", "datetime", None, None, None, None, False),
("b", "int", None, None, None, None, False),
@@ -203,3 +189,177 @@ def test_max_pandas_timestamp(input_, expected) -> None:
df = results.to_pandas_df()
assert df_to_records(df) == expected
+
+
+def test_df_to_records_with_nan_from_division_by_zero() -> None:
+ """Test that NaN values from division by zero are converted to None."""
+ # Simulate Athena query: select 0.00 / 0.00 as test
+ data = [(np.nan,), (5.0,), (np.nan,)]
+ cursor_descr: DbapiDescription = [("test", "double", None, None, None, None, False)]
+ results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+ df = results.to_pandas_df()
+
+ assert df_to_records(df) == [
+ {"test": None},
+ {"test": 5.0},
+ {"test": None},
+ ]
+
+
+def test_df_to_records_with_mixed_nan_and_valid_values() -> None:
+ """Test that NaN values are properly handled alongside valid numeric
data."""
+
+ # Simulate a query with multiple columns containing NaN values
+ data = [
+ ("row1", 10.5, np.nan, 100),
+ ("row2", np.nan, 20.3, 200),
+ ("row3", 30.7, 40.2, np.nan),
+ ("row4", np.nan, np.nan, np.nan),
+ ]
+ cursor_descr: DbapiDescription = [
+ ("name", "varchar", None, None, None, None, False),
+ ("value1", "double", None, None, None, None, False),
+ ("value2", "double", None, None, None, None, False),
+ ("value3", "int", None, None, None, None, False),
+ ]
+ results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+ df = results.to_pandas_df()
+
+ assert df_to_records(df) == [
+ {"name": "row1", "value1": 10.5, "value2": None, "value3": 100},
+ {"name": "row2", "value1": None, "value2": 20.3, "value3": 200},
+ {"name": "row3", "value1": 30.7, "value2": 40.2, "value3": None},
+ {"name": "row4", "value1": None, "value2": None, "value3": None},
+ ]
+
+
+def test_df_to_records_with_inf_and_nan() -> None:
+ """Test that both NaN and infinity values are handled correctly."""
+ # Test various edge cases: NaN, positive infinity, negative infinity
+ data = [
+ (np.nan, "division by zero"),
+ (np.inf, "positive infinity"),
+ (-np.inf, "negative infinity"),
+ (0.0, "zero"),
+ (42.5, "normal value"),
+ ]
+ cursor_descr: DbapiDescription = [
+ ("result", "double", None, None, None, None, False),
+ ("description", "varchar", None, None, None, None, False),
+ ]
+ results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+ df = results.to_pandas_df()
+
+ records = df_to_records(df)
+
+ # NaN should be converted to None
+ assert records[0]["result"] is None
+ assert records[0]["description"] == "division by zero"
+
+ # Infinity values should remain as-is (they're valid JSON)
+ assert records[1]["result"] == np.inf
+ assert records[2]["result"] == -np.inf
+
+ # Normal values should remain unchanged
+ assert records[3]["result"] == 0.0
+ assert records[4]["result"] == 42.5
+
+
+def test_df_to_records_nan_json_serialization() -> None:
+ """
+ Test that NaN values are properly converted to None for JSON serialization.
+
+ Without the pd.isna() check, np.nan values would be passed through to JSON
+ serialization, which either produces non-spec-compliant output or requires
+ special handling with ignore_nan flags throughout the codebase.
+
+ This test validates that our fix converts NaN to None for proper JSON
+ serialization.
+ """
+ # Simulate Athena query: SELECT 0.00 / 0.00 as test
+ data = [(np.nan,), (5.0,), (np.nan,)]
+ cursor_descr: DbapiDescription = [("test", "double", None, None, None, None, False)]
+ results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+ df = results.to_pandas_df()
+
+ # Get records with our fix
+ records = df_to_records(df)
+
+ # Verify NaN values are converted to None
+ assert records == [
+ {"test": None}, # NaN converted to None
+ {"test": 5.0},
+ {"test": None}, # NaN converted to None
+ ]
+
+ # This should succeed with valid, spec-compliant JSON
+ json_output = superset_json.dumps(records)
+ parsed = superset_json.loads(json_output)
+
+ # Verify JSON serialization works correctly
+ assert parsed == records
+
+ # Demonstrate what happens WITHOUT the fix
+ # (simulate the old behavior by directly using to_dict)
+ records_without_fix = df.to_dict(orient="records")
+
+ # Verify the records contain actual NaN values (not None)
+ assert np.isnan(records_without_fix[0]["test"])
+ assert records_without_fix[1]["test"] == 5.0
+ assert np.isnan(records_without_fix[2]["test"])
+
+ # Demonstrate the actual bug: without the fix, ignore_nan=False raises ValueError
+ # This is the error users would see without our fix
+ with pytest.raises(
+ ValueError, match="Out of range float values are not JSON compliant"
+ ):
+ superset_json.dumps(records_without_fix, ignore_nan=False)
+
+ # With ignore_nan=True, it works by converting NaN to null
+ # But this requires the flag to be set everywhere - our fix eliminates this need
+ json_with_ignore = superset_json.dumps(records_without_fix, ignore_nan=True)
+ parsed_with_ignore = superset_json.loads(json_with_ignore)
+ # The output is the same, but our fix doesn't require the ignore_nan flag
+ assert parsed_with_ignore[0]["test"] is None
+
+
+def test_df_to_records_with_json_serialization_like_sql_lab() -> None:
+ """
+ Test that mimics the actual SQL Lab serialization flow.
+ This shows how the fix prevents errors in the real usage path.
+ """
+ # Simulate query with NaN results
+ data = [
+ ("user1", 100.0, np.nan),
+ ("user2", np.nan, 50.0),
+ ("user3", 75.0, 25.0),
+ ]
+ cursor_descr: DbapiDescription = [
+ ("name", "varchar", None, None, None, None, False),
+ ("value1", "double", None, None, None, None, False),
+ ("value2", "double", None, None, None, None, False),
+ ]
+ results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+ df = results.to_pandas_df()
+
+ # Mimic sql_lab.py:360 - this is where df_to_records is used
+ records = df_to_records(df) or []
+
+ # Mimic sql_lab.py:332 - JSON serialization with Superset's custom json.dumps
+ # This should work without errors
+ json_str = superset_json.dumps(
+ records, default=superset_json.json_iso_dttm_ser, ignore_nan=True
+ )
+
+ # Verify it's valid JSON and NaN values are properly handled as null
+ parsed = superset_json.loads(json_str)
+ assert parsed[0]["value2"] is None # NaN became null
+ assert parsed[1]["value1"] is None # NaN became null
+ assert parsed[0]["value1"] == 100.0
+
+ # Also verify it works without ignore_nan flag (since we convert NaN to None)
+ json_str_no_flag = superset_json.dumps(
+ records, default=superset_json.json_iso_dttm_ser, ignore_nan=False
+ )
+ parsed_no_flag = superset_json.loads(json_str_no_flag)
+ assert parsed_no_flag == parsed # Same result
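
A closing note on why None matters here: the JSON spec has no NaN literal, so encoders either emit the non-compliant token NaN or refuse outright. A small stdlib-only sketch of the same behavior the tests above exercise (using json rather than superset_json, which the ignore_nan flag in these tests suggests is backed by simplejson):

    import json

    # The stdlib emits a non-spec token by default...
    print(json.dumps({"test": float("nan")}))  # {"test": NaN} -- not valid JSON
    # ...and refuses when asked to be strict, mirroring the test's error:
    try:
        json.dumps({"test": float("nan")}, allow_nan=False)
    except ValueError as exc:
        print(exc)  # Out of range float values are not JSON compliant
    # None serializes cleanly, which is why the fix converts NaN up front:
    print(json.dumps({"test": None}))  # {"test": null}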