This is an automated email from the ASF dual-hosted git repository.

elizabeth pushed a commit to branch elizabeth/fix-resize-bug
in repository https://gitbox.apache.org/repos/asf/superset.git
commit b404dceb5c2e6ff7bcb50189541514fd6724ca2c
Author: Maxime Beauchemin <[email protected]>
AuthorDate: Fri Jul 25 16:26:32 2025 -0700

    feat(timeseries): enhance 'Series Limit' to support grouping the long tail (#34308)
---
 LLMS.md                                            |   8 +-
 .../src/sections/echartsTimeSeriesQuery.tsx        |   2 +-
 .../src/shared-controls/sharedControls.tsx         |  14 ++
 .../superset-ui-core/src/query/buildQueryObject.ts |   2 +
 .../test/MixedTimeseries/buildQuery.test.ts        |   2 +
 .../src/explore/controlPanels/sections.tsx         |   3 +-
 .../UploadDataModel/UploadDataModal.test.tsx       |   2 +-
 superset/charts/schemas.py                         |   8 ++
 superset/common/query_object.py                    |   3 +
 superset/models/helpers.py                         | 123 ++++++++++++++++-
 tests/unit_tests/models/helpers_test.py            | 150 +++++++++++++++++++++
 tests/unit_tests/queries/query_object_test.py      |   1 +
 12 files changed, 309 insertions(+), 9 deletions(-)

diff --git a/LLMS.md b/LLMS.md
index f59caa840b..22d58e691b 100644
--- a/LLMS.md
+++ b/LLMS.md
@@ -141,9 +141,11 @@ curl -f http://localhost:8088/health || echo "❌ Setup required - see https://s
 # Install hooks
 pre-commit install
 
-# Quick validation (only changed files - faster)
-pre-commit run  # Staged files only - PREFERRED for iterations
-pre-commit run --all-files  # Full codebase - slower, use sparingly
+# IMPORTANT: Stage your changes first!
+git add .  # Pre-commit only checks staged files
+
+# Quick validation (faster than --all-files)
+pre-commit run  # Staged files only
 pre-commit run mypy  # Python type checking
 pre-commit run prettier  # Code formatting
 pre-commit run eslint  # Frontend linting
diff --git a/superset-frontend/packages/superset-ui-chart-controls/src/sections/echartsTimeSeriesQuery.tsx b/superset-frontend/packages/superset-ui-chart-controls/src/sections/echartsTimeSeriesQuery.tsx
index b8ec643c0b..7a15bbfc9a 100644
--- a/superset-frontend/packages/superset-ui-chart-controls/src/sections/echartsTimeSeriesQuery.tsx
+++ b/superset-frontend/packages/superset-ui-chart-controls/src/sections/echartsTimeSeriesQuery.tsx
@@ -30,7 +30,7 @@ const controlsWithoutXAxis: ControlSetRow[] = [
   ['groupby'],
   [contributionModeControl],
   ['adhoc_filters'],
-  ['limit'],
+  ['limit', 'group_others_when_limit_reached'],
   ['timeseries_limit_metric'],
   ['order_desc'],
   ['row_limit'],
diff --git a/superset-frontend/packages/superset-ui-chart-controls/src/shared-controls/sharedControls.tsx b/superset-frontend/packages/superset-ui-chart-controls/src/shared-controls/sharedControls.tsx
index 6ceff6a444..f456d98e22 100644
--- a/superset-frontend/packages/superset-ui-chart-controls/src/shared-controls/sharedControls.tsx
+++ b/superset-frontend/packages/superset-ui-chart-controls/src/shared-controls/sharedControls.tsx
@@ -283,6 +283,19 @@ const series_limit: SharedControlConfig<'SelectControl'> = {
   ),
 };
 
+const group_others_when_limit_reached: SharedControlConfig<'CheckboxControl'> =
+  {
+    type: 'CheckboxControl',
+    label: t('Group remaining as "Others"'),
+    default: false,
+    description: t(
+      'Groups remaining series into an "Others" category when series limit is reached. ' +
+        'This prevents incomplete time series data from being displayed.',
+    ),
+    visibility: ({ form_data }: { form_data: any }) =>
+      Boolean(form_data?.limit || form_data?.series_limit),
+  };
+
 const y_axis_format: SharedControlConfig<'SelectControl', SelectDefaultOption> =
   {
     type: 'SelectControl',
diff --git a/superset-frontend/packages/superset-ui-core/src/query/buildQueryObject.ts b/superset-frontend/packages/superset-ui-core/src/query/buildQueryObject.ts
index 725c2c87aa..3f3ec3615e 100644
--- a/superset-frontend/packages/superset-ui-core/src/query/buildQueryObject.ts
+++ b/superset-frontend/packages/superset-ui-core/src/query/buildQueryObject.ts
@@ -63,6 +63,7 @@ export default function buildQueryObject<T extends QueryFormData>(
     series_columns,
     series_limit,
     series_limit_metric,
+    group_others_when_limit_reached,
    ...residualFormData
  } = formData;
  const {
@@ -128,6 +129,7 @@ export default function buildQueryObject<T extends QueryFormData>(
      normalizeSeriesLimitMetric(series_limit_metric) ??
      timeseries_limit_metric ??
      undefined,
+    group_others_when_limit_reached: group_others_when_limit_reached ?? false,
    order_desc: typeof order_desc === 'undefined' ? true : order_desc,
    url_params: url_params || undefined,
    custom_params,
diff --git a/superset-frontend/plugins/plugin-chart-echarts/test/MixedTimeseries/buildQuery.test.ts b/superset-frontend/plugins/plugin-chart-echarts/test/MixedTimeseries/buildQuery.test.ts
index eefdc298d8..7c265bb31e 100644
--- a/superset-frontend/plugins/plugin-chart-echarts/test/MixedTimeseries/buildQuery.test.ts
+++ b/superset-frontend/plugins/plugin-chart-echarts/test/MixedTimeseries/buildQuery.test.ts
@@ -107,6 +107,7 @@ test('should compile query object A', () => {
     series_columns: ['foo'],
     series_limit: 5,
     series_limit_metric: undefined,
+    group_others_when_limit_reached: false,
     url_params: {},
     custom_params: {},
     custom_form_data: {},
@@ -166,6 +167,7 @@ test('should compile query object B', () => {
     series_columns: [],
     series_limit: 0,
     series_limit_metric: undefined,
+    group_others_when_limit_reached: false,
     url_params: {},
     custom_params: {},
     custom_form_data: {},
diff --git a/superset-frontend/src/explore/controlPanels/sections.tsx b/superset-frontend/src/explore/controlPanels/sections.tsx
index 6ff7dd9fac..724e77ac8e 100644
--- a/superset-frontend/src/explore/controlPanels/sections.tsx
+++ b/superset-frontend/src/explore/controlPanels/sections.tsx
@@ -99,7 +99,8 @@ export const NVD3TimeSeries: ControlPanelSectionConfig[] = [
       ['metrics'],
       ['adhoc_filters'],
       ['groupby'],
-      ['limit', 'timeseries_limit_metric'],
+      ['limit', 'group_others_when_limit_reached'],
+      ['timeseries_limit_metric'],
       ['order_desc'],
       [
         {
diff --git a/superset-frontend/src/features/databases/UploadDataModel/UploadDataModal.test.tsx b/superset-frontend/src/features/databases/UploadDataModel/UploadDataModal.test.tsx
index 3b268408e7..b7292087b9 100644
--- a/superset-frontend/src/features/databases/UploadDataModel/UploadDataModal.test.tsx
+++ b/superset-frontend/src/features/databases/UploadDataModel/UploadDataModal.test.tsx
@@ -409,7 +409,7 @@ describe('UploadDataModal - Database and Schema Population', () => {
     await userEvent.click(selectSchema);
     await waitFor(() => screen.getAllByText('schema1'));
     await waitFor(() => screen.getAllByText('schema2'));
-  });
+  }, 60000);
 });
 
 describe('UploadDataModal - Form Validation', () => {
diff --git a/superset/charts/schemas.py b/superset/charts/schemas.py
index 7d004431b0..a8911ce13d 100644
--- a/superset/charts/schemas.py
+++ b/superset/charts/schemas.py
@@ -1257,6 +1257,14 @@ class ChartDataQueryObjectSchema(Schema):
         },
         allow_none=True,
     )
+    group_others_when_limit_reached = fields.Boolean(
+        metadata={
+            "description": "When true, groups remaining series into an 'Others' "
+            "category when series limit is reached. Prevents incomplete data."
+        },
+        load_default=False,
+        allow_none=True,
+    )
     timeseries_limit = fields.Integer(
         metadata={
             "description": "Maximum row count for timeseries queries. "
diff --git a/superset/common/query_object.py b/superset/common/query_object.py
index 8d828d0576..eb696f00b1 100644
--- a/superset/common/query_object.py
+++ b/superset/common/query_object.py
@@ -131,6 +131,7 @@ class QueryObject:  # pylint: disable=too-many-instance-attributes
         series_columns: list[Column] | None = None,
         series_limit: int = 0,
         series_limit_metric: Metric | None = None,
+        group_others_when_limit_reached: bool = False,
         time_range: str | None = None,
         time_shift: str | None = None,
         **kwargs: Any,
@@ -154,6 +155,7 @@ class QueryObject:  # pylint: disable=too-many-instance-attributes
         self._init_series_columns(series_columns, metrics, is_timeseries)
         self.series_limit = series_limit
         self.series_limit_metric = series_limit_metric
+        self.group_others_when_limit_reached = group_others_when_limit_reached
         self.time_range = time_range
         self.time_shift = time_shift
         self.from_dttm = kwargs.get("from_dttm")
@@ -356,6 +358,7 @@ class QueryObject:  # pylint: disable=too-many-instance-attributes
             "series_columns": self.series_columns,
             "series_limit": self.series_limit,
             "series_limit_metric": self.series_limit_metric,
+            "group_others_when_limit_reached": self.group_others_when_limit_reached,
             "to_dttm": self.to_dttm,
             "time_shift": self.time_shift,
         }
diff --git a/superset/models/helpers.py b/superset/models/helpers.py
index a555509d54..6b7c07b784 100644
--- a/superset/models/helpers.py
+++ b/superset/models/helpers.py
@@ -26,7 +26,7 @@ import re
 import uuid
 from collections.abc import Hashable
 from datetime import datetime, timedelta
-from typing import Any, cast, NamedTuple, Optional, TYPE_CHECKING, Union
+from typing import Any, Callable, cast, NamedTuple, Optional, TYPE_CHECKING, Union
 
 import dateutil.parser
 import humanize
@@ -1258,6 +1258,65 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
 
         return or_(*groups)
 
+    def _apply_series_others_grouping(
+        self,
+        select_exprs: list[Any],
+        groupby_all_columns: dict[str, Any],
+        groupby_series_columns: dict[str, Any],
+        condition_factory: Callable[[str, Any], Any],
+    ) -> tuple[list[Any], dict[str, Any]]:
+        """
+        Apply "Others" grouping to series columns in both SELECT and GROUP BY clauses.
+
+        This method encapsulates the common logic for replacing series columns with
+        CASE expressions that group remaining series into an "Others" category when
+        the series limit is reached.
+
+        Args:
+            select_exprs: List of SELECT expressions to modify
+            groupby_all_columns: Dict of GROUP BY columns to modify
+            groupby_series_columns: Dict of series columns to apply Others grouping to
+            condition_factory: Function that takes (col_name, original_expr) and returns
+                the condition for when to keep original value vs use "Others"
+
+        Returns:
+            Tuple of (modified_select_exprs, modified_groupby_all_columns)
+        """
+        # Modify SELECT expressions
+        modified_select_exprs = []
+        for expr in select_exprs:
+            if hasattr(expr, "name") and expr.name in groupby_series_columns:
+                # Create condition for this column using the factory function
+                condition = condition_factory(expr.name, expr)
+
+                # Create CASE expression: condition true -> original, else "Others"
+                case_expr = sa.case(
+                    [(condition, expr)], else_=sa.literal_column("Others")
+                )
+                case_expr = self.make_sqla_column_compatible(case_expr, expr.name)
+                modified_select_exprs.append(case_expr)
+            else:
+                modified_select_exprs.append(expr)
+
+        # Modify GROUP BY expressions
+        modified_groupby_all_columns = {}
+        for col_name, gby_expr in groupby_all_columns.items():
+            if col_name in groupby_series_columns:
+                # Create condition for this column using the factory function
+                condition = condition_factory(col_name, gby_expr)
+
+                # Create CASE expression for groupby
+                case_expr = sa.case(
+                    [(condition, gby_expr)],
+                    else_=sa.literal_column("Others"),
+                )
+                case_expr = self.make_sqla_column_compatible(case_expr, col_name)
+                modified_groupby_all_columns[col_name] = case_expr
+            else:
+                modified_groupby_all_columns[col_name] = gby_expr
+
+        return modified_select_exprs, modified_groupby_all_columns
+
     def dttm_sql_literal(self, dttm: datetime, col: "TableColumn") -> str:
         """Convert datetime object to a SQL expression string"""
 
@@ -1449,6 +1508,7 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
         series_columns: Optional[list[Column]] = None,
         series_limit: Optional[int] = None,
         series_limit_metric: Optional[Metric] = None,
+        group_others_when_limit_reached: bool = False,
         row_limit: Optional[int] = None,
         row_offset: Optional[int] = None,
         timeseries_limit: Optional[int] = None,
@@ -2040,7 +2100,43 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
                     col_name = db_engine_spec.make_label_compatible(gby_name + "__")
                     on_clause.append(gby_obj == sa.column(col_name))
 
-                tbl = tbl.join(subq.alias(SERIES_LIMIT_SUBQ_ALIAS), and_(*on_clause))
+                # Use LEFT JOIN when grouping others, INNER JOIN otherwise
+                if group_others_when_limit_reached:
+                    # Create the alias once and reuse it
+                    subq_alias = subq.alias(SERIES_LIMIT_SUBQ_ALIAS)
+                    tbl = tbl.join(
+                        subq_alias,
+                        and_(*on_clause),
+                        isouter=True,
+                    )
+
+                    # Apply Others grouping using the refactored method
+                    def _create_join_condition(col_name: str, expr: Any) -> Any:
+                        # Get the corresponding column from the subquery
+                        subq_col_name = db_engine_spec.make_label_compatible(
+                            col_name + "__"
+                        )
+                        # Reference the column from the already-created aliased subquery
+                        subq_col = subq_alias.c[subq_col_name]
+                        return subq_col.is_not(None)
+
+                    select_exprs, groupby_all_columns = (
+                        self._apply_series_others_grouping(
+                            select_exprs,
+                            groupby_all_columns,
+                            groupby_series_columns,
+                            _create_join_condition,
+                        )
+                    )
+
+                    # Reconstruct query with modified expressions
+                    qry = sa.select(select_exprs)
+                    if groupby_all_columns:
+                        qry = qry.group_by(*groupby_all_columns.values())
+                else:
+                    tbl = tbl.join(
+                        subq.alias(SERIES_LIMIT_SUBQ_ALIAS), and_(*on_clause)
+                    )
             else:
                 if series_limit_metric:
                     orderby = [
@@ -2081,7 +2177,28 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
                 top_groups = self._get_top_groups(
                     result.df, dimensions, groupby_series_columns, columns_by_name
                 )
-                qry = qry.where(top_groups)
+
+                if group_others_when_limit_reached:
+                    # Apply Others grouping using the refactored method
+                    def _create_top_groups_condition(col_name: str, expr: Any) -> Any:
+                        return top_groups
+
+                    select_exprs, groupby_all_columns = (
+                        self._apply_series_others_grouping(
+                            select_exprs,
+                            groupby_all_columns,
+                            groupby_series_columns,
+                            _create_top_groups_condition,
+                        )
+                    )
+
+                    # Reconstruct query with modified expressions
+                    qry = sa.select(select_exprs)
+                    if groupby_all_columns:
+                        qry = qry.group_by(*groupby_all_columns.values())
+                else:
+                    # Original behavior: filter to only top groups
+                    qry = qry.where(top_groups)
 
         qry = qry.select_from(tbl)
 
diff --git a/tests/unit_tests/models/helpers_test.py b/tests/unit_tests/models/helpers_test.py
index d0dfef43b8..05d837437b 100644
--- a/tests/unit_tests/models/helpers_test.py
+++ b/tests/unit_tests/models/helpers_test.py
@@ -210,3 +210,153 @@ def test_values_for_column_double_percents(
 
     assert called_sql.compare(expected_sql) is True
     assert called_conn.engine == engine
+
+
+def test_apply_series_others_grouping(database: Database) -> None:
+    """
+    Test the `_apply_series_others_grouping` method.
+
+    This method should replace series columns with CASE expressions that
+    group remaining series into an "Others" category based on a condition.
+    """
+    from unittest.mock import Mock
+
+    from superset.connectors.sqla.models import SqlaTable, TableColumn
+
+    # Create a mock table for testing
+    table = SqlaTable(
+        database=database,
+        schema=None,
+        table_name="test_table",
+        columns=[
+            TableColumn(column_name="category", type="TEXT"),
+            TableColumn(column_name="metric_col", type="INTEGER"),
+            TableColumn(column_name="other_col", type="TEXT"),
+        ],
+    )
+
+    # Mock SELECT expressions
+    category_expr = Mock()
+    category_expr.name = "category"
+    metric_expr = Mock()
+    metric_expr.name = "metric_col"
+    other_expr = Mock()
+    other_expr.name = "other_col"
+
+    select_exprs = [category_expr, metric_expr, other_expr]
+
+    # Mock GROUP BY columns
+    groupby_all_columns = {
+        "category": category_expr,
+        "other_col": other_expr,
+    }
+
+    # Define series columns (only category should be modified)
+    groupby_series_columns = {"category": category_expr}
+
+    # Create a condition factory that always returns True
+    def always_true_condition(col_name: str, expr) -> bool:
+        return True
+
+    # Mock the make_sqla_column_compatible method
+    def mock_make_compatible(expr, name=None):
+        mock_result = Mock()
+        mock_result.name = name
+        return mock_result
+
+    with patch.object(
+        table, "make_sqla_column_compatible", side_effect=mock_make_compatible
+    ):
+        # Call the method
+        result_select_exprs, result_groupby_columns = (
+            table._apply_series_others_grouping(
+                select_exprs,
+                groupby_all_columns,
+                groupby_series_columns,
+                always_true_condition,
+            )
+        )
+
+        # Verify SELECT expressions
+        assert len(result_select_exprs) == 3
+
+        # Category (series column) should be replaced with CASE expression
+        category_result = result_select_exprs[0]
+        assert category_result.name == "category"  # Should be made compatible
+
+        # Metric (non-series column) should remain unchanged
+        assert result_select_exprs[1] == metric_expr
+
+        # Other (non-series column) should remain unchanged
+        assert result_select_exprs[2] == other_expr
+
+        # Verify GROUP BY columns
+        assert len(result_groupby_columns) == 2
+
+        # Category (series column) should be replaced with CASE expression
+        assert "category" in result_groupby_columns
+        category_groupby_result = result_groupby_columns["category"]
+        assert category_groupby_result.name == "category"  # Should be made compatible
+
+        # Other (non-series column) should remain unchanged
+        assert result_groupby_columns["other_col"] == other_expr
+
+
+def test_apply_series_others_grouping_with_false_condition(database: Database) -> None:
+    """
+    Test the `_apply_series_others_grouping` method with a condition that returns False.
+
+    This should result in CASE expressions that always use "Others".
+    """
+    from unittest.mock import Mock
+
+    from superset.connectors.sqla.models import SqlaTable, TableColumn
+
+    # Create a mock table for testing
+    table = SqlaTable(
+        database=database,
+        schema=None,
+        table_name="test_table",
+        columns=[TableColumn(column_name="category", type="TEXT")],
+    )
+
+    # Mock SELECT expressions
+    category_expr = Mock()
+    category_expr.name = "category"
+    select_exprs = [category_expr]
+
+    # Mock GROUP BY columns
+    groupby_all_columns = {"category": category_expr}
+    groupby_series_columns = {"category": category_expr}
+
+    # Create a condition factory that always returns False
+    def always_false_condition(col_name: str, expr) -> bool:
+        return False
+
+    # Mock the make_sqla_column_compatible method
+    def mock_make_compatible(expr, name=None):
+        mock_result = Mock()
+        mock_result.name = name
+        return mock_result
+
+    with patch.object(
+        table, "make_sqla_column_compatible", side_effect=mock_make_compatible
+    ):
+        # Call the method
+        result_select_exprs, result_groupby_columns = (
+            table._apply_series_others_grouping(
+                select_exprs,
+                groupby_all_columns,
+                groupby_series_columns,
+                always_false_condition,
+            )
+        )
+
+        # Verify that the expressions were replaced (we can't test SQL generation
+        # in a unit test, but we can verify the structure changed)
+        assert len(result_select_exprs) == 1
+        assert result_select_exprs[0].name == "category"
+
+        assert len(result_groupby_columns) == 1
+        assert "category" in result_groupby_columns
+        assert result_groupby_columns["category"].name == "category"
diff --git a/tests/unit_tests/queries/query_object_test.py b/tests/unit_tests/queries/query_object_test.py
index 8f42a460e0..059711f16b 100644
--- a/tests/unit_tests/queries/query_object_test.py
+++ b/tests/unit_tests/queries/query_object_test.py
@@ -44,6 +44,7 @@ def test_default_query_object_to_dict():
         "filter": [],
         "from_dttm": None,
         "granularity": None,
+        "group_others_when_limit_reached": False,
         "inner_from_dttm": None,
         "inner_to_dttm": None,
         "is_rowcount": False,
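
For readers skimming the diff, the following is a rough, standalone sketch (not Superset code) of the SQL shape the LEFT JOIN branch in helpers.py is aiming for: the series-limit subquery is outer-joined back to the fact table, and a CASE expression folds any series that did not make the top N into an "Others" bucket in both SELECT and GROUP BY. All table and column names here are invented, and the sketch uses plain SQLAlchemy 1.4+ calls (tuple-form sa.case, sa.literal) rather than the exact helpers used in the commit.

    # Illustrative only: hypothetical "sales" table standing in for the datasource.
    import sqlalchemy as sa

    metadata = sa.MetaData()
    sales = sa.Table(
        "sales",
        metadata,
        sa.Column("category", sa.String),
        sa.Column("ts", sa.DateTime),
        sa.Column("metric", sa.Integer),
    )

    # Top-N series, analogous to the SERIES_LIMIT_SUBQ_ALIAS subquery.
    top_series = (
        sa.select(sales.c.category.label("category__"))
        .group_by(sales.c.category)
        .order_by(sa.func.sum(sales.c.metric).desc())
        .limit(5)
        .subquery("series_limit")
    )

    # LEFT JOIN keeps rows outside the top N; the CASE folds them into "Others".
    category_or_others = sa.case(
        (top_series.c.category__.is_not(None), sales.c.category),
        else_=sa.literal("Others"),
    ).label("category")

    qry = (
        sa.select(category_or_others, sa.func.sum(sales.c.metric).label("metric"))
        .select_from(
            sales.join(
                top_series,
                sales.c.category == top_series.c.category__,
                isouter=True,
            )
        )
        .group_by(category_or_others)
    )
    # Renders roughly: SELECT CASE WHEN series_limit.category__ IS NOT NULL
    # THEN sales.category ELSE 'Others' END AS category, sum(sales.metric) ...
    print(qry)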
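On the API side, the new flag simply rides along in each query object sent to the chart-data endpoint. A minimal, hypothetical payload sketch follows; only group_others_when_limit_reached comes from this commit, while the datasource id, columns, metrics, and time range are placeholders.

    # Hypothetical body for POST /api/v1/chart/data; field values are illustrative.
    query_context = {
        "datasource": {"id": 1, "type": "table"},
        "queries": [
            {
                "columns": ["category"],
                "metrics": ["count"],
                "series_columns": ["category"],
                "series_limit": 5,
                "series_limit_metric": "count",
                # New in this commit: fold the long tail into an "Others" series
                "group_others_when_limit_reached": True,
                "time_range": "Last month",
            }
        ],
        "result_format": "json",
        "result_type": "full",
    }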
