villebro commented on code in PR #37815: URL: https://github.com/apache/superset/pull/37815#discussion_r2897602652
########## superset-core/src/superset_core/semantic_layers/semantic_layer.py: ########## @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Generic, TypeVar + +from pydantic import BaseModel +from superset_core.semantic_layers.semantic_view import SemanticView + +ConfigT = TypeVar("ConfigT", bound=BaseModel) +SemanticViewT = TypeVar("SemanticViewT", bound="SemanticView") + + +class SemanticLayer(ABC, Generic[ConfigT, SemanticViewT]): + """ + Abstract base class for semantic layers. + """ + + @classmethod + @abstractmethod + def from_configuration( + cls, + configuration: dict[str, Any], + ) -> SemanticLayer[ConfigT, SemanticViewT]: + """ + Create a semantic layer from its configuration. + """ + + @classmethod + @abstractmethod + def get_configuration_schema( + cls, + configuration: ConfigT | None = None, + ) -> dict[str, Any]: + """ + Get the JSON schema for the configuration needed to add the semantic layer. + + A partial configuration `configuration` can be sent to improve the schema, + allowing for progressive validation and better UX. For example, a semantic + layer might require: + + - auth information + - a database + + If the user provides the auth information, a client can send the partial + configuration to this method, and the resulting JSON schema would include + the list of databases the user has access to, allowing a dropdown to be + populated. + + The Snowflake semantic layer has an example implementation of this method, where + database and schema names are populated based on the provided connection info. + """ + + @classmethod + @abstractmethod + def get_runtime_schema( + cls, + configuration: ConfigT, + runtime_data: dict[str, Any] | None = None, + ) -> dict[str, Any]: + """ + Get the JSON schema for the runtime parameters needed to load semantic views. + + This returns the schema needed to connect to a semantic view given the + configuration for the semantic layer. For example, a semantic layer might + be configured by: + + - auth information + - an optional database + + If the user does not provide a database when creating the semantic layer, the + runtime schema would require the database name to be provided before loading any + semantic views. This allows users to create semantic layers that connect to a + specific database (or project, account, etc.), or that allow users to select it + at query time. + + The Snowflake semantic layer has an example implementation of this method, where + database and schema names are required if they were not provided in the initial + configuration. + """ + + @abstractmethod + def get_semantic_views( + self, + runtime_configuration: dict[str, Any], + ) -> set[SemanticViewT]: + """ + Get the semantic views available in the semantic layer. + + The runtime configuration can provide information like a given project or + schema, used to restrict the semantic views returned. + """ Review Comment: In the other stubs we usually raise `NotImplementedError`. Something like `raise NotImplementedError("Method will be replaced during initialization")` could be good here. ########## superset-core/src/superset_core/semantic_layers/types.py: ########## @@ -0,0 +1,327 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from dataclasses import dataclass +from datetime import date, datetime, time, timedelta +from functools import total_ordering +from typing import Type as TypeOf + +import pyarrow as pa + +__all__ = [ + "BINARY", + "BOOLEAN", + "DATE", + "DATETIME", + "DECIMAL", + "Day", + "Dimension", + "Hour", + "INTEGER", + "INTERVAL", + "Minute", + "Month", + "NUMBER", + "OBJECT", + "Quarter", + "Second", + "STRING", + "TIME", + "Week", + "Year", +] + + +class Type: + """ + Base class for types. + """ + + +class INTEGER(Type): + """ + Represents an integer type. + """ + + +class NUMBER(Type): + """ + Represents a number type. + """ + + +class DECIMAL(Type): + """ + Represents a decimal type. + """ + + +class STRING(Type): + """ + Represents a string type. + """ + + +class BOOLEAN(Type): + """ + Represents a boolean type. + """ + + +class DATE(Type): + """ + Represents a date type. + """ + + +class TIME(Type): + """ + Represents a time type. + """ + + +class DATETIME(DATE, TIME): + """ + Represents a datetime type. + """ + + +class INTERVAL(Type): + """ + Represents an interval type. + """ + + +class OBJECT(Type): + """ + Represents an object type. + """ + + +class BINARY(Type): + """ + Represents a binary type. + """ + + +@dataclass(frozen=True) +@total_ordering +class Grain: + """ + Base class for time and date grains with comparison support. + + Attributes: + name: Human-readable name of the grain (e.g., "Second") + representation: ISO 8601 representation (e.g., "PT1S") + value: Time period as a timedelta + """ + + name: str + representation: str + value: timedelta + + def __eq__(self, other: object) -> bool: + if isinstance(other, Grain): + return self.value == other.value + return NotImplemented + + def __lt__(self, other: object) -> bool: + if isinstance(other, Grain): + return self.value < other.value + return NotImplemented + + def __hash__(self) -> int: + return hash((self.name, self.representation, self.value)) + + +class Second(Grain): + name = "Second" + representation = "PT1S" + value = timedelta(seconds=1) + + +class Minute(Grain): + name = "Minute" + representation = "PT1M" + value = timedelta(minutes=1) + + +class Hour(Grain): + name = "Hour" + representation = "PT1H" + value = timedelta(hours=1) + + +class Day(Grain): + name = "Day" + representation = "P1D" + value = timedelta(days=1) + + +class Week(Grain): + name = "Week" + representation = "P1W" + value = timedelta(weeks=1) + + +class Month(Grain): + name = "Month" + representation = "P1M" + value = timedelta(days=30) + + +class Quarter(Grain): + name = "Quarter" + representation = "P3M" + value = timedelta(days=90) + + +class Year(Grain): + name = "Year" + representation = "P1Y" + value = timedelta(days=365) + + +@dataclass(frozen=True) +class Dimension: + id: str + name: str + type: TypeOf[Type] + + definition: str | None = None + description: str | None = None + grain: TypeOf[Grain] | None = None + + +@dataclass(frozen=True) +class Metric: + id: str + name: str + type: TypeOf[Type] + + definition: str | None + description: str | None = None + + +@dataclass(frozen=True) +class AdhocExpression: + id: str + definition: str + + +class Operator(str, enum.Enum): + EQUALS = "=" + NOT_EQUALS = "!=" + GREATER_THAN = ">" + LESS_THAN = "<" + GREATER_THAN_OR_EQUAL = ">=" + LESS_THAN_OR_EQUAL = "<=" + IN = "IN" + NOT_IN = "NOT IN" + LIKE = "LIKE" + NOT_LIKE = "NOT LIKE" + IS_NULL = "IS NULL" + IS_NOT_NULL = "IS NOT NULL" + ADHOC = "ADHOC" + + +FilterValues = str | int | float | bool | datetime | date | time | timedelta | None + + +class PredicateType(enum.Enum): + WHERE = "WHERE" + HAVING = "HAVING" + + +@dataclass(frozen=True, order=True) +class Filter: + type: PredicateType + column: Dimension | Metric | None + operator: Operator + value: FilterValues | frozenset[FilterValues] Review Comment: Should we distinguish `WHERE` and `HAVING` filters? I think they're slightly different, so it might make sense to break them up (I'm not very opinionated though, so this is ok for me) ########## superset-core/src/superset_core/semantic_layers/types.py: ########## @@ -0,0 +1,327 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from dataclasses import dataclass +from datetime import date, datetime, time, timedelta +from functools import total_ordering +from typing import Type as TypeOf + +import pyarrow as pa + +__all__ = [ Review Comment: I vote to remove it ########## superset-core/src/superset_core/semantic_layers/semantic_view.py: ########## @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from abc import ABC, abstractmethod + +from superset_core.semantic_layers.types import ( + Dimension, + Filter, + GroupLimit, + Metric, + OrderTuple, + SemanticResult, +) + + +# TODO (betodealmeida): move to the extension JSON +class SemanticViewFeature(enum.Enum): + """ + Custom features supported by semantic layers. + """ + + ADHOC_EXPRESSIONS_IN_ORDERBY = "ADHOC_EXPRESSIONS_IN_ORDERBY" + GROUP_LIMIT = "GROUP_LIMIT" + GROUP_OTHERS = "GROUP_OTHERS" + + +class SemanticView(ABC): + """ + Abstract base class for semantic views. + """ + + features: frozenset[SemanticViewFeature] + + @abstractmethod + def uid(self) -> str: + """ + Returns a unique identifier for the semantic view. + """ + + @abstractmethod + def get_dimensions(self) -> set[Dimension]: + """ + Get the dimensions defined in the semantic view. + """ + + @abstractmethod + def get_metrics(self) -> set[Metric]: + """ + Get the metrics defined in the semantic view. + """ + + @abstractmethod + def get_values( + self, + dimension: Dimension, + filters: set[Filter] | None = None, + ) -> SemanticResult: + """ + Return distinct values for a dimension. + """ + + @abstractmethod + def get_dataframe( + self, + metrics: list[Metric], + dimensions: list[Dimension], + filters: set[Filter] | None = None, + order: list[OrderTuple] | None = None, + limit: int | None = None, + offset: int | None = None, Review Comment: Should this method expect a `SemanticQuery` rather than its constituents? ########## superset-core/src/superset_core/semantic_layers/types.py: ########## @@ -0,0 +1,327 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from dataclasses import dataclass +from datetime import date, datetime, time, timedelta +from functools import total_ordering +from typing import Type as TypeOf + +import pyarrow as pa + +__all__ = [ + "BINARY", + "BOOLEAN", + "DATE", + "DATETIME", + "DECIMAL", + "Day", + "Dimension", + "Hour", + "INTEGER", + "INTERVAL", + "Minute", + "Month", + "NUMBER", + "OBJECT", + "Quarter", + "Second", + "STRING", + "TIME", + "Week", + "Year", +] + + +class Type: + """ + Base class for types. + """ + + +class INTEGER(Type): + """ + Represents an integer type. + """ + + +class NUMBER(Type): + """ + Represents a number type. + """ + + +class DECIMAL(Type): + """ + Represents a decimal type. + """ + + +class STRING(Type): + """ + Represents a string type. + """ + + +class BOOLEAN(Type): + """ + Represents a boolean type. + """ + + +class DATE(Type): + """ + Represents a date type. + """ + + +class TIME(Type): + """ + Represents a time type. + """ + + +class DATETIME(DATE, TIME): + """ + Represents a datetime type. + """ + + +class INTERVAL(Type): + """ + Represents an interval type. + """ + + +class OBJECT(Type): + """ + Represents an object type. + """ + + +class BINARY(Type): + """ + Represents a binary type. + """ + + +@dataclass(frozen=True) +@total_ordering +class Grain: Review Comment: We currently have many intermediate time grains, like 5 seconds, 10 seconds, 15 seconds etc. So at a minimum those would need to be supported, and they're missing here. I think a more scalable solution would be to support arbitrary ISO-8601 expressions. For example: - `PT1S` = one second - `PT12S` = 12 seconds - `P2W` = two weeks - `P3M` = a quarter - `P1H30M` - etc. I asked AI to generate an ISO-8601 duration regex pattern, and this came up: `r"^P(?=\d|T\d)(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:\.\d+)?)S)?)?$"`. Not super beautiful, but with a good parametrized test we could make sure it supports the currently supported time grains + any arbitrary intermediate grain (e.g. the `PT12S` case above) ########## superset-core/src/superset_core/semantic_layers/semantic_layer.py: ########## @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Generic, TypeVar + +from pydantic import BaseModel +from superset_core.semantic_layers.semantic_view import SemanticView + +ConfigT = TypeVar("ConfigT", bound=BaseModel) +SemanticViewT = TypeVar("SemanticViewT", bound="SemanticView") + + +class SemanticLayer(ABC, Generic[ConfigT, SemanticViewT]): + """ + Abstract base class for semantic layers. + """ + + @classmethod + @abstractmethod + def from_configuration( + cls, + configuration: dict[str, Any], + ) -> SemanticLayer[ConfigT, SemanticViewT]: + """ + Create a semantic layer from its configuration. + """ + + @classmethod + @abstractmethod + def get_configuration_schema( + cls, + configuration: ConfigT | None = None, + ) -> dict[str, Any]: + """ + Get the JSON schema for the configuration needed to add the semantic layer. + + A partial configuration `configuration` can be sent to improve the schema, + allowing for progressive validation and better UX. For example, a semantic + layer might require: + + - auth information + - a database + + If the user provides the auth information, a client can send the partial + configuration to this method, and the resulting JSON schema would include + the list of databases the user has access to, allowing a dropdown to be + populated. + + The Snowflake semantic layer has an example implementation of this method, where + database and schema names are populated based on the provided connection info. + """ + + @classmethod + @abstractmethod + def get_runtime_schema( + cls, + configuration: ConfigT, + runtime_data: dict[str, Any] | None = None, + ) -> dict[str, Any]: + """ + Get the JSON schema for the runtime parameters needed to load semantic views. + + This returns the schema needed to connect to a semantic view given the + configuration for the semantic layer. For example, a semantic layer might + be configured by: + + - auth information + - an optional database + + If the user does not provide a database when creating the semantic layer, the + runtime schema would require the database name to be provided before loading any + semantic views. This allows users to create semantic layers that connect to a + specific database (or project, account, etc.), or that allow users to select it + at query time. + + The Snowflake semantic layer has an example implementation of this method, where + database and schema names are required if they were not provided in the initial + configuration. + """ + + @abstractmethod + def get_semantic_views( + self, + runtime_configuration: dict[str, Any], + ) -> set[SemanticViewT]: + """ + Get the semantic views available in the semantic layer. + + The runtime configuration can provide information like a given project or + schema, used to restrict the semantic views returned. + """ + + @abstractmethod + def get_semantic_view( + self, + name: str, + additional_configuration: dict[str, Any], + ) -> SemanticViewT: + """ + Get a specific semantic view by its name and additional configuration. + """ Review Comment: same here ########## superset-core/src/superset_core/semantic_layers/semantic_view.py: ########## @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from abc import ABC, abstractmethod + +from superset_core.semantic_layers.types import ( + Dimension, + Filter, + GroupLimit, + Metric, + OrderTuple, + SemanticResult, +) + + +# TODO (betodealmeida): move to the extension JSON +class SemanticViewFeature(enum.Enum): + """ + Custom features supported by semantic layers. + """ + + ADHOC_EXPRESSIONS_IN_ORDERBY = "ADHOC_EXPRESSIONS_IN_ORDERBY" + GROUP_LIMIT = "GROUP_LIMIT" + GROUP_OTHERS = "GROUP_OTHERS" + + +class SemanticView(ABC): + """ + Abstract base class for semantic views. + """ + + features: frozenset[SemanticViewFeature] + + @abstractmethod + def uid(self) -> str: + """ + Returns a unique identifier for the semantic view. + """ + + @abstractmethod + def get_dimensions(self) -> set[Dimension]: + """ + Get the dimensions defined in the semantic view. + """ + + @abstractmethod + def get_metrics(self) -> set[Metric]: + """ + Get the metrics defined in the semantic view. + """ + + @abstractmethod + def get_values( + self, + dimension: Dimension, + filters: set[Filter] | None = None, + ) -> SemanticResult: + """ + Return distinct values for a dimension. + """ + + @abstractmethod + def get_dataframe( Review Comment: IIRC, we agreed on `get_table` to align with [Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html). ########## superset-core/src/superset_core/semantic_layers/types.py: ########## @@ -0,0 +1,327 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from dataclasses import dataclass +from datetime import date, datetime, time, timedelta +from functools import total_ordering +from typing import Type as TypeOf + +import pyarrow as pa + +__all__ = [ + "BINARY", + "BOOLEAN", + "DATE", + "DATETIME", + "DECIMAL", + "Day", + "Dimension", + "Hour", + "INTEGER", + "INTERVAL", + "Minute", + "Month", + "NUMBER", + "OBJECT", + "Quarter", + "Second", + "STRING", + "TIME", + "Week", + "Year", +] + + +class Type: + """ + Base class for types. + """ Review Comment: If we go all in on Apache Arrow, should we just start using their [types](https://arrow.apache.org/docs/python/api/datatypes.html) from day one, and not reinvent the wheel? For instance, this opens up support for TZ aware timestamps which has been a huge shortcoming with our current temporal formats.. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
