This is an automated email from the ASF dual-hosted git repository. beto pushed a commit to branch engine-manager in repository https://gitbox.apache.org/repos/asf/superset.git
commit 688224c4c0c27d0b6b2b5eb23b8507f91ae5d915 Author: Beto Dealmeida <[email protected]> AuthorDate: Wed Dec 3 20:23:18 2025 -0500 Simplify key generation --- superset/engines/manager.py | 112 +++++++++++--------------------------------- 1 file changed, 28 insertions(+), 84 deletions(-) diff --git a/superset/engines/manager.py b/superset/engines/manager.py index 1d5d0efec65..452882108aa 100644 --- a/superset/engines/manager.py +++ b/superset/engines/manager.py @@ -17,7 +17,6 @@ import enum import hashlib -import json import logging import threading from contextlib import contextmanager @@ -110,62 +109,22 @@ EngineKey = str TunnelKey = str -def _normalize_value(value: Any) -> str: +def _generate_cache_key(*args: Any) -> str: """ - Normalize a value for consistent hashing. + Generate a deterministic cache key from arbitrary arguments. - Converts various types to a consistent string representation for hashing. - Handles special cases like bytes, class objects, and nested structures. + Uses repr() for serialization and SHA-256 for hashing. The resulting key + is a 32-character hex string that: + 1. Is deterministic for the same inputs + 2. Does not expose sensitive data (everything is hashed) + 3. Has sufficient entropy to avoid collisions - :param value: The value to normalize - :returns: String representation suitable for hashing + :param args: Arguments to include in the cache key + :returns: 32-character hex string """ - if isinstance(value, bytes): - # For binary data (like private keys), hash it to avoid encoding issues - return hashlib.sha256(value).hexdigest()[:16] - elif isinstance(value, type): - # For class objects (like pool classes), use the class name - return value.__name__ - elif isinstance(value, dict): - # For nested dicts, recursively normalize - normalized_dict = {} - for k, v in sorted(value.items()): - normalized_dict[k] = _normalize_value(v) - return json.dumps(normalized_dict, sort_keys=True, separators=(",", ":")) - elif isinstance(value, (list, tuple)): - # For lists/tuples, normalize each item - normalized_list = [_normalize_value(item) for item in value] - return json.dumps(normalized_list, separators=(",", ":")) - else: - # For everything else, convert to string - return str(value) - - -def _generate_secure_key(components: dict[str, Any]) -> str: - """ - Generate a secure hash-based key from components. - - Creates a SHA-256 hash of the components to ensure: - 1. The key includes all parameters for proper caching - 2. Sensitive data is not exposed in logs or errors - 3. The key is deterministic for the same inputs - - :param components: Dictionary of components to hash - :returns: 32-character hex string representing the secure key - """ - # Create deterministic string representation - # Sort keys for consistency - key_data = { - k: _normalize_value(v) if v is not None else "" - for k, v in sorted(components.items()) - } - - # Create compact JSON representation - key_string = json.dumps(key_data, sort_keys=True, separators=(",", ":")) - - # Generate SHA-256 hash and return first 32 hex characters - # 32 characters = 128 bits of entropy, sufficient for collision resistance - return hashlib.sha256(key_string.encode("utf-8")).hexdigest()[:32] + # Use repr() which works with most Python objects and is deterministic + serialized = repr(args).encode("utf-8") + return hashlib.sha256(serialized).hexdigest()[:32] class EngineModes(enum.Enum): @@ -314,15 +273,13 @@ class EngineManager: user_id: int | None, ) -> EngineKey: """ - Generate a secure hash-based key for the engine. + Generate a cache key for the engine. - The key includes all parameters (including OAuth tokens and other sensitive - data) to ensure proper cache isolation, but uses a one-way hash to prevent - credential exposure in logs or errors. + The key is a hash of all parameters that affect the engine, ensuring + proper cache isolation without exposing sensitive data. - :returns: 32-character hex string representing the secure key + :returns: 32-character hex string """ - # Get all parameters that affect the engine uri, kwargs = self._get_engine_args( database, catalog, @@ -331,20 +288,15 @@ class EngineManager: user_id, ) - # Create components for the key - # Include all parameters to ensure proper cache isolation - key_components = { - "database_id": database.id, - "catalog": catalog, - "schema": schema, - "uri": str(uri), # SQLAlchemy URLs mask passwords - "source": str(source) if source else None, - "user_id": user_id, - "kwargs": kwargs, # Includes OAuth tokens and other sensitive params - } - - # Generate secure hash-based key - return _generate_secure_key(key_components) + return _generate_cache_key( + database.id, + catalog, + schema, + str(uri), + source, + user_id, + kwargs, + ) def _get_engine_args( self, @@ -533,20 +485,12 @@ class EngineManager: def _get_tunnel_key(self, ssh_tunnel: "SSHTunnel", uri: URL) -> TunnelKey: """ - Generate a secure hash-based key for the SSH tunnel. - - The key includes all tunnel parameters (including passwords and private keys) - to ensure proper cache isolation, but uses a one-way hash to prevent - credential exposure in logs or errors. + Generate a cache key for the SSH tunnel. - :returns: 32-character hex string representing the secure key + :returns: 32-character hex string """ - # Get all tunnel parameters tunnel_kwargs = self._get_tunnel_kwargs(ssh_tunnel, uri) - - # Generate secure hash-based key - # The tunnel_kwargs may contain sensitive data like passwords and private keys - return _generate_secure_key(tunnel_kwargs) + return _generate_cache_key(tunnel_kwargs) def _create_tunnel(self, ssh_tunnel: "SSHTunnel", uri: URL) -> SSHTunnelForwarder: kwargs = self._get_tunnel_kwargs(ssh_tunnel, uri)
