This is an automated email from the ASF dual-hosted git repository.
kaxilnaik pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 41c62b95cc6 Refactor registry workflow and metadata configuration constants. (#63024)
41c62b95cc6 is described below
commit 41c62b95cc6a4973a026b076bfe26d5eeb03b244
Author: Kaxil Naik <[email protected]>
AuthorDate: Sat Mar 7 12:24:57 2026 +0000
Refactor registry workflow and metadata configuration constants. (#63024)
---
.github/workflows/registry-build.yml | 54 +++++++++++++++++++++++-------------
dev/registry/extract_metadata.py | 31 +++++++++++++++------
2 files changed, 57 insertions(+), 28 deletions(-)
diff --git a/.github/workflows/registry-build.yml b/.github/workflows/registry-build.yml
index 27a1713629c..f3e0e8bc7f2 100644
--- a/.github/workflows/registry-build.yml
+++ b/.github/workflows/registry-build.yml
@@ -59,6 +59,15 @@ jobs:
timeout-minutes: 30
name: "Build & Publish Registry"
runs-on: ubuntu-latest
+ env:
+ EXISTING_REGISTRY_DIR: /tmp/existing-registry
+ REGISTRY_DATA_DIR: dev/registry
+ REGISTRY_PROVIDERS_JSON: providers.json
+ REGISTRY_MODULES_JSON: modules.json
+ REGISTRY_SITE_DATA_DIR: registry/src/_data
+ REGISTRY_SITE_VERSIONS_DIR: registry/src/_data/versions
+ REGISTRY_SITE_LOGOS_DIR: registry/public/logos
+ REGISTRY_CACHE_CONTROL: public, max-age=300
permissions:
contents: read
id-token: write
@@ -142,12 +151,15 @@ jobs:
env:
S3_BUCKET: ${{ steps.destination.outputs.bucket }}
run: |
- mkdir -p /tmp/existing-registry
- PROVIDERS_URL="${S3_BUCKET}api/providers.json"
- MODULES_URL="${S3_BUCKET}api/modules.json"
- if aws s3 cp "${PROVIDERS_URL}" /tmp/existing-registry/providers.json 2>/dev/null; then
+ mkdir -p "${EXISTING_REGISTRY_DIR}"
+ PROVIDERS_URL="${S3_BUCKET}api/${REGISTRY_PROVIDERS_JSON}"
+ MODULES_URL="${S3_BUCKET}api/${REGISTRY_MODULES_JSON}"
+ if aws s3 cp \
+ "${PROVIDERS_URL}" \
+ "${EXISTING_REGISTRY_DIR}/${REGISTRY_PROVIDERS_JSON}" \
+ 2>/dev/null; then
echo "found=true" >> "${GITHUB_OUTPUT}"
- aws s3 cp "${MODULES_URL}" /tmp/existing-registry/modules.json || true
+ aws s3 cp "${MODULES_URL}" "${EXISTING_REGISTRY_DIR}/${REGISTRY_MODULES_JSON}" || true
else
echo "found=false" >> "${GITHUB_OUTPUT}"
echo "No existing registry data on S3"
@@ -169,24 +181,28 @@ jobs:
if: inputs.provider != '' && steps.download-existing.outputs.found == 'true'
run: |
uv run dev/registry/merge_registry_data.py \
- --existing-providers /tmp/existing-registry/providers.json \
- --existing-modules /tmp/existing-registry/modules.json \
- --new-providers dev/registry/providers.json \
- --new-modules dev/registry/modules.json \
- --output dev/registry/
+ --existing-providers "${EXISTING_REGISTRY_DIR}/${REGISTRY_PROVIDERS_JSON}" \
+ --existing-modules "${EXISTING_REGISTRY_DIR}/${REGISTRY_MODULES_JSON}" \
+ --new-providers "${REGISTRY_DATA_DIR}/${REGISTRY_PROVIDERS_JSON}" \
+ --new-modules "${REGISTRY_DATA_DIR}/${REGISTRY_MODULES_JSON}" \
+ --output "${REGISTRY_DATA_DIR}/"
- name: "Copy breeze output to registry data"
run: |
- mkdir -p registry/src/_data/versions
- cp dev/registry/providers.json registry/src/_data/providers.json
- cp dev/registry/modules.json registry/src/_data/modules.json
- if [ -d dev/registry/output/versions ]; then
- cp -r dev/registry/output/versions/* registry/src/_data/versions/
+ mkdir -p "${REGISTRY_SITE_VERSIONS_DIR}"
+ cp \
+ "${REGISTRY_DATA_DIR}/${REGISTRY_PROVIDERS_JSON}" \
+ "${REGISTRY_SITE_DATA_DIR}/${REGISTRY_PROVIDERS_JSON}"
+ cp \
+ "${REGISTRY_DATA_DIR}/${REGISTRY_MODULES_JSON}" \
+ "${REGISTRY_SITE_DATA_DIR}/${REGISTRY_MODULES_JSON}"
+ if [ -d "${REGISTRY_DATA_DIR}/output/versions" ]; then
+ cp -r "${REGISTRY_DATA_DIR}/output/versions/"* "${REGISTRY_SITE_VERSIONS_DIR}/"
fi
# Copy provider logos extracted from providers/*/docs/integration-logos/
- if [ -d dev/registry/logos ]; then
- mkdir -p registry/public/logos
- cp -r dev/registry/logos/* registry/public/logos/
+ if [ -d "${REGISTRY_DATA_DIR}/logos" ]; then
+ mkdir -p "${REGISTRY_SITE_LOGOS_DIR}"
+ cp -r "${REGISTRY_DATA_DIR}/logos/"* "${REGISTRY_SITE_LOGOS_DIR}/"
fi
- name: "Setup pnpm"
@@ -224,7 +240,7 @@ jobs:
S3_BUCKET: ${{ steps.destination.outputs.bucket }}
run: |
aws s3 sync registry/_site/ "${S3_BUCKET}" \
- --cache-control "public, max-age=300"
+ --cache-control "${REGISTRY_CACHE_CONTROL}"
- name: "Publish version metadata"
env:
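
The workflow change above is mechanical: literal paths and values that were repeated across run: steps now live in job-level env: entries (EXISTING_REGISTRY_DIR, REGISTRY_DATA_DIR, and friends), and each step expands them as ordinary shell variables. As a minimal sketch, assuming the hypothetical helper below (not part of this commit), a Python script in the same job could pick up the identical configuration from the environment, with the workflow's own values as local defaults:

    import os
    from pathlib import Path

    # Hypothetical helper, not part of this commit: read the job-level env
    # vars declared in registry-build.yml, falling back to the workflow's
    # values so the script also runs outside GitHub Actions.
    def registry_config() -> dict[str, Path]:
        data_dir = Path(os.environ.get("REGISTRY_DATA_DIR", "dev/registry"))
        return {
            "existing_dir": Path(os.environ.get("EXISTING_REGISTRY_DIR", "/tmp/existing-registry")),
            "providers_json": data_dir / os.environ.get("REGISTRY_PROVIDERS_JSON", "providers.json"),
            "modules_json": data_dir / os.environ.get("REGISTRY_MODULES_JSON", "modules.json"),
            "site_data_dir": Path(os.environ.get("REGISTRY_SITE_DATA_DIR", "registry/src/_data")),
        }

    if __name__ == "__main__":
        for key, path in registry_config().items():
            print(f"{key}: {path}")
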
diff --git a/dev/registry/extract_metadata.py b/dev/registry/extract_metadata.py
index 664e67f54e4..c1d2043d24d 100644
--- a/dev/registry/extract_metadata.py
+++ b/dev/registry/extract_metadata.py
@@ -44,11 +44,24 @@ from typing import Any
import tomllib
import yaml
+# External endpoints used by metadata extraction.
+PYPISTATS_RECENT_URL = "https://pypistats.org/api/packages/{package_name}/recent"
+PYPI_PACKAGE_JSON_URL = "https://pypi.org/pypi/{package_name}/json"
+S3_DOC_URL = "http://apache-airflow-docs.s3-website.eu-central-1.amazonaws.com"
+AIRFLOW_PROVIDER_DOCS_URL = "https://airflow.apache.org/docs/{package_name}/stable/"
+AIRFLOW_PROVIDER_CONNECTIONS_URL = (
+ "https://airflow.apache.org/docs/{package_name}/stable/connections/index.html"
+)
+AIRFLOW_PROVIDER_SOURCE_URL = (
+ "https://github.com/apache/airflow/tree/providers-{provider_id}/{version}/providers/{provider_path}"
+)
+PYPI_PACKAGE_URL = "https://pypi.org/project/{package_name}/"
+
def fetch_pypi_downloads(package_name: str) -> dict[str, int]:
"""Fetch download statistics from pypistats.org API."""
try:
- url = f"https://pypistats.org/api/packages/{package_name}/recent"
+ url = PYPISTATS_RECENT_URL.format(package_name=package_name)
with urllib.request.urlopen(url, timeout=5) as response:
data = json.loads(response.read().decode())
return {
@@ -64,7 +77,7 @@ def fetch_pypi_downloads(package_name: str) -> dict[str, int]:
def fetch_pypi_dates(package_name: str) -> dict[str, str]:
"""Fetch first release and latest release dates from PyPI JSON API."""
try:
- url = f"https://pypi.org/pypi/{package_name}/json"
+ url = PYPI_PACKAGE_JSON_URL.format(package_name=package_name)
with urllib.request.urlopen(url, timeout=10) as response:
data = json.loads(response.read().decode())
@@ -109,7 +122,6 @@ def read_inventory(inv_path: Path) -> dict[str, str]:
return result
-S3_DOC_URL = "http://apache-airflow-docs.s3-website.eu-central-1.amazonaws.com"
INVENTORY_CACHE_DIR = Path(__file__).parent / ".inventory_cache"
INVENTORY_TTL = datetime.timedelta(hours=12)
@@ -520,9 +532,7 @@ def main():
# Extract connection types from provider.yaml
# Link to the connections index page since individual connection pages might not exist
connection_types = []
- connections_index_url = (
- f"https://airflow.apache.org/docs/{package_name}/stable/connections/index.html"
- )
+ connections_index_url = AIRFLOW_PROVIDER_CONNECTIONS_URL.format(package_name=package_name)
for conn in provider_yaml.get("connection-types", []):
conn_type = conn.get("connection-type", "")
hook_class = conn.get("hook-class-name", "")
@@ -546,6 +556,7 @@ def main():
# Airflow version compatibility (from pyproject.toml dependencies)
airflow_versions = determine_airflow_versions(pyproject_data["dependencies"])
+ provider_source_path = provider_path.relative_to(PROVIDERS_DIR).as_posix()
provider = Provider(
id=provider_id,
name=name,
@@ -563,9 +574,11 @@ def main():
requires_python=pyproject_data["requires_python"],
dependencies=pyproject_data["dependencies"],
optional_extras=pyproject_data.get("optional_extras", {}),
- docs_url=f"https://airflow.apache.org/docs/{package_name}/stable/",
- source_url=f"https://github.com/apache/airflow/tree/providers-{provider_id}/{version}/providers/{provider_path.relative_to(PROVIDERS_DIR)}",
- pypi_url=f"https://pypi.org/project/{package_name}/",
+ docs_url=AIRFLOW_PROVIDER_DOCS_URL.format(package_name=package_name),
+ source_url=AIRFLOW_PROVIDER_SOURCE_URL.format(
+ provider_id=provider_id, version=version, provider_path=provider_source_path
+ ),
+ pypi_url=PYPI_PACKAGE_URL.format(package_name=package_name),
first_released=pypi_dates["first_released"],
last_updated=pypi_dates["last_updated"],
)
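
The Python side applies the same idea: endpoint URLs that were built inline with f-strings become module-level templates with named placeholders, expanded via str.format at each call site, so every external URL the script touches is declared once near the top of the file. A minimal sketch of the pattern, reusing the PYPI_PACKAGE_URL constant from the diff above (the helper and the example package name are illustrative only):

    # PYPI_PACKAGE_URL is the constant added in the diff above; the helper
    # and the example package name are illustrative only.
    PYPI_PACKAGE_URL = "https://pypi.org/project/{package_name}/"

    def pypi_url(package_name: str) -> str:
        # str.format with a named placeholder mirrors the f-string it
        # replaced, but the template can be reused and tested in isolation.
        return PYPI_PACKAGE_URL.format(package_name=package_name)

    assert pypi_url("apache-airflow-providers-http") == "https://pypi.org/project/apache-airflow-providers-http/"
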