commit:     aef17b0f44b11964484d26b6a5b07714662038a2
Author:     Matt Jolly <kangie <AT> gentoo <DOT> org>
AuthorDate: Thu May  1 05:44:09 2025 +0000
Commit:     Matt Jolly <kangie <AT> gentoo <DOT> org>
CommitDate: Thu May  1 07:48:14 2025 +0000
URL:        
https://gitweb.gentoo.org/proj/chromium-tools.git/commit/?id=aef17b0f

get-chromium-licences.py: new script

This script uses the processed SPDX-ish information from the
`Chromium Licences` project (and some built-in mapping info)
to identify the appropriate Gentoo `LICENSE` values for Chromium.

Signed-off-by: Matt Jolly <kangie <AT> gentoo.org>

 chromium_licence_mappings.yaml |  45 ++++++
 get-chromium-licences.py       | 354 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 399 insertions(+)

diff --git a/chromium_licence_mappings.yaml b/chromium_licence_mappings.yaml
new file mode 100644
index 0000000..b1f5b2b
--- /dev/null
+++ b/chromium_licence_mappings.yaml
@@ -0,0 +1,45 @@
+# Anything that isn't really an SPDX licence (or that we need to map to a Gentoo licence directly)
+remediation_mapping:
+    'Apache 2.0': 'Apache-2.0'
+    'Apache-with-LLVM-Exception': 'Apache-2.0-with-LLVM-exceptions'
+    'blessing': 'Public domain'
+    'BSD 2-clause': 'BSD-2'
+    'BSD-3-Clause-Clear': 'Clear-BSD'
+    'BSD-3': 'BSD'
+    'BSD-Source-Code': 'BSD-2' # False positive neon_2_sse is BSD-2
+    'Caffe': 'Apache-2.0' # False positive? tflite is Apache-2.0 only
+    'compatible licenses': 'Apache-2.0' # "Apache 2.0 and compatible licenses"; swiftshader
+    'Custome license': 'FFT2D' # Bad match; another of Takuya Ooura's `fft` codes
+    'HPND-sell-variant': 'BSD' # false positive fontconfig (BSD-3)
+    'JsonCPP': 'Public domain' # or MIT where public domain is not allowed; already covered so..
+    'LGPL v2.1': 'LGPL-2.1'
+    'LGPL v2': 'LGPL-2'
+    'Libpng-2.0': 'libpng2'
+    'LLVM Release License': 'UoI-NCSA'
+    'LZMA-SDK-9.22': 'Public domain'
+    'MIT-Modern-Variant': 'MIT'
+    'Opus-Patent-BSD-3-Clause': 'BSD'
+    'pffft': 'BSD' # Just the BSD 3 clause licence
+    'public-domain-md5': 'Public domain'
+    'SPL-SQRT-FLOOR': 'Public domain'
+    'Subzero Release License': 'UoI-NCSA'
+    'SunPro': 'SunSoft'
+    'UnRAR': 'unRAR'
+    'X11': 'MIT'
+
+# These can be verified directly in the chromium sources;
+# the chromium-licenses repo is just easier since (e.g.) v8 is a submodule.
+custom_licences:
+    'SPDXRef-Package-g711': 'Public domain' # https://github.com/TeamDev-IP/Chromium-Licences/blob/master/chromium-licenses/third_party/webrtc/modules/third_party/g711/LICENSE
+    'SPDXRef-Package-g722': 'Public domain' # https://github.com/TeamDev-IP/Chromium-Licences/blob/master/chromium-licenses/third_party/webrtc/modules/third_party/g722/LICENSE
+    'SPDXRef-Package-fft': 'MIT'            # https://github.com/TeamDev-IP/Chromium-Licences/blob/master/chromium-licenses/third_party/webrtc/modules/third_party/fft/LICENSE
+    'SPDXRef-Package-base64': 'Base64'      # https://github.com/TeamDev-IP/Chromium-Licences/blob/master/chromium-licenses/third_party/webrtc/rtc_base/third_party/base64/LICENSE
+    'SPDXRef-Package-v8-codegen': 'MIT'     # https://github.com/TeamDev-IP/Chromium-Licences/blob/master/chromium-licenses/v8/third_party/v8/codegen/LICENSE
+    'SPDXRef-Package-v8-builtins': 'PSF-2'  # https://github.com/TeamDev-IP/Chromium-Licences/blob/master/chromium-licenses/v8/third_party/v8/builtins/LICENSE
+
+# Ignore these "licenses"
+ignore_list:
+  - 'Ignorable'
+  - 'Patent'
+  - 'Public domain'
+  - 'Custom license'

diff --git a/get-chromium-licences.py b/get-chromium-licences.py
new file mode 100755
index 0000000..7166401
--- /dev/null
+++ b/get-chromium-licences.py
@@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+
+# This script uses the processed SPDX-ish information from the
+# `Chromium Licences` project (and some built-in mapping info)
+# to identify the appropriate Gentoo `LICENSE` values for Chromium.
+
+# https://github.com/TeamDev-IP/Chromium-Licenses/blob/v135.0.7049.96/chromium-licenses.spdx.json
+
+import argparse
+import logging
+import os
+import re
+import requests
+import structlog
+import sys
+import yaml
+
+from typing import List, Optional # Optional needed if input can be None
+
+# Module-wide structured logger; its level filter is installed by setup_logging().
+logger = structlog.get_logger()
+
+# --- Constants ---
+# URL of the SPDX JSON document; `{version}` is filled in with the Chromium version.
+BASE_SPDX_URL_TEMPLATE = "https://raw.githubusercontent.com/TeamDev-IP/Chromium-Licences/refs/tags/v{version}/chromium-licenses.spdx.json"
+# Two to four dot-separated numeric components (e.g. 135.0.7049.96).
+VERSION_REGEX = r"^\d+\.\d+(?:\.\d+(?:\.\d+)?)?$"
+# Paths below are relative to the root of the Gentoo repository.
+GENTOO_MAPPING_FILE_RELPATH = 'metadata/license-mapping.conf'
+GENTOO_LICENSES_DIR_RELPATH = 'licenses'
+
+# --- Functions ---
+def fetch_spdx_data(version: str) -> Optional[dict]:
+    """
+    Download the Chromium SPDX licence document for a given version.
+
+    Args:
+        version: Chromium version string substituted into
+            BASE_SPDX_URL_TEMPLATE.
+
+    Returns:
+        The decoded JSON document, or None if the request failed or the
+        response body could not be decoded as JSON. Failures are logged
+        rather than raised.
+    """
+    fetchuri = BASE_SPDX_URL_TEMPLATE.format(version=version)
+    logger.info(f"Attempting to fetch SPDX data from: {fetchuri}")
+    try:
+        response = requests.get(fetchuri, timeout=10)
+        response.raise_for_status()  # Raise HTTPError for 4xx/5xx status codes
+        logger.info(f"Successfully fetched data, status code: {response.status_code}")
+        return response.json()
+    except requests.exceptions.Timeout as e:
+        logger.error(f"Request timed out while fetching SPDX data for version {version}: {e}")
+        return None
+    except requests.exceptions.HTTPError as e:
+        if e.response.status_code == 404:
+            logger.error(f"SPDX data not found for version {version} (404). URL: {fetchuri}")
+        else:
+            logger.error(f"HTTP error occurred fetching SPDX data for version {version}: {e}")
+        return None
+    except requests.exceptions.ConnectionError as e:
+        logger.error(f"Connection error while fetching SPDX data for version {version}: {e}")
+        return None
+    except requests.exceptions.JSONDecodeError as e:
+        # Must come before RequestException: JSONDecodeError is one of its
+        # subclasses, so the generic handler would otherwise shadow this one.
+        logger.error(f"Failed to decode JSON response for version {version}: {e}")
+        return None
+    except requests.exceptions.RequestException as e:  # Catch other requests-related errors
+        logger.error(f"An unexpected error occurred during request for version {version}: {e}")
+        return None
+
+
+def load_external_mappings(config_path="chromium_licence_mappings.yaml"):
+    """
+    Load the remediation, custom-licence and ignore settings from a YAML file.
+
+    Args:
+        config_path: Path to the YAML mapping file.
+
+    Returns:
+        A tuple (remediation_mapping, custom_licences, ignore_list) of
+        dict, dict and set. A section that is missing or empty in the file
+        comes back empty. If the file cannot be read or parsed, the error
+        is logged and all three are empty.
+    """
+    try:
+        with open(config_path, 'r') as f:
+            # safe_load() returns None for an empty document
+            mappings = yaml.safe_load(f) or {}
+        return (
+            mappings.get('remediation_mapping') or {},
+            mappings.get('custom_licences') or {},
+            # `or []` keeps a missing/null section from reaching set(None)
+            set(mappings.get('ignore_list') or []),
+        )
+    except FileNotFoundError:
+        logger.error(f"Mapping configuration file not found: {config_path}")
+        return {}, {}, set()
+    except yaml.YAMLError as e:
+        logger.error(f"Error parsing mapping configuration file {config_path}: {e}")
+        return {}, {}, set()
+    except Exception as e:
+        logger.exception(f"Unexpected error loading mapping config {config_path}: {e}")
+        return {}, {}, set()
+
+
+def load_gentoo_mappings(mapping_file_path: str) -> dict:
+    """
+    Read the `[spdx-to-ebuild]` section of a Gentoo licence mapping file.
+
+    Blank lines and lines starting with `#` are skipped. Within the
+    `spdx-to-ebuild` section every `key = value` line becomes one entry.
+
+    Args:
+        mapping_file_path: Path to the mapping file.
+
+    Returns:
+        A dict of SPDX identifier -> Gentoo licence name, or an empty dict
+        if the file is missing or cannot be read (the error is logged).
+    """
+    logger.info(f"Loading Gentoo mappings from: {mapping_file_path}")
+    spdx_to_gentoo = {}
+    try:
+        if not os.path.exists(mapping_file_path):
+            logger.error(f"Gentoo mapping file not found: {mapping_file_path}")
+            return {}  # Return empty dict on failure
+
+        section = None
+        with open(mapping_file_path, 'r') as handle:
+            for raw in handle:
+                entry = raw.strip()
+                if entry == '' or entry.startswith('#'):
+                    continue
+                if entry.startswith('[') and entry.endswith(']'):
+                    # Section header: remember the name without the brackets
+                    section = entry[1:-1]
+                    continue
+                if section != 'spdx-to-ebuild' or '=' not in entry:
+                    continue
+                # Split on the first '=' only
+                key, _, value = entry.partition('=')
+                spdx_to_gentoo[key.strip()] = value.strip()
+
+        logger.info(f"Successfully loaded {len(spdx_to_gentoo)} mappings.")
+        return spdx_to_gentoo
+
+    except FileNotFoundError:
+        logger.error(f"Gentoo mapping file not found: {mapping_file_path}")
+        return {}
+    except PermissionError:
+        logger.error(f"Permission denied reading mapping file: {mapping_file_path}")
+        return {}
+    except OSError as e:
+        logger.error(f"OS error reading mapping file {mapping_file_path}: {e}")
+        return {}
+    except Exception as e:
+        logger.exception(f"Unexpected error loading Gentoo mappings from {mapping_file_path}: {e}")
+        return {}
+
+
+
+def process_spdx_data(spdx_data: dict) -> tuple[set, dict, dict]:
+    """
+    Extract licence names and package details from a Chromium SPDX document.
+
+    Licence names are collected from the 'hasExtractedLicensingInfos'
+    section and from each entry in 'packages'. Names that combine several
+    licences with 'and' or commas are split into individual names. Either
+    section may be absent; the other one is still processed.
+
+    Args:
+        spdx_data: The decoded SPDX JSON document.
+
+    Returns:
+        A tuple containing three elements:
+        - A set of all unique licence names found in the SPDX data
+        - A dictionary mapping licence IDs to licence names
+        - A dictionary mapping package SPDXIDs to dictionaries containing
+          package information (name, downloadLocation, externalRefs, licence)
+
+    Note:
+        Details of the extraction are logged at debug level, and a summary
+        of the licences found is logged at info level.
+    """
+    found_licences = set()
+    licence_mapping = {}
+    found_packages = {}
+
+    for licence_info in spdx_data.get('hasExtractedLicensingInfos') or []:
+        if 'licenseId' not in licence_info or 'name' not in licence_info:
+            continue
+        name = licence_info['name']
+        licence_mapping[licence_info['licenseId']] = name
+        # Cheap substring pre-check; split_licence_string() applies the real
+        # delimiter rules, so a name like "Standard" comes back unsplit.
+        if 'and' in name or ',' in name:
+            logger.debug(f"Splitting licence string: {name}")
+            found_licences.update(split_licence_string(name) or [])
+        else:
+            logger.debug(f"Adding licence: {name}")
+            found_licences.add(name)
+
+    logger.debug(f"Extracted {len(licence_mapping)} licence ID mappings:")
+    for licence_id, name in licence_mapping.items():
+        logger.debug(f"  {licence_id} -> {name}")
+
+    # We probably don't get any new licences here, but we should still parse
+    # packages; it may be useful down the line
+    for package in spdx_data.get('packages') or []:
+        pkglicence = package.get('licenseConcluded', package.get('licenseInfoFromFiles', 'UNKNOWN'))
+        # pkglicence might be a single string or an array
+        if isinstance(pkglicence, list):
+            found_licences.update(pkglicence)
+        elif isinstance(pkglicence, str) and ('and' in pkglicence or ',' in pkglicence):
+            # 'foo and bar and baz' or 'foo, bar, baz': split and add each one
+            pkglicence = split_licence_string(pkglicence) or []
+            found_licences.update(pkglicence)
+        elif pkglicence:
+            found_licences.add(pkglicence)
+            logger.debug(f"Found licence: {pkglicence}")
+
+        # externalRefs is optional, so read it once with a default and reuse it
+        external_refs = package.get('externalRefs', [])
+
+        # Store package information
+        found_packages[package['SPDXID']] = {
+            'name': package['name'],
+            'downloadLocation': package['downloadLocation'],
+            'externalRefs': external_refs,
+            'licence': pkglicence,
+        }
+
+        logger.debug(f"Package ID: {package['SPDXID']}")
+        logger.debug(f"Package Name: {package['name']}")
+        logger.debug(f"URI: {package['downloadLocation']}")
+        logger.debug(f"License Info: {external_refs}")
+        if 'licenseConcluded' in package:
+            logger.debug(f"License: {package['licenseConcluded']}")
+        if 'licenseInfoFromFiles' in package:
+            logger.debug(f"License Info From Files: {package['licenseInfoFromFiles']}")
+
+    logger.info("Licences:")
+    for licence in found_licences:
+        logger.info(f"- {licence}")
+
+    # Returned unconditionally so callers can always unpack three values
+    return found_licences, licence_mapping, found_packages
+
+
+def setup_logging(args):
+    """
+    Set up stdlib logging and structlog filtering from the parsed CLI flags.
+
+    Args:
+        args: Parsed arguments; the `verbose` and `debug` flags are read.
+    """
+    # --debug takes precedence over --verbose; the default threshold is WARNING
+    if args.debug:
+        log_level = logging.DEBUG
+    elif args.verbose:
+        log_level = logging.INFO
+    else:
+        log_level = logging.WARNING
+
+    # Standard logging (used by libraries) writes bare messages to stderr
+    logging.basicConfig(format="%(message)s", level=log_level, stream=sys.stderr)
+
+    # Keep the HTTP libraries quiet unless we are debugging
+    if log_level > logging.DEBUG:
+        for noisy in ("requests", "urllib3"):
+            logging.getLogger(noisy).setLevel(logging.WARNING)
+
+    # structlog filters at the same threshold
+    level_filter = structlog.make_filtering_bound_logger(log_level)
+    structlog.configure(wrapper_class=level_filter)
+
+
+def split_licence_string(licence_str: Optional[str]) -> List[str]:
+    """
+    Splits a licence string potentially containing ' and ' or ', ' delimiters.
+
+    Args:
+        licence_str: The licence string to split, or None.
+
+    Returns:
+        A list of individual licence strings, or an empty list if
+        input is None or empty after splitting.
+    """
+    if not licence_str:
+        # Always hand back a list so callers can iterate without a None check
+        return []
+
+    # 'and' only counts as a delimiter when surrounded by whitespace, so
+    # names that merely contain those letters (e.g. "Standard") stay intact.
+    parts = re.split(r'\s+and\s+|\s*,\s*', licence_str)
+    return [part.strip() for part in parts if part and part.strip()]
+
+
+def parse_arguments():
+    """
+    Parse the command line.
+
+    Returns:
+        The argparse namespace with `verbose`, `debug`, `gentoo_repo`
+        and the positional `version`.
+    """
+    cli = argparse.ArgumentParser(description="Script to fetch and display Chromium licencing information.")
+
+    # The two boolean switches share the same shape
+    switches = (
+        ('-v', '--verbose', 'Enable verbose output'),
+        ('-d', '--debug', 'Enable debug output'),
+    )
+    for short_flag, long_flag, help_text in switches:
+        cli.add_argument(short_flag, long_flag, action='store_true', help=help_text)
+
+    cli.add_argument('-g', '--gentoo-repo', default='/var/db/repos/gentoo', help='Path to the Gentoo repository')
+    cli.add_argument('version', help='Chromium version to fetch licences for')
+    return cli.parse_args()
+
+
+def main():
+    """
+    Fetch the SPDX data for a Chromium version and print the Gentoo licences.
+
+    Each licence found in the SPDX data is resolved in this order: a file of
+    the same name in the Gentoo `licenses` directory, the `spdx-to-ebuild`
+    mapping, then the remediation mapping from the YAML config. Entries on
+    the ignore list are dropped; anything left over is reported as unmatched.
+
+    Exits with status 1 if the version is malformed, the SPDX data cannot be
+    fetched or processed, or the Gentoo licences directory is missing.
+    """
+    args = parse_arguments()
+    setup_logging(args)
+
+    gentoo_repo = args.gentoo_repo
+
+    if args.version:
+        logger.info(f"Fetching licences for Chromium version: {args.version}")
+        version = args.version
+    else:
+        print("No version specified. Please provide a Chromium version:")
+        version = input("Version: ")
+
+    if not re.match(VERSION_REGEX, version):
+        logger.error("Invalid version format. Please enter a version like X.Y.Z.W (e.g., 123.0.4567.890)")
+        sys.exit(1)
+
+    chromium_spdx_data = fetch_spdx_data(version)
+    if chromium_spdx_data is None:
+        # fetch_spdx_data() has already logged the specific failure
+        logger.error(f"Could not retrieve SPDX data for Chromium version {version}.")
+        sys.exit(1)
+
+    logger.debug("Available keys in the data:")
+    for key in chromium_spdx_data:
+        logger.debug(f"- {key}")
+
+    processed = process_spdx_data(chromium_spdx_data)
+    if processed is None:
+        logger.error("No licence information could be extracted from the SPDX data.")
+        sys.exit(1)
+    found_licences, chromium_spdx_licence_mapping, _found_packages = processed
+    # Work on our own set: entries are added below
+    found_licences = set(found_licences)
+
+    # Now to match with Gentoo licences!
+    logger.info("Matching with Gentoo licences...")
+
+    # SPDX -> Gentoo mapping
+    gentoo_licence_mapping = load_gentoo_mappings(os.path.join(gentoo_repo, GENTOO_MAPPING_FILE_RELPATH))
+    gentoo_licence_dir = os.path.join(gentoo_repo, GENTOO_LICENSES_DIR_RELPATH)
+
+    # isdir rather than exists: os.listdir() below needs a directory
+    if not os.path.isdir(gentoo_licence_dir):
+        logger.error(f"Gentoo licences directory not found at {gentoo_licence_dir}. Please check the path.")
+        sys.exit(1)
+
+    # Every file in the directory is a licence name. Sub-directories are
+    # skipped; os.path.isfile() follows symlinks, so links to files count.
+    # A set gives constant-time membership tests in the loop below.
+    gentoo_licences = {
+        filename
+        for filename in os.listdir(gentoo_licence_dir)
+        if os.path.isfile(os.path.join(gentoo_licence_dir, filename))
+    }
+
+    logger.debug(f"Found {len(gentoo_licences)} Gentoo licences")
+
+    remediation_mapping, custom_licences, ignore_list = load_external_mappings()
+
+    matched_licences = set()
+    unmatched_licences = set()
+
+    # Add these to found_licences as we still need to process or discard the
+    # mapped values
+    for custom in custom_licences.values():
+        found_licences.add(custom)
+        logger.debug(f"Found custom licence: {custom}")
+
+    for found in found_licences:
+        if found in ignore_list:
+            continue
+
+        # Translate a licence ID to its name(s); anything else stands for itself
+        chromium_licences = [found]
+        if found in chromium_spdx_licence_mapping:
+            mapped = chromium_spdx_licence_mapping[found]
+            logger.debug(f"Mapping {found} to {mapped}")
+            if 'and' in mapped or ',' in mapped:
+                # If the licence is a combination of licences, we need to split it
+                chromium_licences = split_licence_string(mapped)
+                logger.debug(f"Splitting licence string: {chromium_licences}")
+            else:
+                chromium_licences = [mapped]
+
+        for licence in chromium_licences:
+            # Check if the Chromium licence exactly matches a Gentoo licence
+            if licence in gentoo_licences:
+                matched_licences.add(licence)
+            elif licence in gentoo_licence_mapping:
+                logger.debug(f"Mapping {licence} to {gentoo_licence_mapping[licence]}")
+                matched_licences.add(gentoo_licence_mapping[licence])
+            elif licence in remediation_mapping:
+                remediated = remediation_mapping[licence]
+                logger.debug(f"Mapping {licence} to {remediated}")
+                if remediated not in ignore_list:
+                    matched_licences.add(remediated)
+            elif licence not in ignore_list:
+                unmatched_licences.add(licence)
+
+    logger.info(f"Licences for Chromium version {version}:")
+
+    # Joined outside the f-string: reusing the same quote character inside a
+    # replacement field is a syntax error before Python 3.12.
+    # NOTE(review): Gentoo ebuilds use the variable name LICENSE; the label
+    # printed here is LICENSES - confirm which one is intended.
+    licence_line = ' '.join(sorted(matched_licences))
+    print(f'LICENSES="{licence_line}"')
+
+    if unmatched_licences:
+        print(f"\nUnmatched licences ({len(unmatched_licences)}):")
+        for licence in sorted(unmatched_licences):
+            print(f"- {licence}")
+
+
+if __name__ == "__main__":
+    main()

Reply via email to