Source code for improver.cli.realization_cluster_and_match

#!/usr/bin/env python
# (C) Crown Copyright, Met Office. All rights reserved.
#
# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license.
# See LICENSE in the root of the repository for full licensing details.
"""CLI to run clustering and matching of realizations."""

from improver import cli


@cli.clizefy
@cli.with_output
def process(
    *cubes: cli.inputcube,
    hierarchy: cli.inputjson,
    n_clusters: int,
    model_id_attr: str = "mosg__model_configuration",
    cycletime: str = None,
    target_grid_name: str = "target_grid",
    clustering_method: str = "KMedoids",
    regrid_mode: str = "esmf-area-weighted",
    regrid_for_clustering: bool = True,
    clustering_kwargs: cli.inputjson = None,
    regrid_kwargs: cli.inputjson = None,
):
    """Cluster primary input and match secondary inputs to clusters.

    Clusters the primary input using the specified clustering method, then
    matches other input realizations from other sources to the clusters
    derived from the primary inputs. A single match from other sources is
    made for each lead time. If there are multiple sources for a given lead
    time then the order of precedence defined in the hierarchy specification
    is used to determine which to use.

    Args:
        cubes (list of iris.cube.Cube):
            Input cubes containing primary and secondary forecast data.
            Different forecast sources must be identifiable using the
            model_id_attr attribute.
        hierarchy (dict):
            Dictionary defining the hierarchy of inputs. Specifies the
            primary input (which is clustered) and secondary inputs (which
            are matched to clusters). The order of secondary_inputs defines
            precedence, with earlier entries having higher priority.
            Format::

                {
                    "primary_input": "model_name",
                    "secondary_inputs": {"model2": [0, 6], "model3": [0, 24]},
                }

            The lists specify forecast period hours. A two-element list
            [start, end] will be expanded to the range start to end
            inclusive (e.g., [0, 6] includes 0, 1, 2, 3, 4, 5, 6). Lists
            with other lengths are treated as explicit forecast period
            hours. Only forecast periods that actually exist in the input
            cubes within these ranges will be processed. The hour values
            will be automatically converted to seconds to match the
            forecast_period coordinate units in the input cubes.
        n_clusters (int):
            Number of clusters to create. This determines how many
            representative realizations will be selected from the primary
            input.
        model_id_attr (str):
            Name of the attribute used to identify different models within
            the input cubes.
            Default: "mosg__model_configuration"
        cycletime (str):
            The forecast_reference_time on the input cubes will be reset to
            this value. The forecast periods will be adjusted accordingly
            with the validity times kept fixed. cycletime should be provided
            in the format YYYYMMDDTHHMMZ (e.g., 20240101T0000Z). If not
            provided, the forecast_reference_time on the input cubes will be
            left unchanged.
        target_grid_name (str):
            Name of the target grid cube for regridding. The input cubes
            must include a cube with this name.
            Default: "target_grid"
        clustering_method (str):
            Clustering method to use. Currently only "KMedoids" is
            supported.
            Default: "KMedoids"
        regrid_mode (str):
            Regridding mode to use for regridding to the target grid. Valid
            options include "bilinear", "nearest", "esmf-area-weighted",
            "nearest-with-mask", etc.
            Default: "esmf-area-weighted"
        regrid_for_clustering (bool):
            If True, regrid all cubes (primary and secondary) to the target
            grid before clustering and matching. This can speed up
            computation and emphasise large-scale spatial features for
            clustering. If False, clustering and matching are performed on
            the original grids without regridding.
            Default: True
        clustering_kwargs (dict):
            Additional keyword arguments to pass to the clustering method.
            Can be provided as a JSON file path or a JSON string. The
            supplied dictionary is not modified; any "n_clusters" entry in
            it is overridden by the n_clusters argument.
            Common options for KMedoids include:

            - random_state (int): Random seed for reproducibility
            - max_iter (int): Maximum number of iterations

            Example::

                {"random_state": 42, "max_iter": 300}

            Default: None (no additional kwargs)
        regrid_kwargs (dict):
            Additional keyword arguments to pass to RegridLandSea for
            regridding. Can be provided as a JSON file path or a JSON
            string. Common options include:

            - mdtol (float): Tolerance of missing data for
              esmf-area-weighted regridding (0 to 1, default 1)
            - extrapolation_mode (str): Mode to fill regions outside domain
            - landmask (Cube): Land-sea mask for mask-aware regridding

            Example::

                {"mdtol": 0.5}

            Default: None (no additional kwargs)

    Returns:
        iris.cube.Cube:
            Cube containing the clustered and matched realizations, with
            secondary inputs matched to clusters according to the hierarchy.
    """
    from iris.cube import CubeList

    from improver.clustering.realization_clustering import (
        RealizationClusterAndMatch,
    )

    # Build a fresh dict rather than writing into the caller's object, so a
    # dictionary reused across calls is never altered as a side effect.
    # n_clusters is placed last so that it takes precedence over any
    # "n_clusters" key present in clustering_kwargs.
    clustering_kw = {
        **(clustering_kwargs if clustering_kwargs is not None else {}),
        "n_clusters": n_clusters,
    }

    # Use regrid_kwargs if provided, otherwise use an empty dict.
    regrid_kw = regrid_kwargs if regrid_kwargs is not None else {}

    plugin = RealizationClusterAndMatch(
        hierarchy=hierarchy,
        model_id_attr=model_id_attr,
        clustering_method=clustering_method,
        target_grid_name=target_grid_name,
        regrid_mode=regrid_mode,
        regrid_for_clustering=regrid_for_clustering,
        regrid_kwargs=regrid_kw,
        cycletime=cycletime,
        **clustering_kw,
    )
    return plugin(CubeList(cubes))