Source code for improver.clustering.clustering

# (C) Crown Copyright, Met Office. All rights reserved.
#
# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license.
# See LICENSE in the root of the repository for full licensing details.
"""Plugins to perform clustering on DataFrames using scikit-learn or kmedoids."""

from typing import Any

import pandas as pd

from improver import BasePlugin



[docs]
class FitClustering(BasePlugin):
    """Fit a clustering algorithm from scikit-learn or kmedoids to a DataFrame.

    This plugin provides a unified interface for applying various clustering
    algorithms to pandas DataFrames. The backing package is selected from the
    requested clustering method name:

    - "KMedoids": uses the ``KMedoids`` class from the kmedoids package.
    - Any other name: looked up as a class in ``sklearn.cluster``.

    Both packages are imported lazily inside ``process`` rather than at module
    import time.
    """

    def __init__(self, clustering_method: str, **kwargs: Any) -> None:
        """Initialise the clustering plugin.

        The method name and keyword arguments are only stored here. No
        validation is performed at construction time; an unsupported method
        name is reported by ``process``.

        Args:
            clustering_method: The name of the clustering method to use.
                Must be either "KMedoids" (from kmedoids package) or a valid
                clustering class name from sklearn.cluster (e.g., "KMeans",
                "DBSCAN", "AgglomerativeClustering").
            **kwargs: Additional keyword arguments to pass to the clustering
                algorithm. These are method-specific parameters. Common
                examples:

                - n_clusters (int): Number of clusters (for KMeans,
                  AgglomerativeClustering)
                - random_state (int): Random seed for reproducibility

                Refer to the scikit-learn or kmedoids documentation for the
                complete list of parameters for each clustering method.
        """
        self.clustering_method = clustering_method
        self.kwargs = kwargs

    def process(self, df: pd.DataFrame) -> Any:
        """Fit the configured clustering algorithm to the DataFrame.

        Args:
            df: The input DataFrame to cluster. Each row represents
                a sample and each column represents a feature. The DataFrame
                should contain numeric data suitable for the chosen clustering
                algorithm.

        Returns:
            The fitted clustering model object, i.e. whatever the ``fit``
            method of the selected sklearn.cluster or kmedoids class returns.
            Clustering estimators typically expose a ``labels_`` attribute
            with the cluster assignment for each sample; any further
            attributes depend on the specific method used (e.g.
            ``cluster_centers_`` for KMeans, ``core_sample_indices_`` for
            DBSCAN).

        Raises:
            ValueError: If the clustering method is not "KMedoids" and does
                not name a class with a ``fit`` method in sklearn.cluster.
        """
        if self.clustering_method == "KMedoids":
            import kmedoids

            # Default the metric to euclidean; a metric supplied by the caller
            # comes later in the merge and therefore takes precedence.
            model_kwargs = {"metric": "euclidean", **self.kwargs}
            # kmedoids is given the underlying numpy array, not the DataFrame.
            return kmedoids.KMedoids(**model_kwargs).fit(df.to_numpy())

        # Otherwise, use sklearn
        from sklearn import cluster

        candidate = getattr(cluster, self.clustering_method, None)
        # sklearn.cluster also exposes plain functions (e.g. k_means, dbscan)
        # and submodules. Those cannot be instantiated and fitted, so only a
        # class providing a fit method is accepted; everything else is
        # reported as unsupported instead of failing later with an unrelated
        # TypeError or AttributeError.
        is_estimator_class = isinstance(candidate, type) and callable(
            getattr(candidate, "fit", None)
        )
        if not is_estimator_class:
            msg = (
                f"The clustering method '{self.clustering_method}' is not supported. "
                "Please check sklearn.cluster documentation for available methods."
            )
            raise ValueError(msg)

        return candidate(**self.kwargs).fit(df)