# Source code for improver.clustering.clustering
# (C) Crown Copyright, Met Office. All rights reserved.
#
# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license.
# See LICENSE in the root of the repository for full licensing details.
"""Plugins to perform clustering on DataFrames using scikit-learn or kmedoids."""
from typing import Any
import pandas as pd
from improver import BasePlugin
# [docs]
class FitClustering(BasePlugin):
    """Fit a clustering algorithm from scikit-learn or kmedoids to a DataFrame.

    This plugin provides a unified interface for applying various clustering
    algorithms to pandas DataFrames. It supports the estimator classes exposed
    by scikit-learn's ``sklearn.cluster`` module as well as the ``KMedoids``
    algorithm from the ``kmedoids`` package.

    The package is selected from the name of the clustering method:

    - "KMedoids": uses the kmedoids package
    - All other methods: uses sklearn.cluster
    """

    def __init__(self, clustering_method: str, **kwargs: Any) -> None:
        """Store the clustering method name and its keyword arguments.

        The method name is not validated here; the backing package is only
        imported, and the name only checked, when :meth:`process` is called.

        Args:
            clustering_method: The name of the clustering method to use.
                Must be either "KMedoids" (from the kmedoids package) or the
                name of a clustering class in sklearn.cluster (e.g. "KMeans",
                "DBSCAN", "AgglomerativeClustering").
            **kwargs: Additional keyword arguments passed unchanged to the
                clustering class constructor. These are method-specific.
                Common examples:

                - n_clusters (int): Number of clusters (for KMeans,
                  AgglomerativeClustering)
                - random_state (int): Random seed for reproducibility

                Refer to the scikit-learn or kmedoids documentation for the
                complete list of parameters for each clustering method.
        """
        self.clustering_method = clustering_method
        self.kwargs = kwargs

    def process(self, df: pd.DataFrame) -> Any:
        """Fit the configured clustering method to the DataFrame.

        Args:
            df: The input DataFrame to cluster. Each row represents a sample
                and each column represents a feature. The DataFrame should
                contain numeric data suitable for the chosen clustering
                algorithm.

        Returns:
            The fitted clustering model, i.e. whatever the estimator's
            ``fit`` method returns, from either sklearn.cluster or kmedoids.
            Which fitted attributes are available (e.g. ``labels_``,
            ``cluster_centers_`` for KMeans, ``core_sample_indices_`` for
            DBSCAN) depends on the clustering method used.

        Raises:
            ValueError: If the clustering method is not "KMedoids" and is not
                the name of an estimator class (a class providing ``fit``)
                in sklearn.cluster.
        """
        # Imports are deferred so that only the package actually needed for
        # the requested method has to be installed.
        if self.clustering_method == "KMedoids":
            import kmedoids

            # Default to a euclidean metric so the raw feature rows can be
            # passed in; a user-supplied "metric" takes precedence. (The
            # kmedoids default is understood to be a precomputed distance
            # matrix - see the kmedoids documentation.) A new dict is built so
            # that self.kwargs is never mutated.
            kmedoids_kwargs = {"metric": "euclidean", **self.kwargs}
            # kmedoids is given a plain numpy array rather than a DataFrame.
            return kmedoids.KMedoids(**kmedoids_kwargs).fit(df.values)

        from sklearn import cluster

        clustering_class = getattr(cluster, self.clustering_method, None)
        # sklearn.cluster also exposes plain functions (e.g. "k_means",
        # "dbscan"), submodules and dunder attributes. Only accept genuine
        # estimator classes so that anything else produces the documented
        # ValueError rather than an unrelated TypeError/AttributeError.
        is_estimator_class = isinstance(clustering_class, type) and callable(
            getattr(clustering_class, "fit", None)
        )
        if not is_estimator_class:
            msg = (
                f"The clustering method '{self.clustering_method}' is not supported. "
                "Please check sklearn.cluster documentation for available methods."
            )
            raise ValueError(msg)

        return clustering_class(**self.kwargs).fit(df)