diff --git a/.gitignore b/.gitignore index 0b53ec64..b25d56b3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ doc/build Manifest.toml +*.swp +.vscode +docs/build/ diff --git a/docs/source/index.md b/docs/source/index.md index b0c25ed1..21c4b949 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -3,7 +3,7 @@ *Clustering.jl* is a Julia package for data clustering. It covers the two aspects of data clustering: - - [Clustering Algorithms](@ref clu_algo_basics), *e.g.* K-means, K-medoids, Affinity - propagation, and DBSCAN, etc. - - [Clustering Evaluation](@ref clu_validate), *e.g.* Silhouettes and variational - information. + - [Clustering Algorithms](@ref clu_algo_basics): K-means, K-medoids, Affinity + propagation, DBSCAN etc. + - [Clustering Comparison & Evaluation](@ref clu_validate): cross-tabulation, variational + and mutual information, intrinsic clustering quality indices, such as *silhouettes*, etc. diff --git a/docs/source/validate.md b/docs/source/validate.md index 11485389..5b3640be 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -1,10 +1,13 @@ # [Evaluation & Validation](@id clu_validate) -*Clustering.jl* package provides a number of methods to evaluate the results of -a clustering algorithm and/or to validate its correctness. +*Clustering.jl* package provides a number of methods to compare different clusterings, +evaluate clustering quality or validate its correctness. +## Clustering comparison -## Cross tabulation +Methods to compare two clusterings and measure their similarity. + +### Cross tabulation [Cross tabulation](https://en.wikipedia.org/wiki/Contingency_table), or *contingency matrix*, is a basis for many clustering quality measures. @@ -13,11 +16,20 @@ It shows how similar are the two clusterings on a cluster level. *Clustering.jl* extends `StatsBase.counts()` with methods that accept [`ClusteringResult`](@ref) arguments: ```@docs -counts(a::ClusteringResult, b::ClusteringResult) +counts(::ClusteringResult, ::ClusteringResult) ``` +### Confusion matrix + +[Confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) +for the two clusterings is a 2×2 contingency table that counts +how frequently the pair of data points are in the same or different clusters. + +```@docs +confusion +``` -## Rand index +### Rand index [Rand index](http://en.wikipedia.org/wiki/Rand_index) is a measure of the similarity between the two data clusterings. From a mathematical @@ -28,34 +40,7 @@ even when the original class labels are not used. randindex ``` - -## Silhouettes - -[Silhouettes](http://en.wikipedia.org/wiki/Silhouette_(clustering)) is -a method for evaluating the quality of clustering. Particularly, it provides a -quantitative way to measure how well each point lies within its cluster in -comparison to the other clusters. - -The *Silhouette* value for the ``i``-th data point is: -```math -s_i = \frac{b_i - a_i}{\max(a_i, b_i)}, \ \text{where} -``` - - ``a_i`` is the average distance from the ``i``-th point to the other points in - the same cluster ``z_i``, - - ``b_i ≝ \min_{k \ne z_i} b_{ik}``, where ``b_{ik}`` is the average distance - from the ``i``-th point to the points in the ``k``-th cluster. - -Note that ``s_i \le 1``, and that ``s_i`` is close to ``1`` when the ``i``-th -point lies well within its own cluster. This property allows using -`mean(silhouettes(assignments, counts, X))` as a measure of clustering quality. -Higher values indicate better separation of clusters w.r.t. point distances. - -```@docs -silhouettes -``` - - -## Variation of Information +### Variation of Information [Variation of information](http://en.wikipedia.org/wiki/Variation_of_information) (also known as *shared information distance*) is a measure of the @@ -64,11 +49,9 @@ information*, but it is a true metric, *i.e.* it is symmetric and satisfies the triangle inequality. ```@docs -varinfo +Clustering.varinfo ``` - - -## V-measure +### V-measure *V*-measure can be used to compare the clustering results with the existing class labels of data points or with the alternative clustering. @@ -89,7 +72,7 @@ weight, and when ``\beta < 1`` it's *homogeneity*. vmeasure ``` -## Mutual information +### Mutual information [Mutual information](https://en.wikipedia.org/wiki/Mutual_information) quantifies the "amount of information" obtained about one random variable @@ -100,17 +83,178 @@ the similarity of two different clusterings of a dataset. mutualinfo ``` -## Confusion matrix +## Clustering quality indices -Pair [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) -arising from two clusterings is a 2×2 contingency table representation of -the partition co-occurrence, see [`counts`](@ref). +[`clustering_quality()`][@ref clustering_quality] methods allow computing *intrinsic* clustering quality indices, +i.e. the metrics that depend only on the clustering itself and do not use the external knowledge. +These metrics can be used to compare different clustering algorithms or choose the optimal number of clusters. + +| **quality index** | **`quality_index` option** | **clustering type** | **better quality** | **cluster centers** | +|:-------------------------------------------:|:--------------------:|:----------:|:-------------:|:-------------------:| +| [Calinski-Harabasz](@ref calinsky_harabasz) | `:calinsky_harabasz` | hard/fuzzy | *higher* values | required | +| [Xie-Beni](@ref xie_beni) | `:xie_beni` | hard/fuzzy | *lower* values | required | +| [Davis-Bouldin](@ref davis_bouldin) | `:davis_bouldin` | hard | *lower* values | required | +| [Dunn](@ref dunn) | `:dunn` | hard | *higher* values | not required | +| [silhouettes](@ref silhouettes_index) | `:silhouettes` | hard | *higher* values | not required | ```@docs -confusion +clustering_quality ``` +The clustering quality index definitions use the following notation: +- ``x_1, x_2, \ldots, x_n``: data points, +- ``C_1, C_2, \ldots, C_k``: clusters, +- ``c_j`` and ``c``: cluster centers and global dataset center, +- ``d``: a similarity (distance) function, +- ``w_{ij}``: weights measuring membership of a point ``x_i`` to a cluster ``C_j``, +- ``\alpha``: a fuzziness parameter. + +### [Calinski-Harabasz index](@id calinsky_harabasz) + +[*Calinski-Harabasz* index](https://en.wikipedia.org/wiki/Calinski%E2%80%93Harabasz_index) (option `:calinski_harabasz`) +measures corrected ratio between global inertia of the cluster centers and the summed internal inertias of clusters: +```math +\frac{n-k}{k-1}\frac{\sum_{C_j}|C_j|d(c_j,c)}{\sum\limits_{C_j}\sum\limits_{x_i\in C_j} d(x_i,c_j)} \quad \text{and}\quad +\frac{n-k}{k-1} \frac{\sum\limits_{C_j}\left(\sum\limits_{x_i}w_{ij}^\alpha\right) d(c_j,c)}{\sum_{C_j} \sum_{x_i} w_{ij}^\alpha d(x_i,c_j)} +``` +for hard and fuzzy (soft) clusterings, respectively. +*Higher* values indicate better quality. + +### [Xie-Beni index](@id xie_beni) + +*Xie-Beni* index (option `:xie_beni`) measures ratio between summed inertia of clusters +and the minimum distance between cluster centres: +```math +\frac{\sum_{C_j}\sum_{x_i\in C_j}d(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } +\quad \text{and}\quad +\frac{\sum_{C_j}\sum_{x_i} w_{ij}^\alpha d(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } +``` +for hard and fuzzy (soft) clusterings, respectively. +*Lower* values indicate better quality. + +### [Davis-Bouldin index](@id davis_bouldin) +[*Davis-Bouldin* index](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index) +(option `:davis_bouldin`) measures average cohesion based on the cluster diameters and distances between cluster centers: +```math +\frac{1}{k}\sum_{C_{j_1}}\max_{c_{j_2}\neq c_{j_1}}\frac{S(C_{j_1})+S(C_{j_2})}{d(c_{j_1},c_{j_2})} +``` +where +```math +S(C_j) = \frac{1}{|C_j|}\sum_{x_i\in C_j}d(x_i,c_j). +``` +*Lower* values indicate better quality. + +### [Dunn index](@id dunn) +[*Dunn* index](https://en.wikipedia.org/wiki/Dunn_index) (option `:dunn`) +measures the ratio between the nearest neighbour distance divided by the maximum cluster diameter: +```math +\frac{\min\limits_{ C_{j_1}\neq C_{j_2}} \mathrm{dist}(C_{j_1},C_{j_2})}{\max\limits_{C_j}\mathrm{diam}(C_j)} +``` +where +```math +\mathrm{dist}(C_{j_1},C_{j_2}) = \min\limits_{x_{i_1}\in C_{j_1},x_{i_2}\in C_{j_2}} d(x_{i_1},x_{i_2}),\quad \mathrm{diam}(C_j) = \max\limits_{x_{i_1},x_{i_2}\in C_j} d(x_{i_1},x_{i_2}). +``` +It is more computationally demanding quality index, which can be used when the centres are not known. *Higher* values indicate better quality. + +### [Silhouettes](@id silhouettes_index) + +[*Silhouettes* metric](http://en.wikipedia.org/wiki/Silhouette_(clustering)) quantifies the correctness of point-to-cluster asssignment by +comparing the distance of the point to its cluster and to the other clusters. + +The *Silhouette* value for the ``i``-th data point is: +```math +s_i = \frac{b_i - a_i}{\max(a_i, b_i)}, \ \text{where} +``` + - ``a_i`` is the average distance from the ``i``-th point to the other points in + the *same* cluster ``z_i``, + - ``b_i ≝ \min_{k \ne z_i} b_{ik}``, where ``b_{ik}`` is the average distance + from the ``i``-th point to the points in the ``k``-th cluster. + +Note that ``s_i \le 1``, and that ``s_i`` is close to ``1`` when the ``i``-th +point lies well within its own cluster. This property allows using average silhouette value +`mean(silhouettes(assignments, counts, X))` as a measure of clustering quality; +it is also available using [`clustering_quality(...; quality_index = :silhouettes)`](@ref clustering_quality) method. +Higher values indicate better separation of clusters w.r.t. point distances. + +```@docs +silhouettes +``` + +[`clustering_quality(..., quality_index=:silhouettes)`][@ref clustering_quality] +provides mean silhouette metric for the datapoints. Higher values indicate better quality. + +## References +> Olatz Arbelaitz *et al.* (2013). *An extensive comparative study of cluster validity indices*. Pattern Recognition. 46 1: 243-256. [doi:10.1016/j.patcog.2012.07.021](https://doi.org/10.1016/j.patcog.2012.07.021) + +> Aybükë Oztürk, Stéphane Lallich, Jérôme Darmont. (2018). *A Visual Quality Index for Fuzzy C-Means*. 14th International Conference on Artificial Intelligence Applications and Innovations (AIAI 2018). 546-555. [doi:10.1007/978-3-319-92007-8_46](https://doi.org/10.1007/978-3-319-92007-8_46). + +### Examples + +Exemplary data with 3 real clusters. +```@example +using Plots, Clustering +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + + +scatter(view(X, 1, :), view(X, 2, :), + label = "data points", + xlabel = "x", + ylabel = "y", + legend = :right, +) +``` + +Hard clustering quality for K-means method with 2 to 5 clusters: + +```@example +using Plots, Clustering +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + +nclusters = 2:5 +clusterings = kmeans.(Ref(X), nclusters) + +plot(( + plot(nclusters, + clustering_quality.(Ref(X), clusterings, quality_index = qidx), + marker = :circle, + title = ":$qidx", label = nothing, + ) for qidx in [:silhouettes, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn])..., + layout = (3, 2), + xaxis = "N clusters", + plot_title = "\"Hard\" clustering quality indices" +) +``` + +Fuzzy clustering quality for fuzzy C-means method with 2 to 5 clusters: +```@example +using Plots, Clustering +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + +fuzziness = 2 +fuzzy_nclusters = 2:5 +fuzzy_clusterings = fuzzy_cmeans.(Ref(X), fuzzy_nclusters, fuzziness) + +plot(( + plot(fuzzy_nclusters, + clustering_quality.(Ref(X), fuzzy_clusterings, + fuzziness = fuzziness, quality_index = qidx), + marker = :circle, + title = ":$qidx", label = nothing, + ) for qidx in [:calinski_harabasz, :xie_beni])..., + layout = (2, 1), + xaxis = "N clusters", + plot_title = "\"Soft\" clustering quality indices" +) +``` + + ## Other packages * [ClusteringBenchmarks.jl](https://github.com/HolyLab/ClusteringBenchmarks.jl) provides - benchmark datasets and implements additional methods for evaluating clustering performance. \ No newline at end of file + benchmark datasets and implements additional methods for evaluating clustering performance. diff --git a/examples/clustering_quality.jl b/examples/clustering_quality.jl new file mode 100644 index 00000000..0c902707 --- /dev/null +++ b/examples/clustering_quality.jl @@ -0,0 +1,55 @@ +using Plots, Clustering + +## test data with 3 clusters +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + +## visualisation of the exemplary data +scatter(X[1,:], X[2,:], + label = "data points", + xlabel = "x", + ylabel = "y", + legend = :right, +) + +nclusters = 2:5 + +## hard clustering quality +clusterings = kmeans.(Ref(X), nclusters) +hard_indices = [:silhouettes, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn] + +kmeans_quality = Dict( + qidx => clustering_quality.(Ref(X), clusterings, quality_index = qidx) + for qidx in hard_indices) + +plot(( + plot(nclusters, kmeans_quality[qidx], + marker = :circle, + title = qidx, + label = nothing, + ) for qidx in hard_indices)..., + layout = (3, 2), + xaxis = "N clusters", + plot_title = "\"Hard\" clustering quality indices" +) + +## soft clustering quality +fuzziness = 2 +fuzzy_clusterings = fuzzy_cmeans.(Ref(X), nclusters, fuzziness) +soft_indices = [:calinski_harabasz, :xie_beni] + +fuzzy_cmeans_quality = Dict( + qidx => clustering_quality.(Ref(X), fuzzy_clusterings, fuzziness = fuzziness, quality_index = qidx) + for qidx in soft_indices) + +plot(( + plot(nclusters, fuzzy_cmeans_quality[qidx], + marker = :circle, + title = qidx, + label = nothing, + ) for qidx in soft_indices)..., + layout = (2, 1), + xaxis = "N clusters", + plot_title = "\"Soft\" clustering quality indices" +) diff --git a/src/Clustering.jl b/src/Clustering.jl index 808b37b2..fae9ee82 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -49,6 +49,9 @@ module Clustering # silhouette silhouettes, + # quality indices + clustering_quality, + # varinfo varinfo, @@ -70,6 +73,7 @@ module Clustering # pair confusion matrix confusion + ## source files include("utils.jl") @@ -84,13 +88,15 @@ module Clustering include("counts.jl") include("cluster_distances.jl") + include("silhouette.jl") + include("clustering_quality.jl") + include("randindex.jl") include("varinfo.jl") include("vmeasure.jl") include("mutualinfo.jl") include("confusion.jl") - include("hclust.jl") include("deprecate.jl") diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl new file mode 100644 index 00000000..2d21859a --- /dev/null +++ b/src/clustering_quality.jl @@ -0,0 +1,320 @@ +# main method for hard clustering indices + docs +""" +For "hard" clustering: + + clustering_quality(data, centers, assignments; quality_index, [metric]) + clustering_quality(data, clustering; quality_index, [metric]) + +For fuzzy ("soft") clustering: + + clustering_quality(data, centers, weights; quality_index, fuzziness, [metric]) + clustering_quality(data, clustering; quality_index, fuzziness, [metric]) + +For "hard" clustering without specifying cluster centers: + + clustering_quality(data, assignments; quality_index, [metric]) + clustering_quality(data, clustering; quality_index, [metric]) + +For "hard" clustering without specifying data points and cluster centers: + + clustering_quality(assignments, dist_matrix; quality_index) + clustering_quality(clustering, dist_matrix; quality_index) + +Compute the *quality index* for a given clustering. + +Returns a quality index (real value). + +## Arguments + - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point + - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns + - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) + - `weights::AbstractMatrix`: ``n×k`` matrix with fuzzy clustering weights, `weights[i,j]` is the degree of membership of ``i``-th data point to ``j``-th cluster + - `clustering::Union{ClusteringResult, FuzzyCMeansResult}`: the output of the clustering method + - `quality_index::Symbol`: quality index to calculate; see below for the supported options + - `dist_matrix::AbstractMatrix`: a ``n×n`` pairwise distance matrix; `dist_matrix[i,j]` is the distance between ``i``-th and ``j``-th points + + ## Keyword arguments + - `quality_index::Symbol`: clustering *quality index* to calculate; see below for the supported options + - `fuzziness::Real`: clustering *fuzziness* > 1 + - `metric::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the metric/distance/similarity function + +When calling `clustering_quality`, one can explicitly specify `centers`, `assignments`, and `weights`, +or provide `ClusteringResult` via `clustering`, from which the necessary data will be read automatically. + +For clustering without known cluster centers the `data` points are not required. +`dist_matrix` could be provided explicitly, otherwise it would be calculated from the `data` points +using the specified `metric`. + +## Supported quality indices + +- `:calinski_harabasz`: hard or fuzzy Calinski-Harabsz index (↑), the corrected ratio of between cluster centers inertia and within-clusters inertia +- `:xie_beni`: hard or fuzzy Xie-Beni index (↓), the ratio betwen inertia within clusters and minimal distance between the cluster centers +- `:davies_bouldin`: Davies-Bouldin index (↓), the similarity between the cluster and the other most similar one, averaged over all clusters +- `:dunn`: Dunn index (↑), the ratio of the minimal distance between clusters and the maximal cluster diameter +- `:silhouettes`: the average silhouette index (↑), see [`silhouettes`](@ref) + +The arrows ↑ or ↓ specify the direction of the incresing clustering quality. +Please refer to the [documentation](@ref clustering_quality) for more details on the clustering quality indices. +""" +function clustering_quality( + data::AbstractMatrix{<:Real}, # d×n matrix + centers::AbstractMatrix{<:Real}, # d×k matrix + assignments::AbstractVector{<:Integer}; # n vector + quality_index::Symbol, + metric::SemiMetric=SqEuclidean() +) + d, n = size(data) + dc, k = size(centers) + + d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `data` and `centers`.")) + (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) + k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) + n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) + seen_clusters = falses(k) + for (i, clu) in enumerate(assignments) + (clu in axes(centers, 2)) || throw(ArgumentError("Invalid cluster index: assignments[$i]=$(clu).")) + seen_clusters[clu] = true + end + if !all(seen_clusters) + empty_clu_ixs = findall(!, seen_clusters) + @warn "Detected empty cluster(s): $(join(string.("#", empty_clu_ixs), ", ")). clustering_quality() results might be incorrect." + + newClusterIndices = cumsum(seen_clusters) + centers = view(centers, :, seen_clusters) + assignments = newClusterIndices[assignments] + end + + if quality_index == :calinski_harabasz + _cluquality_calinski_harabasz(metric, data, centers, assignments, nothing) + elseif quality_index == :xie_beni + _cluquality_xie_beni(metric, data, centers, assignments, nothing) + elseif quality_index == :davies_bouldin + _cluquality_davies_bouldin(metric, data, centers, assignments) + elseif quality_index == :silhouettes + mean(silhouettes(assignments, pairwise(metric, data, dims=2))) + elseif quality_index == :dunn + _cluquality_dunn(assignments, pairwise(metric, data, dims=2)) + else + throw(ArgumentError("quality_index=:$quality_index not supported.")) + end +end + +clustering_quality(data::AbstractMatrix{<:Real}, R::ClusteringResult; + quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(data, R.centers, R.assignments; + quality_index = quality_index, metric = metric) + + +# main method for fuzzy clustering indices +function clustering_quality( + data::AbstractMatrix{<:Real}, # d×n matrix + centers::AbstractMatrix{<:Real}, # d×k matrix + weights::AbstractMatrix{<:Real}; # n×k matrix + quality_index::Symbol, + fuzziness::Real, + metric::SemiMetric=SqEuclidean() +) + d, n = size(data) + dc, k = size(centers) + nw, kw = size(weights) + + d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `data` and `centers`.")) + n == nw || throw(DimensionMismatch("Inconsistent data length for `data` and `weights`.")) + k == kw || throw(DimensionMismatch("Inconsistent number of clusters for `centers` and `weights`.")) + (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) + k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) + n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) + all(>=(0), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) + 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) + + if quality_index == :calinski_harabasz + _cluquality_calinski_harabasz(metric, data, centers, weights, fuzziness) + elseif quality_index == :xie_beni + _cluquality_xie_beni(metric, data, centers, weights, fuzziness) + elseif quality_index in [:davies_bouldin, :silhouettes, :dunn] + throw(ArgumentError("quality_index=:$quality_index does not support fuzzy clusterings.")) + else + throw(ArgumentError("quality_index=:$quality_index not supported.")) + end +end + +clustering_quality(data::AbstractMatrix{<:Real}, R::FuzzyCMeansResult; + quality_index::Symbol, fuzziness::Real, metric::SemiMetric=SqEuclidean()) = + clustering_quality(data, R.centers, R.weights; + quality_index = quality_index, + fuzziness = fuzziness, metric = metric) + + +# main method for clustering indices when cluster centres not known +function clustering_quality( + assignments::AbstractVector{<:Integer}, # n vector + dist::AbstractMatrix{<:Real}; # n×n matrix + quality_index::Symbol +) + n, m = size(dist) + na = length(assignments) + n == m || throw(ArgumentError("Distance matrix must be square.")) + n == na || throw(DimensionMismatch("Inconsistent array dimensions for distance matrix and assignments.")) + + if quality_index == :silhouettes + mean(silhouettes(assignments, dist)) + elseif quality_index == :dunn + _cluquality_dunn(assignments, dist) + elseif quality_index ∈ [:calinski_harabasz, :xie_beni, :davies_bouldin] + throw(ArgumentError("quality_index=:$quality_index requires cluster centers.")) + else + throw(ArgumentError("quality_index=:$quality_index not supported.")) + end +end + + +clustering_quality(data::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}; + quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(assignments, pairwise(metric, data, dims=2); + quality_index = quality_index) + +clustering_quality(R::ClusteringResult, dist::AbstractMatrix{<:Real}; + quality_index::Symbol) = + clustering_quality(R.assignments, dist; + quality_index = quality_index) + + +# utility functions + +# convert assignments into a vector of vectors of data point indices for each cluster +function _gather_samples(assignments, k) + cluster_samples = [Int[] for _ in 1:k] + for (i, a) in zip(eachindex(assignments), assignments) + push!(cluster_samples[a], i) + end + return cluster_samples +end + +# shared between hard clustering calinski_harabasz and xie_beni +function _inner_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + assignments::AbstractVector{<:Integer}, + fuzziness::Nothing +) + inner_inertia = sum( + sum(colwise(metric, view(data, :, samples), center)) + for (center, samples) in zip((view(centers, :, j) for j in axes(centers, 2)), + _gather_samples(assignments, size(centers, 2))) + ) + return inner_inertia +end + +# shared between fuzzy clustering calinski_harabasz and xie_beni (fuzzy version) +function _inner_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + weights::AbstractMatrix, + fuzziness::Real +) + data_to_center_dists = pairwise(metric, data, centers, dims=2) + inner_inertia = sum( + w^fuzziness * d for (w, d) in zip(weights, data_to_center_dists) + ) + return inner_inertia +end + +# hard outer inertia for calinski_harabasz +function _outer_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + assignments::AbstractVector{<:Integer}, + fuzziness::Nothing +) + global_center = vec(mean(data, dims=2)) + center_distances = colwise(metric, centers, global_center) + return sum(center_distances[clu] for clu in assignments) +end + +# fuzzy outer inertia for calinski_harabasz +function _outer_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + weights::AbstractMatrix, + fuzziness::Real +) + global_center = vec(mean(data, dims=2)) + center_distances = colwise(metric, centers, global_center) + return sum(sum(w^fuzziness for w in view(weights, :, clu)) * d + for (clu, d) in enumerate(center_distances)) +end + +# Calinsk-Harabasz index +function _cluquality_calinski_harabasz( + metric::SemiMetric, + data::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + assignments::Union{AbstractVector{<:Integer}, AbstractMatrix{<:Real}}, + fuzziness::Union{Real, Nothing} +) + n, k = size(data, 2), size(centers, 2) + outer_inertia = _outer_inertia(metric, data, centers, assignments, fuzziness) + inner_inertia = _inner_inertia(metric, data, centers, assignments, fuzziness) + return (outer_inertia / inner_inertia) * (n - k) / (k - 1) +end + + +# Davies Bouldin index +function _cluquality_davies_bouldin( + metric::SemiMetric, + data::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + assignments::AbstractVector{<:Integer}, +) + clu_idx = axes(centers, 2) + clu_samples = _gather_samples(assignments, length(clu_idx)) + clu_diams = [mean(colwise(metric, view(data, :, samples), view(centers, :, clu))) + for (clu, samples) in zip(clu_idx, clu_samples)] + center_dists = pairwise(metric, centers, dims=2) + + DB = mean( + maximum(@inbounds (clu_diams[j₁] + clu_diams[j₂]) / center_dists[j₁, j₂] + for j₂ in clu_idx if j₂ ≠ j₁) + for j₁ in clu_idx) + return DB +end + + +# Xie-Beni index +function _cluquality_xie_beni( + metric::SemiMetric, + data::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + assignments::Union{AbstractVector{<:Integer}, AbstractMatrix{<:Real}}, + fuzziness::Union{Real, Nothing} +) + n, k = size(data, 2), size(centers, 2) + inner_intertia = _inner_inertia(metric, data, centers, assignments, fuzziness) + center_distances = pairwise(metric, centers, dims=2) + min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) + + return inner_intertia / (n * min_center_distance) +end + +# Dunn index +function _cluquality_dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) + max_inner_distance, min_outer_distance = typemin(eltype(dist)), typemax(eltype(dist)) + + for i in eachindex(assignments), j in (i + 1):lastindex(assignments) + @inbounds d = dist[i, j] + if assignments[i] == assignments[j] + if max_inner_distance < d + max_inner_distance = d + end + else + if min_outer_distance > d + min_outer_distance = d + end + end + end + return min_outer_distance / max_inner_distance +end diff --git a/src/confusion.jl b/src/confusion.jl index 89cc42d7..6556b814 100644 --- a/src/confusion.jl +++ b/src/confusion.jl @@ -19,6 +19,11 @@ true negatives is `C₂₂`: |:--:|:-:|:-:| |Positive|C₁₁|C₁₂| |Negative|C₂₁|C₂₂| + +## See also + +[`counts(a::ClusteringResult, a::ClusteringResult)`](@ref counts) for full *contingency matrix*. + """ function confusion(::Type{T}, a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) where T<:Union{Integer, AbstractFloat} cc = counts(a, b) diff --git a/src/counts.jl b/src/counts.jl index 1eef56fb..7e5a303c 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -29,6 +29,11 @@ from `b`. The clusterings could be specified either as [`ClusteringResult`](@ref) instances or as vectors of data point assignments. + +## See also + +[`confusion(a::ClusteringResult, a::ClusteringResult)`](@ref confusion) for 2×2 *confusion matrix*. + """ counts(a::ClusteringResult, b::ClusteringResult) = _counts(assignments(a), assignments(b)) diff --git a/src/mutualinfo.jl b/src/mutualinfo.jl index f50a7e4f..65b0a527 100644 --- a/src/mutualinfo.jl +++ b/src/mutualinfo.jl @@ -35,7 +35,7 @@ If `normed` parameter is `true` the return value is the normalized mutual inform see "Data Mining Practical Machine Tools and Techniques", Witten & Frank 2005. # References -> Vinh, Epps, and Bailey, (2009). “Information theoretic measures for clusterings comparison”. -Proceedings of the 26th Annual International Conference on Machine Learning - ICML ‘09. +> Vinh, Epps, and Bailey, (2009). *Information theoretic measures for clusterings comparison*. +> Proceedings of the 26th Annual International Conference on Machine Learning - ICML ‘09. """ mutualinfo(a, b; normed::Bool=true) = _mutualinfo(counts(a, b), normed) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl new file mode 100644 index 00000000..4e7c1d3a --- /dev/null +++ b/test/clustering_quality.jl @@ -0,0 +1,122 @@ +using Test +using Clustering, Distances + +@testset "clustering_quality()" begin + + # test data with 12 2D points and 4 clusters + Y = [-2 2 2 3 2 1 2 -2 -2 -1 -2 -3 + 4 4 1 0 -1 0 -4 -4 1 0 -1 0] + # cluster centers + C = [0 2 0 -2 + 4 0 -4 0] + # point-to-cluster assignments + A = [1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4] + # convert A to fuzzy clusters weights + W = zeros(Int, (size(Y, 2), size(C, 2))) + for (i, c) in enumerate(A) + W[i, c] = 1 + end + # fuzzy clustering with 4 clusters + W2 = [ + 1 0 0 0 + 1 0 0 0 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 0 1 0 + 0 0 1 0 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + ] + # mock hard and fuzzy clusterings for testing interface; only C, W and A arguments are actually used + A_kmeans = KmeansResult(Float64.(C), A, ones(12), [4, 4, 4], ones(4), 42., 42, true) + W_cmeans = FuzzyCMeansResult(Float64.(C), Float64.(W), 42, true) + W2_cmeans = FuzzyCMeansResult(Float64.(C), Float64.(W2), 42, true) + + @testset "input checks" begin + @test_throws ArgumentError clustering_quality(zeros(2,2), zeros(2,3), [1, 2], quality_index = :calinski_harabasz) + @test_throws DimensionMismatch clustering_quality(zeros(2,2), zeros(3,2), [1, 2], quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(2,2), zeros(2,1), [1, ], quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(2,2), zeros(2,2), [1, 2], quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(0,0), zeros(0,0), zeros(Int,0); quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(0,0), zeros(0,0), zeros(0,0); quality_index = :calinski_harabasz, fuzziness = 2) + @test_throws DimensionMismatch clustering_quality([1,2,3], zeros(2,2), quality_index = :dunn) + # wrong quality index + @test_throws ArgumentError clustering_quality(Y, C, A; quality_index = :nonexistent_index) + @test_throws ArgumentError clustering_quality(Y, C, W; quality_index = :nonexistent_index, fuzziness = 2) + @test_throws ArgumentError clustering_quality(Y, A; quality_index = :nonexistent_index) + end + + @testset "correct quality index values" begin + @testset "calinski_harabasz" begin + @test clustering_quality(Y, C, A; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) + @test clustering_quality(Y, A_kmeans; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) + # requires centers + @test_throws ArgumentError clustering_quality(A_kmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :calinski_harabasz) + + @test clustering_quality(Y, C, W; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ (32/3) / (16/8) + @test clustering_quality(Y, W_cmeans; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ (32/3) / (16/8) + @test_throws MethodError clustering_quality(W_cmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :calinski_harabasz, fuzziness = 2) ≈ (32/3) / (16/8) + + @test clustering_quality(Y, C, W2; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ 8/3 * ( 24 ) / (14+sqrt(17)) + @test clustering_quality(Y, W2_cmeans; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ 8/3 * ( 24 ) / (14+sqrt(17)) + @test_throws MethodError clustering_quality(W2_cmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :calinski_harabasz, fuzziness = 2) + end + + @testset "davies_bouldin" begin + @test clustering_quality(Y, C, A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/sqrt(20) + @test clustering_quality(Y, A_kmeans; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/sqrt(20) + # requires centers + @test_throws ArgumentError clustering_quality(A_kmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :davies_bouldin) ≈ 3/sqrt(20) + # fuzziness not supported + @test_throws ArgumentError clustering_quality(Y, W_cmeans; quality_index = :davies_bouldin, fuzziness = 2) + end + + @testset "dunn" begin + @test clustering_quality(Y, C, A; quality_index = :dunn, metric = Euclidean()) ≈ 1/2 + @test clustering_quality(Y, A_kmeans; quality_index = :dunn, metric = Euclidean()) ≈ 1/2 + @test clustering_quality(A_kmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :dunn) ≈ 1/2 + # fuzziness not supported + @test_throws ArgumentError clustering_quality(Y, W_cmeans; quality_index = :dunn, fuzziness = 2) + end + + @testset "xie_beni" begin + @test clustering_quality(Y, C, A; quality_index = :xie_beni, metric = Euclidean()) ≈ 1/3 + + @test clustering_quality(Y, C, W; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 + @test clustering_quality(Y, W_cmeans; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 + + @test clustering_quality(Y, C, W2; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ (14+sqrt(17)) / (12 * 4) + @test clustering_quality(Y, W2_cmeans; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ (14+sqrt(17)) / (12 * 4) + end + + @testset "silhouettes" begin + avg_silh = 1 - 1/12*( # average over silhouettes 1 - a_i * 1/b_i + + 4 * 16 /(3+2sqrt(17)+5) # 4 points in clusters 1 and 3 + + 4 * (2sqrt(2)+2)/3 * 1/4 # 4 points in clusters 2 and 4, top + bottom + + 2 * (2sqrt(2)+2)/3 * 4/(4+2sqrt(26)+6) # 2 points clusters 2 and 4, left and right + + 2 * (2sqrt(2)+2)/3 * 4/(2+2sqrt(10)+4) # 2 points clusters 2 and 4, center + ) + @test clustering_quality(Y, A; quality_index = :silhouettes, metric = Euclidean()) ≈ avg_silh + @test clustering_quality(Y, A_kmeans; quality_index = :silhouettes, metric = Euclidean()) ≈ avg_silh + @test clustering_quality(A_kmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :silhouettes) ≈ avg_silh + # fuzziness not supported + @test_throws ArgumentError clustering_quality(Y, W_cmeans; quality_index = :silhouettes, fuzziness = 2) + end + end + + @testset "empty clusters" begin + # degenerated clustering, no 4th cluster + degenC = [0 2 0 -2 -2 + 4 0 -4 0 0] + degenA = [1, 1, 2, 2, 2, 2, 3, 3, 5, 5, 5, 5] + + @test_logs((:warn, "Detected empty cluster(s): #4. clustering_quality() results might be incorrect."), + clustering_quality(Y, degenC, degenA; quality_index = :calinski_harabasz)) + @test clustering_quality(Y, degenC, degenA; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) + end + +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 9eaca2ed..42301653 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,6 +14,7 @@ tests = ["seeding", "fuzzycmeans", "counts", "silhouette", + "clustering_quality", "varinfo", "randindex", "hclust",