Merge pull request #255 from nomadbl/Distributed_Silhouette

Distributed silhouette
JuliaStats · Oct 3, 2023 · 24dba09 · 24dba09
2 parents 23bded0 + 7da02f5
commit 24dba09
Show file tree

Hide file tree

Showing 6 changed files with 371 additions and 108 deletions.
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
@@ -29,3 +29,22 @@ SUITE["cutree"] = BenchmarkGroup()
 for (n, k) in ((10, 3), (100, 10), (1000, 100), (10000, 1000))
     SUITE["cutree"][(n,k)] = @benchmarkable cutree(hclu, k=$k) setup=(D=random_distance_matrix($n, 5); hclu=hclust(D, linkage=:single))
 end
+
+function silhouette_benchmark(metric, assgns, points, nclusters)
+    res = BenchmarkGroup()
+    res[:distances] = @benchmarkable silhouettes($assgns, pairwise($metric, $points, $points, dims=2))
+    res[:points] = @benchmarkable silhouettes($assgns, $points; metric=$metric)
+    return res
+end
+
+SUITE["silhouette"] = BenchmarkGroup()
+for metric in [SqEuclidean(), Euclidean()]
+    SUITE["silhouette"]["metric=$(typeof(metric))"] = metric_bench = BenchmarkGroup()
+    for n in [100, 1000, 10000, 20000]
+        nclusters = 10
+        dims = 10
+        points = randn(dims, n)
+        assgns = rand(1:nclusters, n)
+        metric_bench["n=$n"] = silhouette_benchmark(metric, assgns, points, nclusters)
+    end
+end
diff --git a/src/Clustering.jl b/src/Clustering.jl
@@ -83,6 +83,7 @@ module Clustering
     include("fuzzycmeans.jl")
 
     include("counts.jl")
+    include("cluster_distances.jl")
     include("silhouette.jl")
     include("randindex.jl")
     include("varinfo.jl")

diff --git a/src/cluster_distances.jl b/src/cluster_distances.jl
@@ -0,0 +1,226 @@
+"""
+Base type for efficient computation of average(mean) distances
+from the cluster centers to a given point.
+
+The descendant types should implement the following methods:
+  * `update!(dists, assignments, points)`: update the internal
+    state of `dists` with point coordinates and their assignments to the clusters
+  * `sumdistances(dists, points, indices)`: compute the sum of
+    distances from all `dists` clusters to `points`
+"""
+abstract type ClusterDistances{T} end
+
+# create empty ClusterDistances object for a given metric
+# and update it with a given clustering
+# if batch_size is specified, the updates are done in point batches of given size
+function ClusterDistances(metric::SemiMetric,
+                 assignments::AbstractVector{<:Integer},
+                 points::AbstractMatrix{<:Real},
+                 batch_size::Union{Integer, Nothing} = nothing)
+    update!(ClusterDistances(eltype(points), metric, length(assignments), size(points, 1),
+                             maximum(assignments)),
+            assignments, points, batch_size)
+end
+
+ClusterDistances(metric, R::ClusteringResult, args...) =
+    ClusterDistances(metric, assignments(R), args...)
+
+# fallback implementations of ClusteringDistances methods
+
+cluster_sizes(dists::ClusterDistances) = dists.cluster_sizes
+nclusters(dists::ClusterDistances) = length(cluster_sizes(dists))
+
+update!(dists::ClusterDistances,
+        assignments::AbstractVector, points::AbstractMatrix) =
+    error("update!(dists::$(typeof(dists))) not implemented")
+
+sumdistances(dists::ClusterDistances,
+             points::Union{AbstractMatrix, Nothing},
+             indices::Union{AbstractVector{<:Integer}, Nothing}) =
+    error("sumdistances(dists::$(typeof(dists))) not implemented")
+
+# average distances from each cluster to each point, nclusters×n matrix
+function meandistances(dists::ClusterDistances,
+                       assignments::AbstractVector{<:Integer},
+                       points::Union{AbstractMatrix, Nothing},
+                       indices::AbstractVector{<:Integer})
+    @assert length(assignments) == length(indices)
+    (points === nothing) || @assert(size(points, 2) == length(indices))
+    clu_to_pt = sumdistances(dists, points, indices)
+    clu_sizes = cluster_sizes(dists)
+    @assert length(assignments) == length(indices)
+    @assert size(clu_to_pt) == (length(clu_sizes), length(assignments))
+
+    # normalize distances by cluster sizes
+    @inbounds for j in eachindex(assignments)
+        for (i, c) in enumerate(clu_sizes)
+            if i == assignments[j]
+                c -= 1
+            end
+            if c == 0
+                clu_to_pt[i,j] = 0
+            else
+                clu_to_pt[i,j] /= c
+            end
+        end
+    end
+    return clu_to_pt
+end
+
+# wrapper for ClusteringResult
+update!(dists::ClusterDistances, R::ClusteringResult, args...) =
+    update!(dists, assignments(R), args...)
+
+# batch-update silhouette dists (splitting the points into chunks of batch_size size)
+function update!(dists::ClusterDistances,
+                 assignments::AbstractVector{<:Integer}, points::AbstractMatrix{<:Real},
+                 batch_size::Union{Integer, Nothing})
+    n = size(points, 2)
+    ((batch_size === nothing) || (n <= batch_size)) &&
+        return update!(dists, assignments, points)
+
+    for batch_start in 1:batch_size:n
+        batch_ixs = batch_start:min(batch_start + batch_size - 1, n)
+        update!(dists, view(assignments, batch_ixs), view(points, :, batch_ixs))
+    end
+    return dists
+end
+
+# generic ClusterDistances implementation for an arbitrary metric M
+# if M is Nothing, point_dists is an arbitrary matrix of point distances
+struct SimpleClusterDistances{M, T} <: ClusterDistances{T}
+    metric::M
+    cluster_sizes::Vector{Int}
+    assignments::Vector{Int}
+    point_dists::Matrix{T}
+
+    SimpleClusterDistances(::Type{T}, metric::M,
+                           npoints::Integer, nclusters::Integer) where {M<:Union{SemiMetric, Nothing}, T<:Real} =
+        new{M, T}(metric, zeros(Int, nclusters), Vector{Int}(),
+                  Matrix{T}(undef, npoints, npoints))
+
+    # reuse given points matrix
+    function SimpleClusterDistances(
+        metric::Nothing,
+        assignments::AbstractVector{<:Integer},
+        point_dists::AbstractMatrix{T}
+    ) where T<:Real
+        n = length(assignments)
+        size(point_dists) == (n, n) || throw(DimensionMismatch("assignments length ($n) does not match distances matrix size ($(size(point_dists)))"))
+        issymmetric(point_dists) || throw(ArgumentError("point distances matrix must be symmetric"))
+        clu_sizes = zeros(Int, maximum(assignments))
+        @inbounds for cluster in assignments
+            clu_sizes[cluster] += 1
+        end
+        new{Nothing, T}(metric, clu_sizes, assignments, point_dists)
+    end
+end
+
+# fallback ClusterDistances constructor
+ClusterDistances(::Type{T}, metric::Union{SemiMetric, Nothing},
+                 npoints::Union{Integer, Nothing}, dims::Integer, nclusters::Integer) where T<:Real =
+    SimpleClusterDistances(T, metric, npoints, nclusters)
+
+# when metric is nothing, points is the matrix of distances
+function ClusterDistances(metric::Nothing,
+                          assignments::AbstractVector{<:Integer},
+                          points::AbstractMatrix,
+                          batch_size::Union{Integer, Nothing} = nothing)
+    (batch_size === nothing) || (size(points, 2) > batch_size) ||
+        error("batch-updates of distance matrix-based ClusterDistances not supported")
+    SimpleClusterDistances(metric, assignments, points)
+end
+
+function update!(dists::SimpleClusterDistances{M},
+                 assignments::AbstractVector{<:Integer},
+                 points::AbstractMatrix{<:Real}) where M
+    @assert length(assignments) == size(points, 2)
+    check_assignments(assignments, nclusters(dists))
+    append!(dists.assignments, assignments)
+    n = size(dists.point_dists, 1)
+    length(dists.assignments) == n ||
+        error("$(typeof(dists)) does not support batch updates: $(length(assignments)) points given, $n expected")
+    @inbounds for cluster in assignments
+        dists.cluster_sizes[cluster] += 1
+    end
+
+    if M === Nothing
+        size(point_dists) == (n, n) ||
+            throw(DimensionMismatch("points should be a point-to-point distances matrix of ($n, $n) size, $(size(points)) given"))
+        copy!(dists.point_dists, point_dists)
+    else
+        # metric-based SimpleClusterDistances does not support batched updates
+        size(points, 2) == n ||
+            throw(DimensionMismatch("points should be a point coordinates matrix with $n columns, $(size(points, 2)) found"))
+        pairwise!(dists.metric, dists.point_dists, points, dims=2)
+    end
+
+    return dists
+end
+
+# this function returns matrix r nclusters x n, such that
+# r[i, j] is the sum of distances from all i-th cluster points to the point indices[j]
+function sumdistances(dists::SimpleClusterDistances,
+                      points::Union{AbstractMatrix, Nothing}, # unused as distances are already in point_dists
+                      indices::AbstractVector{<:Integer})
+    T = eltype(dists.point_dists)
+    n = length(dists.assignments)
+    S = typeof((one(T)+one(T))/2)
+    r = zeros(S, nclusters(dists), n)
+    @inbounds for (jj, j) in enumerate(indices)
+        for i = 1:j-1
+            r[dists.assignments[i], jj] += dists.point_dists[i,j]
+        end
+        for i = j+1:n
+            r[dists.assignments[i], jj] += dists.point_dists[i,j]
+        end
+    end
+    return r
+end
+
+# uses the method from "Distributed Silhouette Algorithm: Evaluating Clustering on Big Data"
+# https://arxiv.org/abs/2303.14102
+# for SqEuclidean point distances
+struct SqEuclideanClusterDistances{T} <: ClusterDistances{T}
+    cluster_sizes::Vector{Int} # [nclusters]
+    Y::Matrix{T} # [dims, nclusters], the first moments of each cluster (sum of point coords)
+    Ψ::Vector{T} # [nclusters], the second moments of each cluster (sum of point coord squares)
+
+    SqEuclideanClusterDistances(::Type{T}, npoints::Union{Integer, Nothing}, dims::Integer,
+                                nclusters::Integer) where T<:Real =
+        new{T}(zeros(Int, nclusters), zeros(T, dims, nclusters), zeros(T, nclusters))
+end
+
+ClusterDistances(::Type{T}, metric::SqEuclidean, npoints::Union{Integer, Nothing},
+                 dims::Integer, nclusters::Integer) where T<:Real =
+    SqEuclideanClusterDistances(T, npoints, dims, nclusters)
+
+function update!(dists::SqEuclideanClusterDistances,
+                 assignments::AbstractVector{<:Integer},
+                 points::AbstractMatrix{<:Real})
+    # x dims are [D,N]
+    d, n = size(points)
+    k = length(cluster_sizes(dists))
+    check_assignments(assignments, k)
+    n == length(assignments) || throw(DimensionMismatch("points count ($n) does not match assignments length $(length(assignments)))"))
+    d == size(dists.Y, 1) || throw(DimensionMismatch("points dims ($(size(points, 1))) do no must match ClusterDistances dims ($(size(dists.Y, 1)))"))
+    # precompute moments and update counts
+    @inbounds for (i, cluster) in enumerate(assignments)
+        point = view(points, :, i) # switch to eachslice() once Julia-1.0 support is dropped
+        dists.cluster_sizes[cluster] += 1
+        dists.Y[:, cluster] .+= point
+        dists.Ψ[cluster] += sum(abs2, point)
+    end
+    return dists
+end
+
+# sum distances from each cluster to each point in `points`, [nclusters, n]
+function sumdistances(dists::SqEuclideanClusterDistances,
+                      points::AbstractMatrix,
+                      indices::AbstractVector{<:Integer})
+    @assert size(points, 2) == length(indices)
+    point_norms = sum(abs2, points; dims=1) # [1,n]
+    return dists.cluster_sizes .* point_norms .+
+        reshape(dists.Ψ, nclusters(dists), 1) .-
+        2 * (transpose(dists.Y) * points)
+end