Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/jaksle/Clustering.jl
Browse files Browse the repository at this point in the history
  • Loading branch information
jaksle committed Oct 7, 2023
2 parents 3b6c9e8 + 31ad6a5 commit 7c2b0c4
Show file tree
Hide file tree
Showing 14 changed files with 404 additions and 136 deletions.
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "Clustering"
uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
version = "0.15.3"
version = "0.15.4"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Expand All @@ -13,7 +13,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
Distances = "0.8, 0.9, 0.10"
Distances = "0.10.9"
NearestNeighbors = "0.4"
StatsBase = "0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33, 0.34"
julia = "1"
Expand Down
19 changes: 19 additions & 0 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,22 @@ SUITE["cutree"] = BenchmarkGroup()
for (n, k) in ((10, 3), (100, 10), (1000, 100), (10000, 1000))
SUITE["cutree"][(n,k)] = @benchmarkable cutree(hclu, k=$k) setup=(D=random_distance_matrix($n, 5); hclu=hclust(D, linkage=:single))
end

function silhouette_benchmark(metric, assgns, points, nclusters)
res = BenchmarkGroup()
res[:distances] = @benchmarkable silhouettes($assgns, pairwise($metric, $points, $points, dims=2))
res[:points] = @benchmarkable silhouettes($assgns, $points; metric=$metric)
return res
end

SUITE["silhouette"] = BenchmarkGroup()
for metric in [SqEuclidean(), Euclidean()]
SUITE["silhouette"]["metric=$(typeof(metric))"] = metric_bench = BenchmarkGroup()
for n in [100, 1000, 10000, 20000]
nclusters = 10
dims = 10
points = randn(dims, n)
assgns = rand(1:nclusters, n)
metric_bench["n=$n"] = silhouette_benchmark(metric, assgns, points, nclusters)
end
end
2 changes: 1 addition & 1 deletion docs/source/fuzzycmeans.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ and ``m > 1`` is a user-defined fuzziness parameter.
```@docs
fuzzy_cmeans
FuzzyCMeansResult
wcounts(::FuzzyCMeansResult)
wcounts
```

## Examples
Expand Down
2 changes: 2 additions & 0 deletions src/Clustering.jl
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ module Clustering

include("counts.jl")

include("cluster_distances.jl")

include("silhouette.jl")
include("clustering_quality.jl")

Expand Down
18 changes: 10 additions & 8 deletions src/affprop.jl
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,17 @@ function _affinityprop(S::AbstractMatrix{T},

# extract exemplars and assignments
exemplars = _afp_extract_exemplars(A, R)
if isempty(exemplars)
@show A R
end
@assert !isempty(exemplars)
assignments, counts = _afp_get_assignments(S, exemplars)

if displevel >= 1
if converged
println("Affinity propagation converged with $t iterations: $(length(exemplars)) exemplars.")
@info "Affinity propagation converged with $t iterations: $(length(exemplars)) exemplars."
else
println("Affinity propagation terminated without convergence after $t iterations: $(length(exemplars)) exemplars.")
@warn "Affinity propagation terminated without convergence after $t iterations: $(length(exemplars)) exemplars."
end
end

Expand Down Expand Up @@ -250,7 +254,6 @@ function _afp_get_assignments(S::AbstractMatrix, exemplars::Vector{Int})
k = length(exemplars)
Se = S[:, exemplars]
a = Vector{Int}(undef, n)
cnts = zeros(Int, k)
for i = 1:n
p = 1
v = Se[i,1]
Expand All @@ -263,11 +266,10 @@ function _afp_get_assignments(S::AbstractMatrix, exemplars::Vector{Int})
end
a[i] = p
end
for i = 1:k
a[exemplars[i]] = i
end
for i = 1:n
@inbounds cnts[a[i]] += 1
a[exemplars] = eachindex(exemplars)
cnts = zeros(Int, k)
for aa in a
@inbounds cnts[aa] += 1
end
return (a, cnts)
end
226 changes: 226 additions & 0 deletions src/cluster_distances.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
#===
Base type for efficient computation of average(mean) distances
from the cluster centers to a given point.
The descendant types should implement the following methods:
* `update!(dists, assignments, points)`: update the internal
state of `dists` with point coordinates and their assignments to the clusters
* `sumdistances(dists, points, indices)`: compute the sum of
distances from all `dists` clusters to `points`
===#
abstract type ClusterDistances{T} end

# create empty ClusterDistances object for a given metric
# and update it with a given clustering
# if batch_size is specified, the updates are done in point batches of given size
function ClusterDistances(metric::SemiMetric,
assignments::AbstractVector{<:Integer},
points::AbstractMatrix{<:Real},
batch_size::Union{Integer, Nothing} = nothing)
update!(ClusterDistances(eltype(points), metric, length(assignments), size(points, 1),
maximum(assignments)),
assignments, points, batch_size)
end

ClusterDistances(metric, R::ClusteringResult, args...) =
ClusterDistances(metric, assignments(R), args...)

# fallback implementations of ClusteringDistances methods

cluster_sizes(dists::ClusterDistances) = dists.cluster_sizes
nclusters(dists::ClusterDistances) = length(cluster_sizes(dists))

update!(dists::ClusterDistances,
assignments::AbstractVector, points::AbstractMatrix) =
error("update!(dists::$(typeof(dists))) not implemented")

sumdistances(dists::ClusterDistances,
points::Union{AbstractMatrix, Nothing},
indices::Union{AbstractVector{<:Integer}, Nothing}) =
error("sumdistances(dists::$(typeof(dists))) not implemented")

# average distances from each cluster to each point, nclusters×n matrix
function meandistances(dists::ClusterDistances,
assignments::AbstractVector{<:Integer},
points::Union{AbstractMatrix, Nothing},
indices::AbstractVector{<:Integer})
@assert length(assignments) == length(indices)
(points === nothing) || @assert(size(points, 2) == length(indices))
clu_to_pt = sumdistances(dists, points, indices)
clu_sizes = cluster_sizes(dists)
@assert length(assignments) == length(indices)
@assert size(clu_to_pt) == (length(clu_sizes), length(assignments))

# normalize distances by cluster sizes
@inbounds for j in eachindex(assignments)
for (i, c) in enumerate(clu_sizes)
if i == assignments[j]
c -= 1
end
if c == 0
clu_to_pt[i,j] = 0
else
clu_to_pt[i,j] /= c
end
end
end
return clu_to_pt
end

# wrapper for ClusteringResult
update!(dists::ClusterDistances, R::ClusteringResult, args...) =
update!(dists, assignments(R), args...)

# batch-update silhouette dists (splitting the points into chunks of batch_size size)
function update!(dists::ClusterDistances,
assignments::AbstractVector{<:Integer}, points::AbstractMatrix{<:Real},
batch_size::Union{Integer, Nothing})
n = size(points, 2)
((batch_size === nothing) || (n <= batch_size)) &&
return update!(dists, assignments, points)

for batch_start in 1:batch_size:n
batch_ixs = batch_start:min(batch_start + batch_size - 1, n)
update!(dists, view(assignments, batch_ixs), view(points, :, batch_ixs))
end
return dists
end

# generic ClusterDistances implementation for an arbitrary metric M
# if M is Nothing, point_dists is an arbitrary matrix of point distances
struct SimpleClusterDistances{M, T} <: ClusterDistances{T}
metric::M
cluster_sizes::Vector{Int}
assignments::Vector{Int}
point_dists::Matrix{T}

SimpleClusterDistances(::Type{T}, metric::M,
npoints::Integer, nclusters::Integer) where {M<:Union{SemiMetric, Nothing}, T<:Real} =
new{M, T}(metric, zeros(Int, nclusters), Vector{Int}(),
Matrix{T}(undef, npoints, npoints))

# reuse given points matrix
function SimpleClusterDistances(
metric::Nothing,
assignments::AbstractVector{<:Integer},
point_dists::AbstractMatrix{T}
) where T<:Real
n = length(assignments)
size(point_dists) == (n, n) || throw(DimensionMismatch("assignments length ($n) does not match distances matrix size ($(size(point_dists)))"))
issymmetric(point_dists) || throw(ArgumentError("point distances matrix must be symmetric"))
clu_sizes = zeros(Int, maximum(assignments))
@inbounds for cluster in assignments
clu_sizes[cluster] += 1
end
new{Nothing, T}(metric, clu_sizes, assignments, point_dists)
end
end

# fallback ClusterDistances constructor
ClusterDistances(::Type{T}, metric::Union{SemiMetric, Nothing},
npoints::Union{Integer, Nothing}, dims::Integer, nclusters::Integer) where T<:Real =
SimpleClusterDistances(T, metric, npoints, nclusters)

# when metric is nothing, points is the matrix of distances
function ClusterDistances(metric::Nothing,
assignments::AbstractVector{<:Integer},
points::AbstractMatrix,
batch_size::Union{Integer, Nothing} = nothing)
(batch_size === nothing) || (size(points, 2) > batch_size) ||
error("batch-updates of distance matrix-based ClusterDistances not supported")
SimpleClusterDistances(metric, assignments, points)
end

function update!(dists::SimpleClusterDistances{M},
assignments::AbstractVector{<:Integer},
points::AbstractMatrix{<:Real}) where M
@assert length(assignments) == size(points, 2)
check_assignments(assignments, nclusters(dists))
append!(dists.assignments, assignments)
n = size(dists.point_dists, 1)
length(dists.assignments) == n ||
error("$(typeof(dists)) does not support batch updates: $(length(assignments)) points given, $n expected")
@inbounds for cluster in assignments
dists.cluster_sizes[cluster] += 1
end

if M === Nothing
size(point_dists) == (n, n) ||
throw(DimensionMismatch("points should be a point-to-point distances matrix of ($n, $n) size, $(size(points)) given"))
copy!(dists.point_dists, point_dists)
else
# metric-based SimpleClusterDistances does not support batched updates
size(points, 2) == n ||
throw(DimensionMismatch("points should be a point coordinates matrix with $n columns, $(size(points, 2)) found"))
pairwise!(dists.metric, dists.point_dists, points, dims=2)
end

return dists
end

# this function returns matrix r nclusters x n, such that
# r[i, j] is the sum of distances from all i-th cluster points to the point indices[j]
function sumdistances(dists::SimpleClusterDistances,
points::Union{AbstractMatrix, Nothing}, # unused as distances are already in point_dists
indices::AbstractVector{<:Integer})
T = eltype(dists.point_dists)
n = length(dists.assignments)
S = typeof((one(T)+one(T))/2)
r = zeros(S, nclusters(dists), n)
@inbounds for (jj, j) in enumerate(indices)
for i = 1:j-1
r[dists.assignments[i], jj] += dists.point_dists[i,j]
end
for i = j+1:n
r[dists.assignments[i], jj] += dists.point_dists[i,j]
end
end
return r
end

# uses the method from "Distributed Silhouette Algorithm: Evaluating Clustering on Big Data"
# https://arxiv.org/abs/2303.14102
# for SqEuclidean point distances
struct SqEuclideanClusterDistances{T} <: ClusterDistances{T}
cluster_sizes::Vector{Int} # [nclusters]
Y::Matrix{T} # [dims, nclusters], the first moments of each cluster (sum of point coords)
Ψ::Vector{T} # [nclusters], the second moments of each cluster (sum of point coord squares)

SqEuclideanClusterDistances(::Type{T}, npoints::Union{Integer, Nothing}, dims::Integer,
nclusters::Integer) where T<:Real =
new{T}(zeros(Int, nclusters), zeros(T, dims, nclusters), zeros(T, nclusters))
end

ClusterDistances(::Type{T}, metric::SqEuclidean, npoints::Union{Integer, Nothing},
dims::Integer, nclusters::Integer) where T<:Real =
SqEuclideanClusterDistances(T, npoints, dims, nclusters)

function update!(dists::SqEuclideanClusterDistances,
assignments::AbstractVector{<:Integer},
points::AbstractMatrix{<:Real})
# x dims are [D,N]
d, n = size(points)
k = length(cluster_sizes(dists))
check_assignments(assignments, k)
n == length(assignments) || throw(DimensionMismatch("points count ($n) does not match assignments length $(length(assignments)))"))
d == size(dists.Y, 1) || throw(DimensionMismatch("points dims ($(size(points, 1))) do no must match ClusterDistances dims ($(size(dists.Y, 1)))"))
# precompute moments and update counts
@inbounds for (i, cluster) in enumerate(assignments)
point = view(points, :, i) # switch to eachslice() once Julia-1.0 support is dropped
dists.cluster_sizes[cluster] += 1
dists.Y[:, cluster] .+= point
dists.Ψ[cluster] += sum(abs2, point)
end
return dists
end

# sum distances from each cluster to each point in `points`, [nclusters, n]
function sumdistances(dists::SqEuclideanClusterDistances,
points::AbstractMatrix,
indices::AbstractVector{<:Integer})
@assert size(points, 2) == length(indices)
point_norms = sum(abs2, points; dims=1) # [1,n]
return dists.cluster_sizes .* point_norms .+
reshape(dists.Ψ, nclusters(dists), 1) .-
2 * (transpose(dists.Y) * points)
end
Loading

0 comments on commit 7c2b0c4

Please sign in to comment.