From a66145095f2f85c2768a6234042cd552cd0240ca Mon Sep 17 00:00:00 2001 From: Art Wild Date: Sat, 25 Dec 2021 14:27:43 -0500 Subject: [PATCH 1/4] fixed ARI (fixes #225 & #226) - added pair confusion matrix (CM) calculations - fixed ARI calculation using CM --- doc/source/validate.md | 10 ++++++++++ src/Clustering.jl | 6 +++++- src/confusion.jl | 42 ++++++++++++++++++++++++++++++++++++++++ src/randindex.jl | 38 ++++++++++++++---------------------- test/confusion.jl | 44 ++++++++++++++++++++++++++++++++++++++++++ test/randindex.jl | 5 +++++ test/runtests.jl | 3 ++- 7 files changed, 122 insertions(+), 26 deletions(-) create mode 100644 src/confusion.jl create mode 100644 test/confusion.jl diff --git a/doc/source/validate.md b/doc/source/validate.md index abd19881..bfffbfdf 100644 --- a/doc/source/validate.md +++ b/doc/source/validate.md @@ -99,3 +99,13 @@ the similarity of two different clusterings of a dataset. ```@docs mutualinfo ``` + +## Confusion matrix + +Pair [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) +arising from two clusterings is a 2x2 contingency table representation of +the partition co-occurrence table, see [`counts`](@ref). + +```@docs +confusion +``` diff --git a/src/Clustering.jl b/src/Clustering.jl index 2da54517..a3513a17 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -65,7 +65,10 @@ module Clustering Hclust, hclust, cutree, # MCL - mcl, MCLResult + mcl, MCLResult, + + # pair confusion matrix + confusion ## source files @@ -85,6 +88,7 @@ module Clustering include("varinfo.jl") include("vmeasure.jl") include("mutualinfo.jl") + include("confusion.jl") include("hclust.jl") diff --git a/src/confusion.jl b/src/confusion.jl new file mode 100644 index 00000000..eb0af892 --- /dev/null +++ b/src/confusion.jl @@ -0,0 +1,42 @@ +""" + confusion(a::ClusteringResult, b::ClusteringResult) -> Matrix{Int} + confusion(a::ClusteringResult, b::AbstractVector{<:Integer}) -> Matrix{Int} + confusion(a::AbstractVector{<:Integer}, b::ClusteringResult) -> Matrix{Int} + confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) -> Matrix{Int} + +Return 2x2 confusion matrix `C` that represents partition co-occurrence or +similarity matrix between two clusterings by considering all pairs of samples +and counting pairs that are assigned into the same or into different clusters +under the true and predicted clusterings. + +Considering a pair of samples that is in the same group as a **positive pair**, +and a pair is in the different group as a **negative pair**, then the count of +true positives is `C₀₀`, false negatives is `C₀₁`, false positives `C₁₀`, and +true negatives is `C₁₁`: + +| | Positive | Negative | +|:--:|:-:|:-:| +|Positive|C₀₀|C₁₀| +|Negative|C₀₁|C₁₁| +""" +function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) + c = counts(a, b) + + n = sum(c) + nis = sum(abs2, sum(c, dims=2)) # sum of squares of sums of rows + njs = sum(abs2, sum(c, dims=1)) # sum of squares of sums of columns + + t2 = sum(abs2, c) # sum over rows & columns of nij^2 + t3 = nis+njs + #println("n: $n") + C = Int[(t2-n)/2 (nis-t2)/2; (njs-t2)/2 (t2+n^2-t3)/2] + #println("C: $C") + return C +end +confusion(a::ClusteringResult, b::ClusteringResult) = + confusion(assignments(a), assignments(b)) +confusion(a::AbstractVector{<:Integer}, b::ClusteringResult) = + confusion(a, assignments(b)) +confusion(a::ClusteringResult, b::AbstractVector{<:Integer}) = + confusion(assignments(a), b) + diff --git a/src/randindex.jl b/src/randindex.jl index 23576a31..c21afb03 100644 --- a/src/randindex.jl +++ b/src/randindex.jl @@ -18,35 +18,25 @@ Returns a tuple of indices: > Meila, Marina (2003). *Comparing Clusterings by the Variation of > Information.* Learning Theory and Kernel Machines: 173–187. + +> Steinley, Douglas (2004). *Properties of the Hubert–Arabie Adjusted +> Rand Index.* Psychological Methods, Vol. 9, No. 3: 386-396 """ function randindex(a, b) - c = counts(a, b) - - n = sum(c) - nis = sum(abs2, sum(c, dims=2)) # sum of squares of sums of rows - njs = sum(abs2, sum(c, dims=1)) # sum of squares of sums of columns - - t1 = binomial(n, 2) # total number of pairs of entities - t2 = sum(abs2, c) # sum over rows & columnns of nij^2 - t3 = .5*(nis+njs) - - # Expected index (for adjustment) - nc = (n*(n^2+1)-(n+1)*nis-(n+1)*njs+2*(nis*njs)/n)/(2*(n-1)) + a, c, b, d = confusion(a,b) # Table 2 from Steinley 2004 - A = t1+t2-t3; # agreements count - D = -t2+t3; # disagreements count + t = a+ b + c + d # total number of pairs of entities + A = a + d + D = b + c - if t1 == nc - # avoid division by zero; if k=1, define Rand = 0 - ARI = 0 - else - # adjusted Rand - Hubert & Arabie 1985 - ARI = (A-nc)/(t1-nc) - end + # expected index + ERI = (a+b)*(a+c)+(c+d)*(b+d) + # adjusted Rand - Hubert & Arabie 1985 + ARI = D == 0 ? 1.0 : (t*A-ERI)/(t*t-ERI) # (9) from Steinley 2004 - RI = A/t1 # Rand 1971 # Probability of agreement - MI = D/t1 # Mirkin 1970 # p(disagreement) - HI = (A-D)/t1 # Hubert 1977 # p(agree)-p(disagree) + RI = A/t # Rand 1971 # Probability of agreement + MI = D/t # Mirkin 1970 # p(disagreement) + HI = (A-D)/t # Hubert 1977 # p(agree)-p(disagree) return (ARI, RI, MI, HI) end diff --git a/test/confusion.jl b/test/confusion.jl new file mode 100644 index 00000000..24c931ac --- /dev/null +++ b/test/confusion.jl @@ -0,0 +1,44 @@ +# Test confusion matrix + +using Test +using Clustering + +@testset "confusion() (Confusion matrix)" begin + + @testset "small size tests" begin + @test confusion([0,0,0], [0,0,0]) == [3 0; 0 0] + @test confusion([0,0,1], [0,0,0]) == [1 0; 2 0] + @test confusion([0,1,1], [0,0,0]) == [1 0; 2 0] + @test confusion([1,1,1], [0,0,0]) == [3 0; 0 0] + + @test confusion([0,0,0], [0,0,1]) == [1 2; 0 0] + @test confusion([0,0,1], [0,0,1]) == [1 0; 0 2] + @test confusion([0,1,1], [0,0,1]) == [0 1; 1 1] + @test confusion([1,1,1], [0,0,1]) == [1 2; 0 0] + + @test confusion([0,0,0], [0,1,1]) == [1 2; 0 0] + @test confusion([0,0,1], [0,1,1]) == [0 1; 1 1] + @test confusion([0,1,1], [0,1,1]) == [1 0; 0 2] + @test confusion([1,1,1], [0,1,1]) == [1 2; 0 0] + + @test confusion([0,0,0], [1,1,1]) == [3 0; 0 0] + @test confusion([0,0,1], [1,1,1]) == [1 0; 2 0] + @test confusion([0,1,1], [1,1,1]) == [1 0; 2 0] + @test confusion([1,1,1], [1,1,1]) == [3 0; 0 0] + end + + @testset "comparing 2 k-means clusterings" begin + m = 3 + n = 100 + k = 1 + x = rand(m, n) + + # non-weighted + r1 = kmeans(x, k; maxiter=5) + r2 = kmeans(x, k; maxiter=5) + C = confusion(r1, r2) + @test C == [n*(n-1)/2 0; 0 0] + end + +end + diff --git a/test/randindex.jl b/test/randindex.jl index d3b83033..ad9b4c6f 100644 --- a/test/randindex.jl +++ b/test/randindex.jl @@ -34,4 +34,9 @@ a3 = [3, 3, 3, 2, 2, 2, 1, 1, 1, 1] @test randindex(a1, a2) == randindex(a2, a1) +@test randindex(ones(Int, 3), ones(Int, 3)) == (1, 1, 0, 1) + +a,b = rand(1:5,100_000), rand(1:5,100_000) +@test randindex(a,b)[1] < 1.0e-2 + end diff --git a/test/runtests.jl b/test/runtests.jl index 1f9d483a..9eaca2ed 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,7 +19,8 @@ tests = ["seeding", "hclust", "mcl", "vmeasure", - "mutualinfo"] + "mutualinfo", + "confusion"] println("Runing tests:") for t in tests From f773f92b0be14964a8ed2b9c3d7a83f422d5d92b Mon Sep 17 00:00:00 2001 From: wildart Date: Fri, 23 Sep 2022 14:38:44 -0400 Subject: [PATCH 2/4] fix test for x86 --- test/randindex.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/randindex.jl b/test/randindex.jl index ad9b4c6f..02240182 100644 --- a/test/randindex.jl +++ b/test/randindex.jl @@ -36,7 +36,7 @@ a3 = [3, 3, 3, 2, 2, 2, 1, 1, 1, 1] @test randindex(ones(Int, 3), ones(Int, 3)) == (1, 1, 0, 1) -a,b = rand(1:5,100_000), rand(1:5,100_000) +a,b = rand(1:5,10_000), rand(1:5,10_000) @test randindex(a,b)[1] < 1.0e-2 end From 563014a35fb47febb2d924f66464f141bce61e39 Mon Sep 17 00:00:00 2001 From: wildart Date: Fri, 23 Sep 2022 14:41:29 -0400 Subject: [PATCH 3/4] remove commented code --- src/confusion.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/confusion.jl b/src/confusion.jl index eb0af892..0832122c 100644 --- a/src/confusion.jl +++ b/src/confusion.jl @@ -28,9 +28,7 @@ function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) t2 = sum(abs2, c) # sum over rows & columns of nij^2 t3 = nis+njs - #println("n: $n") C = Int[(t2-n)/2 (nis-t2)/2; (njs-t2)/2 (t2+n^2-t3)/2] - #println("C: $C") return C end confusion(a::ClusteringResult, b::ClusteringResult) = From 945c5376202db1be48f78b8a060b52d28596a5ee Mon Sep 17 00:00:00 2001 From: wildart Date: Fri, 23 Sep 2022 20:45:50 -0400 Subject: [PATCH 4/4] PR corrections --- doc/source/validate.md | 4 ++-- src/confusion.jl | 21 ++++++++++----------- src/randindex.jl | 16 ++++++++-------- test/randindex.jl | 4 ++-- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/doc/source/validate.md b/doc/source/validate.md index bfffbfdf..be57af62 100644 --- a/doc/source/validate.md +++ b/doc/source/validate.md @@ -103,8 +103,8 @@ mutualinfo ## Confusion matrix Pair [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) -arising from two clusterings is a 2x2 contingency table representation of -the partition co-occurrence table, see [`counts`](@ref). +arising from two clusterings is a 2×2 contingency table representation of +the partition co-occurrence, see [`counts`](@ref). ```@docs confusion diff --git a/src/confusion.jl b/src/confusion.jl index 0832122c..0aabb97d 100644 --- a/src/confusion.jl +++ b/src/confusion.jl @@ -1,23 +1,21 @@ """ - confusion(a::ClusteringResult, b::ClusteringResult) -> Matrix{Int} - confusion(a::ClusteringResult, b::AbstractVector{<:Integer}) -> Matrix{Int} - confusion(a::AbstractVector{<:Integer}, b::ClusteringResult) -> Matrix{Int} - confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) -> Matrix{Int} + confusion(a::Union{ClusteringResult, AbstractVector}, + b::Union{ClusteringResult, AbstractVector}) -> Matrix{Int} -Return 2x2 confusion matrix `C` that represents partition co-occurrence or +Return 2×2 confusion matrix `C` that represents partition co-occurrence or similarity matrix between two clusterings by considering all pairs of samples and counting pairs that are assigned into the same or into different clusters under the true and predicted clusterings. Considering a pair of samples that is in the same group as a **positive pair**, and a pair is in the different group as a **negative pair**, then the count of -true positives is `C₀₀`, false negatives is `C₀₁`, false positives `C₁₀`, and -true negatives is `C₁₁`: +true positives is `C₁₁`, false negatives is `C₁₂`, false positives `C₂₁`, and +true negatives is `C₂₂`: | | Positive | Negative | |:--:|:-:|:-:| -|Positive|C₀₀|C₁₀| -|Negative|C₀₁|C₁₁| +|Positive|C₁₁|C₁₂| +|Negative|C₂₁|C₂₂| """ function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) c = counts(a, b) @@ -27,10 +25,11 @@ function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) njs = sum(abs2, sum(c, dims=1)) # sum of squares of sums of columns t2 = sum(abs2, c) # sum over rows & columns of nij^2 - t3 = nis+njs - C = Int[(t2-n)/2 (nis-t2)/2; (njs-t2)/2 (t2+n^2-t3)/2] + t3 = nis + njs + C = [(t2 - n)÷2 (nis - t2)÷2; (njs - t2)÷2 (t2 + n^2 - t3)÷2] return C end + confusion(a::ClusteringResult, b::ClusteringResult) = confusion(assignments(a), assignments(b)) confusion(a::AbstractVector{<:Integer}, b::ClusteringResult) = diff --git a/src/randindex.jl b/src/randindex.jl index c21afb03..3cc146f2 100644 --- a/src/randindex.jl +++ b/src/randindex.jl @@ -14,23 +14,23 @@ Returns a tuple of indices: # References > Lawrence Hubert and Phipps Arabie (1985). *Comparing partitions.* -> Journal of Classification 2 (1): 193–218 +> Journal of Classification 2 (1): 193-218 > Meila, Marina (2003). *Comparing Clusterings by the Variation of -> Information.* Learning Theory and Kernel Machines: 173–187. +> Information.* Learning Theory and Kernel Machines: 173-187. -> Steinley, Douglas (2004). *Properties of the Hubert–Arabie Adjusted +> Steinley, Douglas (2004). *Properties of the Hubert-Arabie Adjusted > Rand Index.* Psychological Methods, Vol. 9, No. 3: 386-396 """ function randindex(a, b) - a, c, b, d = confusion(a,b) # Table 2 from Steinley 2004 + c11, c21, c12, c22 = confusion(a, b) # Table 2 from Steinley 2004 - t = a+ b + c + d # total number of pairs of entities - A = a + d - D = b + c + t = c11 + c12 + c21 + c22 # total number of pairs of entities + A = c11 + c22 + D = c12 + c21 # expected index - ERI = (a+b)*(a+c)+(c+d)*(b+d) + ERI = (c11+c12)*(c11+c21)+(c21+c22)*(c12+c22) # adjusted Rand - Hubert & Arabie 1985 ARI = D == 0 ? 1.0 : (t*A-ERI)/(t*t-ERI) # (9) from Steinley 2004 diff --git a/test/randindex.jl b/test/randindex.jl index 02240182..c6ad3de4 100644 --- a/test/randindex.jl +++ b/test/randindex.jl @@ -36,7 +36,7 @@ a3 = [3, 3, 3, 2, 2, 2, 1, 1, 1, 1] @test randindex(ones(Int, 3), ones(Int, 3)) == (1, 1, 0, 1) -a,b = rand(1:5,10_000), rand(1:5,10_000) -@test randindex(a,b)[1] < 1.0e-2 +a, b = rand(1:5, 10_000), rand(1:5, 10_000) +@test randindex(a, b)[1] < 1.0e-2 end