From a66145095f2f85c2768a6234042cd552cd0240ca Mon Sep 17 00:00:00 2001
From: Art Wild <wildart@gmail.com>
Date: Sat, 25 Dec 2021 14:27:43 -0500
Subject: [PATCH 1/4] fixed ARI (fixes #225 & #226) - added pair confusion
 matrix (CM) calculations - fixed ARI calculation using CM

---
 doc/source/validate.md | 10 ++++++++++
 src/Clustering.jl      |  6 +++++-
 src/confusion.jl       | 42 ++++++++++++++++++++++++++++++++++++++++
 src/randindex.jl       | 38 ++++++++++++++----------------------
 test/confusion.jl      | 44 ++++++++++++++++++++++++++++++++++++++++++
 test/randindex.jl      |  5 +++++
 test/runtests.jl       |  3 ++-
 7 files changed, 122 insertions(+), 26 deletions(-)
 create mode 100644 src/confusion.jl
 create mode 100644 test/confusion.jl

diff --git a/doc/source/validate.md b/doc/source/validate.md
index abd19881..bfffbfdf 100644
--- a/doc/source/validate.md
+++ b/doc/source/validate.md
@@ -99,3 +99,13 @@ the similarity of two different clusterings of a dataset.
 ```@docs
 mutualinfo
 ```
+
+## Confusion matrix
+
+Pair [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix)
+arising from two clusterings is a 2x2 contingency table representation of
+the partition co-occurrence table, see [`counts`](@ref).
+
+```@docs
+confusion
+```
diff --git a/src/Clustering.jl b/src/Clustering.jl
index 2da54517..a3513a17 100644
--- a/src/Clustering.jl
+++ b/src/Clustering.jl
@@ -65,7 +65,10 @@ module Clustering
     Hclust, hclust, cutree,
 
     # MCL
-    mcl, MCLResult
+    mcl, MCLResult,
+
+    # pair confusion matrix
+    confusion
 
     ## source files
 
@@ -85,6 +88,7 @@ module Clustering
     include("varinfo.jl")
     include("vmeasure.jl")
     include("mutualinfo.jl")
+    include("confusion.jl")
 
     include("hclust.jl")
 
diff --git a/src/confusion.jl b/src/confusion.jl
new file mode 100644
index 00000000..eb0af892
--- /dev/null
+++ b/src/confusion.jl
@@ -0,0 +1,42 @@
+"""
+    confusion(a::ClusteringResult, b::ClusteringResult) -> Matrix{Int}
+    confusion(a::ClusteringResult, b::AbstractVector{<:Integer}) -> Matrix{Int}
+    confusion(a::AbstractVector{<:Integer}, b::ClusteringResult) -> Matrix{Int}
+    confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) -> Matrix{Int}
+
+Return 2x2 confusion matrix `C` that represents partition co-occurrence or
+similarity matrix between two clusterings by considering all pairs of samples
+and counting  pairs that are assigned into the same or into different clusters
+under the true and predicted clusterings.
+
+Considering a pair of samples that is in the same group as a **positive pair**,
+and a pair is in the different group as a **negative pair**, then the count of
+true positives is `C₀₀`, false negatives is `C₀₁`, false positives `C₁₀`, and
+true negatives is `C₁₁`:
+
+|  | Positive | Negative |
+|:--:|:-:|:-:|
+|Positive|C₀₀|C₁₀|
+|Negative|C₀₁|C₁₁|
+"""
+function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer})
+    c = counts(a, b)
+
+    n = sum(c)
+    nis = sum(abs2, sum(c, dims=2))        # sum of squares of sums of rows
+    njs = sum(abs2, sum(c, dims=1))        # sum of squares of sums of columns
+
+    t2 = sum(abs2, c)                      # sum over rows & columns of nij^2
+    t3 = nis+njs
+    #println("n: $n")
+    C = Int[(t2-n)/2 (nis-t2)/2; (njs-t2)/2 (t2+n^2-t3)/2]
+    #println("C: $C")
+    return C
+end
+confusion(a::ClusteringResult, b::ClusteringResult) =
+    confusion(assignments(a), assignments(b))
+confusion(a::AbstractVector{<:Integer}, b::ClusteringResult) =
+    confusion(a, assignments(b))
+confusion(a::ClusteringResult, b::AbstractVector{<:Integer}) =
+    confusion(assignments(a), b)
+
diff --git a/src/randindex.jl b/src/randindex.jl
index 23576a31..c21afb03 100644
--- a/src/randindex.jl
+++ b/src/randindex.jl
@@ -18,35 +18,25 @@ Returns a tuple of indices:
 
 > Meila, Marina (2003). *Comparing Clusterings by the Variation of
 > Information.* Learning Theory and Kernel Machines: 173–187.
+
+> Steinley, Douglas (2004). *Properties of the Hubert–Arabie Adjusted
+> Rand Index.* Psychological Methods, Vol. 9, No. 3: 386-396
 """
 function randindex(a, b)
-    c = counts(a, b)
-
-    n = sum(c)
-    nis = sum(abs2, sum(c, dims=2))        # sum of squares of sums of rows
-    njs = sum(abs2, sum(c, dims=1))        # sum of squares of sums of columns
-
-    t1 = binomial(n, 2)                    # total number of pairs of entities
-    t2 = sum(abs2, c)                      # sum over rows & columnns of nij^2
-    t3 = .5*(nis+njs)
-
-    # Expected index (for adjustment)
-    nc = (n*(n^2+1)-(n+1)*nis-(n+1)*njs+2*(nis*njs)/n)/(2*(n-1))
+    a, c, b, d = confusion(a,b) # Table 2 from Steinley 2004
 
-    A = t1+t2-t3;        # agreements count
-    D = -t2+t3;          # disagreements count
+    t = a+ b + c + d   # total number of pairs of entities
+    A = a + d
+    D = b + c
 
-    if t1 == nc
-        # avoid division by zero; if k=1, define Rand = 0
-        ARI = 0
-    else
-        # adjusted Rand - Hubert & Arabie 1985
-        ARI = (A-nc)/(t1-nc)
-    end
+    # expected index
+    ERI = (a+b)*(a+c)+(c+d)*(b+d)
+    # adjusted Rand - Hubert & Arabie 1985
+    ARI = D == 0 ? 1.0 : (t*A-ERI)/(t*t-ERI) # (9) from Steinley 2004
 
-    RI = A/t1            # Rand 1971      # Probability of agreement
-    MI = D/t1            # Mirkin 1970    # p(disagreement)
-    HI = (A-D)/t1        # Hubert 1977    # p(agree)-p(disagree)
+    RI = A/t            # Rand 1971      # Probability of agreement
+    MI = D/t            # Mirkin 1970    # p(disagreement)
+    HI = (A-D)/t        # Hubert 1977    # p(agree)-p(disagree)
 
     return (ARI, RI, MI, HI)
 end
diff --git a/test/confusion.jl b/test/confusion.jl
new file mode 100644
index 00000000..24c931ac
--- /dev/null
+++ b/test/confusion.jl
@@ -0,0 +1,44 @@
+# Test confusion matrix
+
+using Test
+using Clustering
+
+@testset "confusion() (Confusion matrix)" begin
+
+    @testset "small size tests" begin
+        @test confusion([0,0,0], [0,0,0]) == [3 0; 0 0]
+        @test confusion([0,0,1], [0,0,0]) == [1 0; 2 0]
+        @test confusion([0,1,1], [0,0,0]) == [1 0; 2 0]
+        @test confusion([1,1,1], [0,0,0]) == [3 0; 0 0]
+        
+        @test confusion([0,0,0], [0,0,1]) == [1 2; 0 0]
+        @test confusion([0,0,1], [0,0,1]) == [1 0; 0 2]
+        @test confusion([0,1,1], [0,0,1]) == [0 1; 1 1]
+        @test confusion([1,1,1], [0,0,1]) == [1 2; 0 0]
+        
+        @test confusion([0,0,0], [0,1,1]) == [1 2; 0 0]
+        @test confusion([0,0,1], [0,1,1]) == [0 1; 1 1]
+        @test confusion([0,1,1], [0,1,1]) == [1 0; 0 2]
+        @test confusion([1,1,1], [0,1,1]) == [1 2; 0 0]
+        
+        @test confusion([0,0,0], [1,1,1]) == [3 0; 0 0]
+        @test confusion([0,0,1], [1,1,1]) == [1 0; 2 0]
+        @test confusion([0,1,1], [1,1,1]) == [1 0; 2 0]
+        @test confusion([1,1,1], [1,1,1]) == [3 0; 0 0]
+    end
+
+    @testset "comparing 2 k-means clusterings" begin
+        m = 3
+        n = 100
+        k = 1
+        x = rand(m, n)
+
+        # non-weighted
+        r1 = kmeans(x, k; maxiter=5)
+        r2 = kmeans(x, k; maxiter=5)
+        C = confusion(r1, r2)
+        @test C == [n*(n-1)/2 0; 0 0]
+    end
+
+end
+
diff --git a/test/randindex.jl b/test/randindex.jl
index d3b83033..ad9b4c6f 100644
--- a/test/randindex.jl
+++ b/test/randindex.jl
@@ -34,4 +34,9 @@ a3 = [3, 3, 3, 2, 2, 2, 1, 1, 1, 1]
 
 @test randindex(a1, a2) == randindex(a2, a1)
 
+@test randindex(ones(Int, 3), ones(Int, 3)) == (1, 1, 0, 1)
+
+a,b = rand(1:5,100_000), rand(1:5,100_000)
+@test randindex(a,b)[1] < 1.0e-2
+
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 1f9d483a..9eaca2ed 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -19,7 +19,8 @@ tests = ["seeding",
          "hclust",
          "mcl",
          "vmeasure",
-         "mutualinfo"]
+         "mutualinfo",
+         "confusion"]
 
 println("Runing tests:")
 for t in tests

From f773f92b0be14964a8ed2b9c3d7a83f422d5d92b Mon Sep 17 00:00:00 2001
From: wildart <wildart@gmail.com>
Date: Fri, 23 Sep 2022 14:38:44 -0400
Subject: [PATCH 2/4] fix test for x86

---
 test/randindex.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/randindex.jl b/test/randindex.jl
index ad9b4c6f..02240182 100644
--- a/test/randindex.jl
+++ b/test/randindex.jl
@@ -36,7 +36,7 @@ a3 = [3, 3, 3, 2, 2, 2, 1, 1, 1, 1]
 
 @test randindex(ones(Int, 3), ones(Int, 3)) == (1, 1, 0, 1)
 
-a,b = rand(1:5,100_000), rand(1:5,100_000)
+a,b = rand(1:5,10_000), rand(1:5,10_000)
 @test randindex(a,b)[1] < 1.0e-2
 
 end

From 563014a35fb47febb2d924f66464f141bce61e39 Mon Sep 17 00:00:00 2001
From: wildart <wildart@gmail.com>
Date: Fri, 23 Sep 2022 14:41:29 -0400
Subject: [PATCH 3/4] remove commented code

---
 src/confusion.jl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/confusion.jl b/src/confusion.jl
index eb0af892..0832122c 100644
--- a/src/confusion.jl
+++ b/src/confusion.jl
@@ -28,9 +28,7 @@ function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer})
 
     t2 = sum(abs2, c)                      # sum over rows & columns of nij^2
     t3 = nis+njs
-    #println("n: $n")
     C = Int[(t2-n)/2 (nis-t2)/2; (njs-t2)/2 (t2+n^2-t3)/2]
-    #println("C: $C")
     return C
 end
 confusion(a::ClusteringResult, b::ClusteringResult) =

From 945c5376202db1be48f78b8a060b52d28596a5ee Mon Sep 17 00:00:00 2001
From: wildart <wildart@gmail.com>
Date: Fri, 23 Sep 2022 20:45:50 -0400
Subject: [PATCH 4/4] PR corrections

---
 doc/source/validate.md |  4 ++--
 src/confusion.jl       | 21 ++++++++++-----------
 src/randindex.jl       | 16 ++++++++--------
 test/randindex.jl      |  4 ++--
 4 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/doc/source/validate.md b/doc/source/validate.md
index bfffbfdf..be57af62 100644
--- a/doc/source/validate.md
+++ b/doc/source/validate.md
@@ -103,8 +103,8 @@ mutualinfo
 ## Confusion matrix
 
 Pair [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix)
-arising from two clusterings is a 2x2 contingency table representation of
-the partition co-occurrence table, see [`counts`](@ref).
+arising from two clusterings is a 2×2 contingency table representation of
+the partition co-occurrence, see [`counts`](@ref).
 
 ```@docs
 confusion
diff --git a/src/confusion.jl b/src/confusion.jl
index 0832122c..0aabb97d 100644
--- a/src/confusion.jl
+++ b/src/confusion.jl
@@ -1,23 +1,21 @@
 """
-    confusion(a::ClusteringResult, b::ClusteringResult) -> Matrix{Int}
-    confusion(a::ClusteringResult, b::AbstractVector{<:Integer}) -> Matrix{Int}
-    confusion(a::AbstractVector{<:Integer}, b::ClusteringResult) -> Matrix{Int}
-    confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) -> Matrix{Int}
+    confusion(a::Union{ClusteringResult, AbstractVector},
+              b::Union{ClusteringResult, AbstractVector}) -> Matrix{Int}
 
-Return 2x2 confusion matrix `C` that represents partition co-occurrence or
+Return 2×2 confusion matrix `C` that represents partition co-occurrence or
 similarity matrix between two clusterings by considering all pairs of samples
 and counting  pairs that are assigned into the same or into different clusters
 under the true and predicted clusterings.
 
 Considering a pair of samples that is in the same group as a **positive pair**,
 and a pair is in the different group as a **negative pair**, then the count of
-true positives is `C₀₀`, false negatives is `C₀₁`, false positives `C₁₀`, and
-true negatives is `C₁₁`:
+true positives is `C₁₁`, false negatives is `C₁₂`, false positives is `C₂₁`, and
+true negatives is `C₂₂`:
 
 |  | Positive | Negative |
 |:--:|:-:|:-:|
-|Positive|C₀₀|C₁₀|
-|Negative|C₀₁|C₁₁|
+|Positive|C₁₁|C₁₂|
+|Negative|C₂₁|C₂₂|
 """
 function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer})
     c = counts(a, b)
@@ -25,10 +25,11 @@ function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer})
     njs = sum(abs2, sum(c, dims=1))        # sum of squares of sums of columns
 
     t2 = sum(abs2, c)                      # sum over rows & columns of nij^2
-    t3 = nis+njs
-    C = Int[(t2-n)/2 (nis-t2)/2; (njs-t2)/2 (t2+n^2-t3)/2]
+    t3 = nis + njs                         # row + column sums of squares combined
+    C = [(t2 - n)÷2 (nis - t2)÷2; (njs - t2)÷2 (t2 + n^2 - t3)÷2]  # [TP FN; FP TN]
     return C
 end
+
 confusion(a::ClusteringResult, b::ClusteringResult) =
     confusion(assignments(a), assignments(b))
 confusion(a::AbstractVector{<:Integer}, b::ClusteringResult) =
diff --git a/src/randindex.jl b/src/randindex.jl
index c21afb03..3cc146f2 100644
--- a/src/randindex.jl
+++ b/src/randindex.jl
@@ -14,23 +14,23 @@ Returns a tuple of indices:
 
 # References
 > Lawrence Hubert and Phipps Arabie (1985). *Comparing partitions.*
-> Journal of Classification 2 (1): 193–218
+> Journal of Classification 2 (1): 193-218
 
 > Meila, Marina (2003). *Comparing Clusterings by the Variation of
-> Information.* Learning Theory and Kernel Machines: 173–187.
+> Information.* Learning Theory and Kernel Machines: 173-187.
 
-> Steinley, Douglas (2004). *Properties of the Hubert–Arabie Adjusted
+> Steinley, Douglas (2004). *Properties of the Hubert-Arabie Adjusted
 > Rand Index.* Psychological Methods, Vol. 9, No. 3: 386-396
 """
 function randindex(a, b)
-    a, c, b, d = confusion(a,b) # Table 2 from Steinley 2004
+    c11, c21, c12, c22 = float.(confusion(a, b)) # Table 2 from Steinley 2004
 
-    t = a+ b + c + d   # total number of pairs of entities
-    A = a + d
-    D = b + c
+    t = c11 + c12 + c21 + c22   # total number of pairs; Float64, as t*t can overflow Int
+    A = c11 + c22               # agreements count
+    D = c12 + c21               # disagreements count
 
     # expected index
-    ERI = (a+b)*(a+c)+(c+d)*(b+d)
+    ERI = (c11+c12)*(c11+c21)+(c21+c22)*(c12+c22)
     # adjusted Rand - Hubert & Arabie 1985
     ARI = D == 0 ? 1.0 : (t*A-ERI)/(t*t-ERI) # (9) from Steinley 2004
 
diff --git a/test/randindex.jl b/test/randindex.jl
index 02240182..c6ad3de4 100644
--- a/test/randindex.jl
+++ b/test/randindex.jl
@@ -36,7 +36,7 @@ a3 = [3, 3, 3, 2, 2, 2, 1, 1, 1, 1]
 
 @test randindex(ones(Int, 3), ones(Int, 3)) == (1, 1, 0, 1)
 
-a,b = rand(1:5,10_000), rand(1:5,10_000)
-@test randindex(a,b)[1] < 1.0e-2
+a, b = rand(1:5, 10_000), rand(1:5, 10_000)
+@test randindex(a, b)[1] < 1.0e-2
 
 end