From 0abe26b93d570173c14ab5428cc124ddd807ecab Mon Sep 17 00:00:00 2001
From: Samuel Ortion <samuel+git@ortion.fr>
Date: Thu, 28 Sep 2023 18:08:24 +0200
Subject: [PATCH] feat: Add execution trace dump for k-means

---
 .gitattributes                 |   1 +
 content/chapters/1.tex         |   6 ++
 figures/trace/kmeans-trace.pdf |   3 +
 figures/trace/kmeans-trace.tex |  12 +++
 kmeans-trace.txt               |  39 ++++++++++
 main.pdf                       |   4 +-
 scripts/kmeans_steps.lua       | 130 +++++++++++++++++++++++++++++++++
 7 files changed, 193 insertions(+), 2 deletions(-)
 create mode 100644 figures/trace/kmeans-trace.pdf
 create mode 100644 figures/trace/kmeans-trace.tex
 create mode 100644 kmeans-trace.txt
 create mode 100644 scripts/kmeans_steps.lua

diff --git a/.gitattributes b/.gitattributes
index defcde4..e3b1f6b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,2 @@
 main.pdf filter=lfs diff=lfs merge=lfs -text
+figures/trace/kmeans-trace.pdf filter=lfs diff=lfs merge=lfs -text
diff --git a/content/chapters/1.tex b/content/chapters/1.tex
index cf88ac7..c747019 100644
--- a/content/chapters/1.tex
+++ b/content/chapters/1.tex
@@ -161,6 +161,12 @@ The cost function is minimized:
   With $k=2$ and $m_{1} = 10 \cdot 10^{-2}$ and $m_{2} = 9 \cdot 10^{-2}$ the two initial randomly chosen means, run the $k$-means algorithm.
 \end{exercise}
 
+\begin{figure}
+  \centering
+  \includegraphics[scale=1]{figures/trace/kmeans-trace.pdf}
+  \caption{$k$-means algorithm execution trace. The tables show the value of the centroids and the distance of each gene expression value to the centroid.}
+\end{figure}
+
 \begin{figure}
   \centering
   \includegraphics[scale=1]{figures/plots/kmeans.pdf}
diff --git a/figures/trace/kmeans-trace.pdf b/figures/trace/kmeans-trace.pdf
new file mode 100644
index 0000000..79772c4
--- /dev/null
+++ b/figures/trace/kmeans-trace.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d222b5f866d2139fb1a3a9c05559d43a991d1fbedb4f787fe47fb0492e89f694
+size 21485
diff --git a/figures/trace/kmeans-trace.tex b/figures/trace/kmeans-trace.tex
new file mode 100644
index 0000000..f791525
--- /dev/null
+++ b/figures/trace/kmeans-trace.tex
@@ -0,0 +1,12 @@
+\documentclass[a4paper,margin=0.5cm,varwidth]{standalone}
+% Standalone figure: typesets the string returned by scripts/kmeans_steps.lua.
+\usepackage{luacode}
+% NOTE(review): the dofile() path below is relative, presumably to the directory the compiler runs in; confirm.
+\begin{document}
+
+\begin{luacode}
+local repr = dofile("../../scripts/kmeans_steps.lua")
+tex.print(repr)
+\end{luacode}
+
+\end{document}
\ No newline at end of file
diff --git a/kmeans-trace.txt b/kmeans-trace.txt
new file mode 100644
index 0000000..9c8b0b8
--- /dev/null
+++ b/kmeans-trace.txt
@@ -0,0 +1,39 @@
+Entry: 
+id, expression
+g1, 10
+g2, 12
+g3, 9
+g4, 15
+g5, 17
+g6, 18
+
+... ⨉ 10^{-2}
+
+Iterations:
+1. m1 = 10E-2, m2 = 9E-2.
+
+Let A, be the cluster / set of gene whose associated mean is m1, B the cluster for m2.
+
+A: g1, g2, g4, g5, g6
+B: g3
+
+m1 := (10+12+15+17+18) / 5 = 14.4
+m2 := 9
+
+2.
+
+A: g1, g3
+B: g2, g4, g5, g6
+
+m1 := (10 + 9) / 2 = 9.5
+m2 := (12 + 15 + 17 + 18) / 4 = (40 + 14 + 8) = 62 / 4 = 15.5
+
+3. 
+
+A: g1, g2, g3
+B: g4, g5, g6
+
+m1 := (10 + 12 + 9) / 3 = 31/3 = 10.333
+m2 := (13+17+18) / 3 = 48/3 = 16
+
+4. The following iteration will return the same clusters.
diff --git a/main.pdf b/main.pdf
index d3d41e2..5cbeef8 100644
--- a/main.pdf
+++ b/main.pdf
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:704862fba6f4c540a8327e3c662179eb52a50eb3d53c630a4cf932c7996b7246
-size 242454
+oid sha256:20c1f5ebd9bd1caa7e07a5b756ffae785090213d0ba69e9b912606881fb66cce
+size 264316
diff --git a/scripts/kmeans_steps.lua b/scripts/kmeans_steps.lua
new file mode 100644
index 0000000..711a5cd
--- /dev/null
+++ b/scripts/kmeans_steps.lua
@@ -0,0 +1,130 @@
+-- Input data for the k-means exercise.
+-- The exercise statement gives the expression values in units of 10^{-2};
+-- the script works on the unscaled numbers.
+local values = {
+    10, 12, 9,
+    15, 17, 18,
+}
+
+-- LaTeX labels "$g_1$" .. "$g_6$": labels[i] names the gene whose expression
+-- value is values[i]. They become the header row of each distance table.
+local labels = {}
+for gene_index = 1, #values do
+    labels[gene_index] = string.format("$g_%d$", gene_index)
+end
+
+-- Centroids fed to the first iteration (k = 2), in the same unscaled units
+-- as `values`. The main loop overwrites this variable after every step.
+local centroids = { 9, 10 }
+
+-- Left fold over the sequence `list`: acc = fn(acc, item) for each item in
+-- order. When `init` is nil or false, the first item seeds the accumulator
+-- instead of being passed to `fn`; an empty list returns `init` as given.
+table.reduce = function (list, fn, init)
+    -- `seeded` is true once `acc` holds a usable starting value.
+    local acc, seeded = init, not not init
+    for _, item in ipairs(list) do
+        if seeded then acc = fn(acc, item) else acc, seeded = item, true end
+    end
+    return acc
+end
+
+
+-- Distance between two scalars a and b: the absolute difference |a - b|
+-- (what the Manhattan distance reduces to in one dimension).
+local function manhattan_distance(a, b) return math.abs(a - b) end
+
+
+-- One k-means iteration: assign every value to its nearest centroid, then
+-- replace each centroid by the mean of the values assigned to it.
+--   distance:  function(a, b) returning a number
+--   centroids: sequence of current centroids; values: sequence of data points
+-- Returns { centroids = next centroids, clusters = values per centroid }, both
+-- indexed like `centroids`. A tie goes to the lowest centroid index.
+local function kmeans_step(distance, centroids, values)
+    local clusters, next_centroids = {}, {}
+    -- Every centroid gets a cluster up front: ipairs() below must not hit a hole.
+    for i in ipairs(centroids) do clusters[i] = {} end
+    for _, value in ipairs(values) do
+        local minimal_distance, closest_centroid_index
+        for centroid_index, centroid in ipairs(centroids) do
+            local d = distance(centroid, value)
+            if minimal_distance == nil or d < minimal_distance then
+                minimal_distance, closest_centroid_index = d, centroid_index
+            end
+        end
+        if closest_centroid_index ~= nil then -- nil only when there are no centroids
+            table.insert(clusters[closest_centroid_index], value)
+        end
+    end
+    for i, cluster in ipairs(clusters) do
+        -- An empty cluster has no mean: keep its previous centroid.
+        next_centroids[i] = centroids[i]
+        if #cluster > 0 then
+            next_centroids[i] = table.reduce(cluster, function (a, b) return a + b end) / #cluster
+        end
+    end
+    return { centroids = next_centroids, clusters = clusters }
+end
+
+-- Build a sequence holding `times` copies of `value` (empty when
+-- `times` is less than 1).
+table.rep = function (value, times)
+    local t = {} -- `local` is the fix: this used to assign the global `t`
+    for i = 1, times do t[i] = value end
+    return t
+end
+
+
+-- Render a LaTeX tabular of distances: a header row made of `labels`, then one
+-- row per centroid holding the centroid followed by its distance to each value.
+--   distance: function(a, b) returning a number; the other arguments are sequences.
+-- Returns the LaTeX source as a string; every number is formatted with "%.2f".
+local function print_distance_table(distance, labels, values, centroids)
+    local function round(n) return string.format("%.2f", n) end
+    -- One leading column for the centroid plus one per value. The spec used to
+    -- declare one more column than any row fills.
+    local parts = { [[\begin{tabular}{c]], string.rep("c", #values), "}\n & " }
+    for index = 1, #labels do
+        parts[#parts + 1] = " " .. labels[index] .. (index ~= #labels and " & " or "")
+    end
+    parts[#parts + 1] = " \\\\ \n "
+    for _, centroid in ipairs(centroids) do
+        parts[#parts + 1] = " " .. round(centroid)
+        for index = 1, #values do
+            parts[#parts + 1] = " & " .. round(distance(values[index], centroid))
+        end
+        parts[#parts + 1] = " \\\\ \n "
+    end
+    return table.concat(parts) .. [[\end{tabular}]]
+end
+
+-- Shallow, element-wise equality of two sequences: true when both have the
+-- same length (as reported by `#`) and A[i] == B[i] at every index.
+table.equals = function(A, B)
+    local length, i = #A, 1
+    local same = length == #B
+    while same and i <= length do
+        same = A[i] == B[i]
+        i = i + 1
+    end
+    return same
+end
+
+-- Driver: iterate k-means until the centroids stop changing, recording one
+-- distance table per iteration; the trace is printed and returned to the caller.
+local parts, iteration, converged = {}, 0, false
+repeat
+    iteration = iteration + 1
+    -- "Iteration 1:" replaces the "%dth iteration:" format, which read "1th".
+    parts[#parts + 1] = string.format("Iteration %d:", iteration) .. " \n "
+    parts[#parts + 1] = print_distance_table(manhattan_distance, labels, values, centroids)
+    local next_state = kmeans_step(manhattan_distance, centroids, values)
+    converged = table.equals(next_state.centroids, centroids)
+    centroids = next_state.centroids
+    parts[#parts + 1] = [[ \newline ]]
+until converged
+local repr = table.concat(parts) .. "\nkmeans converged"
+print(repr)
+
+return repr
\ No newline at end of file