feat: Add execution trace dump for k-means

This commit is contained in:
Samuel Ortion 2023-09-28 18:08:24 +02:00
parent 4bf97cb0fd
commit 0abe26b93d
7 changed files with 193 additions and 2 deletions

1
.gitattributes vendored
View File

@ -1 +1,2 @@
main.pdf filter=lfs diff=lfs merge=lfs -text
figures/trace/kmeans-trace.pdf filter=lfs diff=lfs merge=lfs -text

View File

@ -161,6 +161,12 @@ The cost function is minimized:
With $k=2$ and $m_{1} = 10 \cdot 10^{-2}$ and $m_{2} = 9 \cdot 10^{-2}$ the two initial randomly chosen means, run the $k$-means algorithm.
\end{exercise}
\begin{figure}
\centering
\includegraphics[scale=1]{figures/trace/kmeans-trace.pdf}
\caption{$k$-means algorithm execution trace. Each table shows the current centroid values and the distance from each gene expression value to each centroid.}
\end{figure}
\begin{figure}
\centering
\includegraphics[scale=1]{figures/plots/kmeans.pdf}

BIN
figures/trace/kmeans-trace.pdf (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,12 @@
% Standalone wrapper that typesets the k-means execution trace.
% The luacode package requires the LuaTeX engine (compile with lualatex).
\documentclass[a4paper,margin=0.5cm,varwidth]{standalone}
\usepackage{luacode}
\begin{document}
% The Lua chunk below runs scripts/kmeans_steps.lua and prints the string that
% script returns into the document body.
% NOTE(review): the script path is relative -- presumably resolved against the
% directory lualatex is started from; confirm before moving this file.
\begin{luacode}
local repr = dofile("../../scripts/kmeans_steps.lua")
tex.print(repr)
\end{luacode}
\end{document}

39
kmeans-trace.txt Normal file
View File

@ -0,0 +1,39 @@
Entry:
id, expression
g1, 10
g2, 12
g3, 9
g4, 15
g5, 17
g6, 18
... ⨉ 10^{-2}
Iterations:
1. m1 = 10E-2, m2 = 9E-2.
Let A be the cluster (set of genes) whose associated mean is m1, and B the cluster for m2.
A: g1, g2, g4, g5, g6
B: g3
m1 := (10+12+15+17+18) / 5 = 14.4
m2 := 9
2.
A: g1, g3
B: g2, g4, g5, g6
m1 := (10 + 9) / 2 = 9.5
m2 := (12 + 15 + 17 + 18) / 4 = 62 / 4 = 15.5
3.
A: g1, g2, g3
B: g4, g5, g6
m1 := (10 + 12 + 9) / 3 = 31/3 = 10.333
m2 := (15+17+18) / 3 = 50/3 = 16.667
4. The following iteration will return the same clusters.

BIN
main.pdf (Stored with Git LFS)

Binary file not shown.

130
scripts/kmeans_steps.lua Normal file
View File

@ -0,0 +1,130 @@
-- Column headers for the distance table, typeset in LaTeX math mode.
-- labels[i] is the header printed above the column for values[i].
local labels = { "$g_1$", "$g_2$", "$g_3$", "$g_4$", "$g_5$", "$g_6$" }

-- Scalar data points to cluster, listed in the same order as `labels`.
local values = { 10, 12, 9, 15, 17, 18 }

-- Centroids used for the first k-means iteration (k = 2).
local centroids = { 9, 10 }
--- Fold a sequence into a single value, left to right.
-- @param list  sequence to fold; walked with ipairs, so it stops at the
--              first nil element
-- @param fn    function(acc, value) returning the next accumulator
-- @param init  optional starting accumulator; when nil, the first element
--              seeds the fold and fn is first applied to elements 1 and 2
-- @return the final accumulator (nil for an empty list without init)
table.reduce = function (list, fn, init)
  local acc = init
  -- Test against nil explicitly: `false` is a legitimate accumulator (for
  -- example in boolean folds) and must not be mistaken for "no init given".
  local seeded = init ~= nil
  for _, v in ipairs(list) do
    if seeded then
      acc = fn(acc, v)
    else
      acc = v
      seeded = true
    end
  end
  return acc
end
--- Absolute difference between two scalars (Manhattan distance in 1-D).
-- @param a  number
-- @param b  number
-- @return |a - b|
local function manhattan_distance(a, b)
  local delta = a - b
  return math.abs(delta)
end
--- Arithmetic mean of a non-empty sequence of numbers.
local function cluster_mean(members)
  local sum = 0
  for i = 1, #members do
    sum = sum + members[i]
  end
  return sum / #members
end

--- Run one assignment + update step of k-means on scalar data.
-- Each value is assigned to the centroid with the smallest distance; on a
-- tie the centroid with the lowest index wins (strict `<` comparison).
-- Every centroid is then replaced by the mean of the values assigned to it.
-- @param distance   function(centroid, value) returning a comparable number
-- @param centroids  sequence of current centroid positions
-- @param values     sequence of data points
-- @return table with two fields:
--           centroids  sequence of updated centroids, same length as the input
--           clusters   clusters[i] is the sequence of values assigned to
--                      centroid i (an empty table when nothing was assigned)
local function kmeans_step(distance, centroids, values)
  -- Create every cluster up front so the sequence has no holes: a centroid
  -- that attracts no value must not stop the ipairs walk over the clusters.
  local clusters = {}
  for i in ipairs(centroids) do
    clusters[i] = {}
  end

  for _, value in ipairs(values) do
    local minimal_distance, closest_centroid_index
    for centroid_index, centroid in ipairs(centroids) do
      local d = distance(centroid, value)
      if minimal_distance == nil or d < minimal_distance then
        minimal_distance = d
        closest_centroid_index = centroid_index
      end
    end
    -- closest_centroid_index stays nil only when there are no centroids.
    if closest_centroid_index ~= nil then
      local cluster = clusters[closest_centroid_index]
      cluster[#cluster + 1] = value
    end
  end

  local next_centroids = {}
  for i, cluster in ipairs(clusters) do
    if #cluster > 0 then
      next_centroids[i] = cluster_mean(cluster)
    else
      -- An empty cluster has no mean; keep the centroid where it was so the
      -- result is still a sequence of numbers.
      next_centroids[i] = centroids[i]
    end
  end

  return {
    centroids = next_centroids,
    clusters = clusters,
  }
end
--- Build a sequence holding `value` repeated `times` times.
-- @param value  any value; the same value/reference is stored in every slot
-- @param times  number of repetitions; values below 1 yield an empty table
-- @return a new table {value, value, ...} of length `times`
table.rep = function (value, times)
  -- Must be local: a bare assignment here overwrote a global `t` on each call.
  local t = {}
  for i = 1, times do
    t[i] = value
  end
  return t
end
--- Render the centroid-to-value distances as a LaTeX tabular and return it.
-- The header row lists `labels`; each following row starts with a centroid
-- and continues with the distance from every value to that centroid. All
-- numbers are formatted with two decimals.
-- @param distance   function(value, centroid) returning a number
-- @param labels     sequence of header cells
-- @param values     sequence of data points
-- @param centroids  sequence of centroids, one table row each
-- @return string containing the LaTeX source of the table
local function print_distance_table(distance, labels, values, centroids)
  local function two_decimals(n)
    return string.format("%.2f", n)
  end

  local row_end = " \\\\ \n "
  local parts = {}
  local function emit(piece)
    parts[#parts + 1] = piece
  end

  -- Column specification: one "c" per value, framed by a leading and a
  -- trailing "c".
  emit([[\begin{tabular}{c]])
  emit(string.rep("c", #values))
  emit("c}\n & ")

  -- Header row.
  local label_count = #labels
  for column = 1, label_count do
    emit(" " .. labels[column])
    if column < label_count then
      emit(" & ")
    end
  end
  emit(row_end)

  -- One row per centroid.
  for _, centroid in ipairs(centroids) do
    emit(" " .. two_decimals(centroid))
    for column = 1, #values do
      emit(" & " .. two_decimals(distance(values[column], centroid)))
    end
    emit(row_end)
  end

  emit([[\end{tabular}]])
  return table.concat(parts)
end
--- Shallow, element-wise comparison of two sequences.
-- @param A  first sequence
-- @param B  second sequence
-- @return true when both have the same length and A[i] == B[i] for every
--         index, false otherwise (nested tables are compared by reference)
table.equals = function(A, B)
  local size = #A
  if size == #B then
    for position = 1, size do
      if A[position] ~= B[position] then
        return false
      end
    end
    return true
  end
  return false
end
--- English ordinal suffix ("st", "nd", "rd" or "th") for a positive integer.
local function ordinal_suffix(n)
  local last_two = n % 100
  -- 11, 12 and 13 are irregular: "11th", not "11st".
  if last_two >= 11 and last_two <= 13 then
    return "th"
  end
  local last = n % 10
  if last == 1 then
    return "st"
  elseif last == 2 then
    return "nd"
  elseif last == 3 then
    return "rd"
  end
  return "th"
end

-- Iterate k-means until the centroids stop changing, recording the distance
-- table of every iteration as LaTeX. The final iteration (the one whose step
-- leaves the centroids unchanged) is included in the trace.
local pieces = {}
local converged = false
local iteration = 0
repeat
  iteration = iteration + 1
  pieces[#pieces + 1] = string.format("%d%s iteration:", iteration, ordinal_suffix(iteration)) .. " \n "
  pieces[#pieces + 1] = print_distance_table(manhattan_distance, labels, values, centroids)
  local next_state = kmeans_step(manhattan_distance, centroids, values)
  -- Converged when one more step reproduces the centroids exactly.
  converged = table.equals(next_state.centroids, centroids)
  centroids = next_state.centroids
  pieces[#pieces + 1] = [[ \newline ]]
until converged
pieces[#pieces + 1] = "\nkmeans converged"

local repr = table.concat(pieces)
-- Echo the trace to standard output, then hand it to the caller of dofile().
print(repr)
return repr