From 0abe26b93d570173c14ab5428cc124ddd807ecab Mon Sep 17 00:00:00 2001 From: Samuel Ortion Date: Thu, 28 Sep 2023 18:08:24 +0200 Subject: [PATCH] feat: Add execution trace dump for k-means --- .gitattributes | 1 + content/chapters/1.tex | 6 ++ figures/trace/kmeans-trace.pdf | 3 + figures/trace/kmeans-trace.tex | 12 +++ kmeans-trace.txt | 39 ++++++++++ main.pdf | 4 +- scripts/kmeans_steps.lua | 130 +++++++++++++++++++++++++++++++++ 7 files changed, 193 insertions(+), 2 deletions(-) create mode 100644 figures/trace/kmeans-trace.pdf create mode 100644 figures/trace/kmeans-trace.tex create mode 100644 kmeans-trace.txt create mode 100644 scripts/kmeans_steps.lua diff --git a/.gitattributes b/.gitattributes index defcde4..e3b1f6b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ main.pdf filter=lfs diff=lfs merge=lfs -text +figures/trace/kmeans-trace.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/content/chapters/1.tex b/content/chapters/1.tex index cf88ac7..c747019 100644 --- a/content/chapters/1.tex +++ b/content/chapters/1.tex @@ -161,6 +161,12 @@ The cost function is minimized: With $k=2$ and $m_{1} = 10 \cdot 10^{-2}$ and $m_{2} = 9 \cdot 10^{-2}$ the two initial randomly chosen means, run the $k$-means algorithm. \end{exercise} +\begin{figure} + \centering + \includegraphics[scale=1]{figures/trace/kmeans-trace.pdf} + \caption{$k$-means algorithm execution trace. 
The tables show the value of the centroids and the distance of each gene expression value to each centroid.} +\end{figure} + \begin{figure} \centering \includegraphics[scale=1]{figures/plots/kmeans.pdf} diff --git a/figures/trace/kmeans-trace.pdf b/figures/trace/kmeans-trace.pdf new file mode 100644 index 0000000..79772c4 --- /dev/null +++ b/figures/trace/kmeans-trace.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d222b5f866d2139fb1a3a9c05559d43a991d1fbedb4f787fe47fb0492e89f694 +size 21485 diff --git a/figures/trace/kmeans-trace.tex b/figures/trace/kmeans-trace.tex new file mode 100644 index 0000000..f791525 --- /dev/null +++ b/figures/trace/kmeans-trace.tex @@ -0,0 +1,12 @@ +\documentclass[a4paper,margin=0.5cm,varwidth]{standalone} + +\usepackage{luacode} + +\begin{document} + +\begin{luacode} +local repr = dofile("../../scripts/kmeans_steps.lua") +tex.print(repr) +\end{luacode} + +\end{document} \ No newline at end of file diff --git a/kmeans-trace.txt b/kmeans-trace.txt new file mode 100644 index 0000000..9c8b0b8 --- /dev/null +++ b/kmeans-trace.txt @@ -0,0 +1,39 @@ +Entry: +id, expression +g1, 10 +g2, 12 +g3, 9 +g4, 15 +g5, 17 +g6, 18 + +... ⨉ 10^{-2} + +Iterations: +1. m1 = 10E-2, m2 = 9E-2. + +Let A be the cluster (set of genes) whose associated mean is m1, and B the cluster for m2. + +A: g1, g2, g4, g5, g6 +B: g3 + +m1 := (10+12+15+17+18) / 5 = 14.4 +m2 := 9 + +2. + +B: g1, g3 +A: g2, g4, g5, g6 + +m2 := (10 + 9) / 2 = 9.5 +m1 := (12 + 15 + 17 + 18) / 4 = (40 + 14 + 8) / 4 = 62 / 4 = 15.5 + +3. + +B: g1, g2, g3 +A: g4, g5, g6 + +m2 := (10 + 12 + 9) / 3 = 31/3 = 10.333 +m1 := (15+17+18) / 3 = 50/3 = 16.667 + +4. The following iteration will return the same clusters.
-- Patch context that shared these source lines with the script (kept verbatim):
--   diff --git a/main.pdf b/main.pdf
--   index d3d41e2..5cbeef8 100644
--   --- a/main.pdf
--   +++ b/main.pdf
--   @@ -1,3 +1,3 @@
--    version https://git-lfs.github.com/spec/v1
--   -oid sha256:704862fba6f4c540a8327e3c662179eb52a50eb3d53c630a4cf932c7996b7246
--   -size 242454
--   +oid sha256:20c1f5ebd9bd1caa7e07a5b756ffae785090213d0ba69e9b912606881fb66cce
--   +size 264316
--   diff --git a/scripts/kmeans_steps.lua b/scripts/kmeans_steps.lua
--   new file mode 100644
--   index 0000000..711a5cd
--   --- /dev/null
--   +++ b/scripts/kmeans_steps.lua
--   @@ -0,0 +1,130 @@
--   (file ends with "\ No newline at end of file")

--- k-means execution trace generator.
-- Runs k-means on a fixed list of one-dimensional values and builds LaTeX
-- code containing one distance table per iteration. The LaTeX string is
-- printed and returned, so a caller can do
-- `local repr = dofile("scripts/kmeans_steps.lua")`.

local labels = {
    "$g_1$",
    "$g_2$",
    "$g_3$",
    "$g_4$",
    "$g_5$",
    "$g_6$",
}

local values = {
    10,
    12,
    9,
    15,
    17,
    18,
}

-- Initial means in the order stated by the exercise (m1 = 10, m2 = 9), so the
-- rows of the generated tables line up with m1 and m2.
local centroids = {10, 9}

-- Upper bound on the number of iterations, so that a run which never reaches
-- a fixed point cannot hang the document build.
local MAX_ITERATIONS = 100

--- Left fold over a sequence.
-- @param list sequence to fold
-- @param fn   function (acc, value) returning the new accumulator
-- @param init optional initial accumulator; when nil, the first element
--             seeds the accumulator
-- @return the accumulated value (nil for an empty list without `init`)
local function reduce(list, fn, init)
    local acc = init
    for index, value in ipairs(list) do
        -- Compare against nil explicitly so that `false` is a usable seed.
        if index == 1 and init == nil then
            acc = value
        else
            acc = fn(acc, value)
        end
    end
    return acc
end

--- Build a sequence holding `value` repeated `times` times.
-- @param value element to repeat
-- @param times number of repetitions
-- @return a new sequence of length `times`
local function rep(value, times)
    local result = {} -- local: the original leaked this table as global `t`
    for i = 1, times do
        result[i] = value
    end
    return result
end

--- Shallow equality of two sequences (same length, pairwise `==`).
-- @param A first sequence
-- @param B second sequence
-- @return true when both sequences hold equal elements in the same order
local function sequence_equals(A, B)
    if #A ~= #B then
        return false
    end
    for i = 1, #A do
        if A[i] ~= B[i] then
            return false
        end
    end
    return true
end

-- The original script published these helpers on the global `table` library.
-- They stay exported so code that relied on that side effect keeps working.
table.reduce = reduce
table.rep = rep
table.equals = sequence_equals

--- Distance between two scalars (absolute difference).
local function manhattan_distance(a, b)
    return math.abs(a - b)
end

--- Perform one k-means step: assign each value to its closest centroid, then
-- recompute every centroid as the mean of its cluster.
-- @param distance  function (centroid, value) returning a number
-- @param centroids sequence of current centroids
-- @param values    sequence of values to cluster
-- @return table with `centroids` (updated centroids, same length and order
--         as the input) and `clusters` (sequence of value lists, one per
--         centroid, possibly empty)
local function kmeans_step(distance, centroids, values)
    local clusters = {}
    for i = 1, #centroids do
        clusters[i] = {}
    end

    for _, value in ipairs(values) do
        local closest_index, minimal_distance
        for index, centroid in ipairs(centroids) do
            local d = distance(centroid, value)
            -- Strict `<` keeps the first centroid on ties.
            if minimal_distance == nil or d < minimal_distance then
                closest_index, minimal_distance = index, d
            end
        end
        if closest_index ~= nil then
            local members = clusters[closest_index]
            members[#members + 1] = value
        end
    end

    local next_centroids = {}
    -- Index by centroid, not by ipairs(clusters): an empty cluster must not
    -- stop the update of the clusters that follow it.
    for i = 1, #centroids do
        local members = clusters[i]
        if #members > 0 then
            local total = reduce(members, function (a, b) return a + b end)
            next_centroids[i] = total / #members
        else
            -- No value assigned: keep the previous centroid so the result
            -- stays numeric.
            next_centroids[i] = centroids[i]
        end
    end

    return {
        centroids = next_centroids,
        clusters = clusters,
    }
end

--- Render the centroid/value distance table as a LaTeX tabular.
-- The header row holds the labels; each following row starts with a centroid
-- and lists its distance to every value, formatted with two decimals.
-- @param distance  function (value, centroid) returning a number
-- @param labels    sequence of column headers
-- @param values    sequence of values
-- @param centroids sequence of centroids, one row each
-- @return LaTeX source of the table
local function print_distance_table(distance, labels, values, centroids)
    local function round(n)
        return string.format("%.2f", n)
    end

    local row_end = " \\\\ \n "
    -- One column for the centroid plus one per value; the original declared
    -- an extra, never-filled column.
    local parts = {
        "\\begin{tabular}{c", string.rep("c", #values), "}\n & ",
    }

    for index = 1, #labels do
        parts[#parts + 1] = " " .. labels[index]
        if index ~= #labels then
            parts[#parts + 1] = " & "
        end
    end
    parts[#parts + 1] = row_end

    for _, centroid in ipairs(centroids) do
        parts[#parts + 1] = " " .. round(centroid)
        for index = 1, #values do
            parts[#parts + 1] = " & " .. round(distance(values[index], centroid))
        end
        parts[#parts + 1] = row_end
    end

    parts[#parts + 1] = "\\end{tabular}"
    return table.concat(parts)
end

--- Format a positive integer with its English ordinal suffix (1st, 2nd, ...).
local function ordinal(n)
    local suffix = "th"
    local last_two = n % 100
    if last_two < 11 or last_two > 13 then
        local last = n % 10
        if last == 1 then
            suffix = "st"
        elseif last == 2 then
            suffix = "nd"
        elseif last == 3 then
            suffix = "rd"
        end
    end
    return string.format("%d%s", n, suffix)
end

local chunks = {}
local converged = false
local iteration = 0
repeat
    iteration = iteration + 1
    chunks[#chunks + 1] = ordinal(iteration) .. " iteration:" .. " \n "
    chunks[#chunks + 1] = print_distance_table(manhattan_distance, labels, values, centroids)
    local next_state = kmeans_step(manhattan_distance, centroids, values)
    -- Fixed point: the recomputed centroids equal the current ones.
    converged = sequence_equals(next_state.centroids, centroids)
    centroids = next_state.centroids
    chunks[#chunks + 1] = [[ \newline ]]
until converged or iteration >= MAX_ITERATIONS

if converged then
    chunks[#chunks + 1] = "\nkmeans converged"
else
    chunks[#chunks + 1] = string.format(
        "\nkmeans stopped after %d iterations without converging", iteration)
end

local repr = table.concat(chunks)
print(repr)

return repr