diff --git a/dev/lablecture_day4_clustering.key b/dev/lablecture_day4_clustering.key index e6cea5a..4d9f54d 100755 Binary files a/dev/lablecture_day4_clustering.key and b/dev/lablecture_day4_clustering.key differ diff --git a/docs/index.html b/docs/index.html index 5e57a26..bd169ef 100644 --- a/docs/index.html +++ b/docs/index.html @@ -314,7 +314,7 @@

Day 4

13:00 - 14:00 Take-home exam simulation - +Exam 14:00 - 16:00 diff --git a/docs/lab/lab_day3_pca.html b/docs/lab/lab_day3_pca.html index b731759..4b4e0ab 100644 --- a/docs/lab/lab_day3_pca.html +++ b/docs/lab/lab_day3_pca.html @@ -192,7 +192,7 @@

On this page

Download datasets here or from Canvas.

R script: Code

-

Lab Lecture

+

Lab Lecture: Slides

Exercise 1: Food

In the first exercise, we explore a low-dimensional dataset, Food.txt.

@@ -985,7 +985,7 @@

Exercise 3 R script: [Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab2_pca.R) -Lab Lecture +Lab Lecture: [Slides](presentation/Lab_pca.pdf) diff --git a/docs/lab/lab_day4_clustering.html b/docs/lab/lab_day4_clustering.html index aafac1e..4b81ddb 100644 --- a/docs/lab/lab_day4_clustering.html +++ b/docs/lab/lab_day4_clustering.html @@ -153,17 +153,24 @@

On this page

@@ -191,35 +198,215 @@

On this page

Download datasets here, or from Canvas.

R script: Code

-

Presentation: Slides

-
-

Exercise 1: NCI60

-

We look at the NCI60 data again. First load the dataset.

+

Presentation: Slides

+
+

Exercise 1: Food

+

We use the same Food.txt data to illustrate two concepts: hierarchical clustering and heatmaps.

+

This is not a genomics dataset, but we use it for teaching purposes because it is easy to interpret.

+

Let us load the dataset.

+
+
food <- read.table('data/Food.txt', header=T)
+# we change the name from pulses to a more common name, legume
+colnames(food)[7] <- 'Legume'
+head(food) # print first 6 lines 
+
+
               Meat Pigs Eggs Milk Fish Cereals Legume Fruit
+Albania        10.1  1.4  0.5  8.9  0.2    42.3    5.5   1.7
+Austria         8.9 14.0  4.3 19.9  2.1    28.0    1.3   4.3
+Belg.Lux.      13.5  9.3  4.1 17.5  4.5    26.6    2.1   4.0
+Bulgaria        7.8  6.0  1.6  8.3  1.2    56.7    3.7   4.2
+Czechoslovakia  9.7 11.4  2.8 12.5  2.0    34.3    1.1   4.0
+Denmark        10.6 10.8  3.7 25.0  9.9    21.9    0.7   2.4
+
+
+

We scale the data (sometimes called standardizing or normalizing) so that each column (feature, variable) has mean 0 and variance 1. We call the scaled data food_s.

-
library(ISLR)
-# or, load('data/NCI60.RData')
-nci.labs <- NCI60$labs # Sample labels (tissue type)
-nci.data <- NCI60$data # Gene expression data set
+
food_s <- scale(food)
+head(food_s) # print first 6 lines
+
+
                      Meat       Pigs       Eggs        Milk        Fish
+Albania         0.08126490 -1.8299828 -2.2437259 -1.15570645 -1.20028213
+Austria        -0.27725673  1.6636208  1.2335002  0.39161231 -0.64187467
+Belg.Lux.       1.09707621  0.3604512  1.0504883  0.05401549  0.06348211
+Bulgaria       -0.60590157 -0.5545403 -1.2371605 -1.24010566 -0.90638347
+Czechoslovakia -0.03824231  0.9427184 -0.1390890 -0.64931122 -0.67126454
+Denmark         0.23064892  0.7763564  0.6844645  1.10900556  1.65053488
+                  Cereals     Legume       Fruit
+Albania         0.9159176  1.2227536 -1.35040507
+Austria        -0.3870690 -0.8923886  0.09091397
+Belg.Lux.      -0.5146342 -0.4895043 -0.07539207
+Bulgaria        2.2280161  0.3162641  0.03547862
+Czechoslovakia  0.1869740 -0.9931096 -0.07539207
+Denmark        -0.9428885 -1.1945517 -0.96235764
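If you want to convince yourself that the scaling did what we described, one quick check is to look at the column means and standard deviations of food_s. A small sketch using only base R:

# each column of the scaled data should have mean (approximately) 0 and standard deviation 1
round(colMeans(food_s), digits = 2)
round(apply(food_s, 2, sd), digits = 2)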
+
+
+
+

Distances

+

To do hierarchical clustering, the most convenient command is hclust(). As input it needs the pairwise distances between the subjects (patients, or in this example, countries). We do this on the scaled data.

+

The command to compute pairwise distances is dist(). By default, it computes the Euclidean distance (details optional). The Euclidean distance is probably the most commonly used metric, but there are others; see ?dist() for more options.

+

We can present the pairwise distances in matrix format. You can see that this matrix is symmetric, with 0 on the diagonal - which should be intuitive: the distance between A and B is the same as between B and A, and the distance between A and itself is 0.

+
+
# compute distance
+food_dist <- dist(food_s)
+# round(food_dist, digits = 2) # try this yourself to see what it does
+# alternatively, look at this as a matrix
+food_dist_matrix <- as.matrix(food_dist)
+round(food_dist_matrix[1:5, 1:5], digits = 2) # first 5 row 5 col
+
+
               Albania Austria Belg.Lux. Bulgaria Czechoslovakia
+Albania           0.00    5.95      5.13     2.77           4.44
+Austria           5.95    0.00      2.11     4.71           1.98
+Belg.Lux.         5.13    2.11      0.00     4.45           2.20
+Bulgaria          2.77    4.71      4.45     0.00           3.17
+Czechoslovakia    4.44    1.98      2.20     3.17           0.00
+
+
+
+
+
+ +
+
+Optional: Euclidean distance +
+
+
+

You can check that the Euclidean distance between Albania and Austria is indeed 5.95. This distance is the square root of the sum of squared differences between the two subjects across all their measurements.

+
+
+
round(food_s[1:2,], digits = 2) # we only keep first 2 digits
+
+
         Meat  Pigs  Eggs  Milk  Fish Cereals Legume Fruit
+Albania  0.08 -1.83 -2.24 -1.16 -1.20    0.92   1.22 -1.35
+Austria -0.28  1.66  1.23  0.39 -0.64   -0.39  -0.89  0.09
+
+
# take the data for two countries each 
+albania <- round(food_s[1,], digits = 2)
+austria <- round(food_s[2,], digits = 2)
+# compute difference between each col
+d <- albania - austria
+d
+
+
   Meat    Pigs    Eggs    Milk    Fish Cereals  Legume   Fruit 
+   0.36   -3.49   -3.47   -1.55   -0.56    1.31    2.11   -1.44 
+
+
# euclidean distance: square each element, sum together, and take a square root
+sqrt(sum(d^2)) 
+
+
[1] 5.942096
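Euclidean is not the only option: dist() supports other metrics through its method argument. A small sketch showing the Manhattan distance (sum of absolute differences) for the same countries; food_dist_man is just an illustrative name:

# Manhattan distance: sum of absolute differences, instead of the root of summed squares
food_dist_man <- dist(food_s, method = "manhattan")
round(as.matrix(food_dist_man)[1:5, 1:5], digits = 2)
# for Albania vs Austria this equals sum(abs(food_s[1,] - food_s[2,]))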
+
+
+

Hierarchical clustering

+

Now that we have computed the distance food_dist, we plug it into the clustering algorithm, hclust().

+

We try the complete linkage method by specifying method = 'complete'. The result is saved as hc.complete. You can visualize it, and add the country names as labels to make it easier to read.

+
+
hc.complete <- hclust(food_dist, method="complete")
+plot(hc.complete, labels=rownames(food), main="Complete Linkage", xlab="", sub="")
+
+

+
+
+
+
+
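The dendrogram itself does not assign cluster labels. If you want, say, three groups of countries, you can cut the tree with cutree(), which is covered in more detail in Exercise 2. A small sketch; food_clusters is just an illustrative name:

# cut the complete-linkage dendrogram into 3 clusters
food_clusters <- cutree(hc.complete, k = 3)
table(food_clusters)   # how many countries in each cluster
# food_clusters        # uncomment to see the cluster label of each country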

Linkage, dissimilarity, scaling

+

Hierarchical clustering is a class of methods, and there are a variety of options to set.

+
    +
  • Linkage (by setting method inside hclust()): complete, single, average
  • Dissimilarity: Euclidean, correlation, …
  • Scaling: scaled data (mean 0, variance 1) or unscaled, original data
+

There is no definitive guide to which combination works best, so you can try them out and see which makes the most sense. Again, in unsupervised learning the data do not have outcome labels, so the interpretation is left to the domain experts.

+
+
# single linkage
+hc.single <- hclust(food_dist, method="single")
+plot(hc.single, labels=rownames(food), main="Single Linkage", xlab="", sub="")
+
+

+
+
# average linkage
+hc.average <- hclust(food_dist, method="average")
+plot(hc.average, labels=rownames(food), main="Average Linkage", xlab="", sub="")
+
+

+
+
+
+
# unscaled data, complete linkage
+hc.unscaled <- hclust(dist(food), method="complete")
+plot(hc.unscaled, labels=rownames(food), main="Complete linkage with unscaled features", xlab="", sub="")
+
+

+
+
+
+
# correlation as dissimilarity, rather than Euclidean distance
+dd <- as.dist(1-cor(t(food_s))) # compute the metric
+hc.corr <- hclust(dd, method="complete") # cluster
+plot(hc.corr, labels=rownames(food), main="Complete linkage with correlation-based distance", xlab="", sub="")
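Beyond eyeballing the dendrograms, one way to compare these options is to cut the trees into the same number of clusters and cross-tabulate the assignments. A small sketch with three clusters, using the objects created above:

# do complete and average (or single) linkage group the same countries together?
k <- 3
table(complete = cutree(hc.complete, k), average = cutree(hc.average, k))
table(complete = cutree(hc.complete, k), single  = cutree(hc.single, k))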
+
+

+
+
+
+
+

Heatmap

+

A heatmap is a visualization tool that plots similar values in similar colors, so that you can visually identify any patterns. It can also be combined with hierarchical clustering - this is actually the default behaviour: dendrograms are displayed for both rows and columns.

+
+
# make heatmap on the scaled data
+heatmap(food_s)
+
+

+
+
+

To preserve the original ordering of the columns and rows, you can specify Rowv = NA, Colv = NA.

+
+
# no clustering for row or col, this preserves the original ordering
+heatmap(food_s, Rowv = NA, Colv = NA)
+
+

+
+
+

You can also cluster only the rows (or only the columns).

+
+
# only clustering for row
+heatmap(food_s, Colv = NA)
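By default, heatmap() builds its dendrograms with dist() and hclust() using their default settings. If you prefer another combination, for example the correlation-based dissimilarity from the previous section, you can pass your own functions via the distfun and hclustfun arguments; a small sketch:

# heatmap with correlation-based dissimilarity and complete linkage
heatmap(food_s,
        distfun   = function(x) as.dist(1 - cor(t(x))),
        hclustfun = function(d) hclust(d, method = "complete"))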
+
+

+
+
+
+
+
+

Exercise 2: NCI60

+

We look at the NCI60 data again. First load the dataset.

+
+
library(ISLR)
+# or, load('data/NCI60.RData')
+nci.labs <- NCI60$labs # Sample labels (tissue type)
+nci.data <- NCI60$data # Gene expression data set
+
+
+

Hierarchical clustering

We start by scaling the data and calculating the distance matrix (using the Euclidean distance), and then investigate different linkage methods.

-
# Scale the data to zero mean and unit variance:
-sd.data <- scale(nci.data)
-
-# Calculate the distance matrix 
-# equivalent: data.dist <- dist(sd.data, method="euclidean")
-data.dist <- dist(sd.data)
+
# Scale the data to zero mean and unit variance:
+sd.data <- scale(nci.data)
+
+# Calculate the distance matrix 
+# equivalent: data.dist <- dist(sd.data, method="euclidean")
+data.dist <- dist(sd.data)

Next we perform hierarchical clustering with the distance matrix as input. The function we use is hclust(). We specify the linkage method to be complete.

Once the result is saved in the hc.complete object, you can plot the dendrogram.

-
# Perform clustering
-hc.complete <- hclust(data.dist, method="complete")
-
-# names(hc.complete)
-plot(hc.complete, labels=nci.labs, main="Complete Linkage", xlab="", sub="")
+
# Perform clustering
+hc.complete <- hclust(data.dist, method="complete")
+
+# names(hc.complete)
+plot(hc.complete, labels=nci.labs, main="Complete Linkage", xlab="", sub="")

@@ -227,30 +414,30 @@

Hierarchical clust

The object hc.complete contains a lot of information. To extract it, you can use the $ operator.

Refer to the documentation for hclust() (?hclust) to see a complete list of the output components and how to use the function.

-
hc.complete$dist.method # distance method
+
hc.complete$dist.method # distance method
[1] "euclidean"
-
# hc.complete$merge  # order of aggregations of samples / clusters
-# hc.complete$height # distance at which aggregations happen
-# hc.complete$labels # labels (numeric, since we don't know the original categories!)
-# hc.complete$method
-# hc.complete$call
+
# hc.complete$merge  # order of aggregations of samples / clusters
+# hc.complete$height # distance at which aggregations happen
+# hc.complete$labels # labels (numeric, since we don't know the original categories!)
+# hc.complete$method
+# hc.complete$call

We can try different linkage methods and see how the clustering results differ. Change the method argument in the function, and plot the results.

-
hc.average <- hclust(data.dist, method="average")
-hc.single <- hclust(data.dist, method="single")
-
-plot(hc.average, labels=nci.labs, main="Average Linkage", xlab="", sub="")
-plot(hc.single, labels=nci.labs,  main="Single Linkage", xlab="", sub="")
+
hc.average <- hclust(data.dist, method="average")
+hc.single <- hclust(data.dist, method="single")
+
+plot(hc.average, labels=nci.labs, main="Average Linkage", xlab="", sub="")
+plot(hc.single, labels=nci.labs,  main="Single Linkage", xlab="", sub="")

Now we focus on complete linkage only.

First, we use cutree() to compare the results when the data are separated into either 2 or 4 clusters.

-
# Compare 2 clusters and 4 clusters:
-hc.clusters <- cutree(hc.complete, c(2, 4))
-head(hc.clusters) # print first 6 results
+
# Compare 2 clusters and 4 clusters:
+hc.clusters <- cutree(hc.complete, c(2, 4))
+head(hc.clusters) # print first 6 results
   2 4
 V1 1 1
@@ -260,8 +447,8 @@ 

Hierarchical clust V5 1 2 V6 1 2

-
# cross tabulation
-table(hc.clusters[,"2"], hc.clusters[,"4"])
+
# cross tabulation
+table(hc.clusters[,"2"], hc.clusters[,"4"])
   
      1  2  3  4
@@ -271,25 +458,25 @@ 

Hierarchical clust

It is more straightforward to check the results with a dendrogram.

-
# visualize the cuts
-# how do you know where to draw the line? check height
-heights <- hc.complete$height
-tail(heights, 4)  # print the last 4
+
# visualize the cuts
+# how do you know where to draw the line? check height
+heights <- hc.complete$height
+tail(heights, 4)  # print the last 4
[1] 137.5633 141.2472 142.9218 162.2074
-
plot(hc.complete, labels=nci.labs, main="Complete Linkage", xlab="", sub="")
-abline(h=140, col="red")  # 4 clusters
-abline(h=150, col="blue") # 2 clusters
+
plot(hc.complete, labels=nci.labs, main="Complete Linkage", xlab="", sub="")
+abline(h=140, col="red")  # 4 clusters
+abline(h=150, col="blue") # 2 clusters
-

+

The height values are easy to interpret: each one is the distance at which two clusters are merged into one. For example, the largest value (162.2) is where the final two groups are merged - if you check the figure, it is exactly the height of the horizontal line joining them. Similarly, 142.9 is where three groups became two, and 141.2 is where four groups became three. A line drawn at height 140 therefore separates the four clusters.
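Instead of reading the number of clusters off the figure, you can also cut the tree at a chosen height with cutree() and count the resulting groups; a small sketch using the heights above:

# cutting between 142.9 and 162.2 leaves 2 clusters; cutting just below 141.2 leaves 4
table(cutree(hc.complete, h = 150))
table(cutree(hc.complete, h = 140))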

How are the labels distributed between clusters? We can focus on the 4-cluster case, and use table() to list which cancer types end up in which of the four clusters.

For example, breast cancer appears in all but the 3rd cluster; melanoma appears only in the first cluster; and so on.

-
table(hc.clusters[,"4"], nci.labs)
+
table(hc.clusters[,"4"], nci.labs)
   nci.labs
     BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro MCF7D-repro
@@ -308,16 +495,16 @@ 

Hierarchical clust

Finally, we see what happens if we use unscaled data instead of scaled data, or if we use a correlation-based distance metric instead of the Euclidean distance.

Compare the dendrograms: How different are the resulting clusterings? Do you recognise subclusters that are consistent?

-
# Compare scaled data versus non-scaled data:
-hc.unscaled <- hclust(dist(nci.data), method="complete")
-plot(hc.unscaled, labels=nci.labs, main="Complete linkage with unscaled features", xlab="", sub="")
+
# Compare scaled data versus non-scaled data:
+hc.unscaled <- hclust(dist(nci.data), method="complete")
+plot(hc.unscaled, labels=nci.labs, main="Complete linkage with unscaled features", xlab="", sub="")

-
# Compare Euclidean distance with correlation-based distance:
-dd <- as.dist(1-cor(t(sd.data)))
-hc.corr <- hclust(dd, method="complete")
-plot(hc.corr, labels=nci.labs, main="Complete linkage with correlation-based distance", xlab="", sub="")
+
# Compare Euclidean distance with correlation-based distance:
+dd <- as.dist(1-cor(t(sd.data)))
+hc.corr <- hclust(dd, method="complete")
+plot(hc.corr, labels=nci.labs, main="Complete linkage with correlation-based distance", xlab="", sub="")
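To make the comparison concrete, you can cut both trees into the same number of clusters and cross-tabulate the assignments; a small sketch with four clusters:

# overlap between the Euclidean-based and correlation-based clusterings
table(euclidean   = cutree(hc.complete, k = 4),
      correlation = cutree(hc.corr, k = 4))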

@@ -329,9 +516,9 @@

K-means clustering

In contrast to hierarchical clustering, which requires a distance matrix as input, K-means takes the data matrix itself. The data matrix can be scaled (centered, with unit variance) or unscaled.

In this example we use the scaled data computed before, sd.data.

-
set.seed(4) # set random seed
-km.out4 <- kmeans(sd.data, centers = 4, nstart=20)
-km.out4$cluster
+
set.seed(4) # set random seed
+km.out4 <- kmeans(sd.data, centers = 4, nstart=20)
+km.out4$cluster
 V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 
   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4 
@@ -346,10 +533,10 @@ 

K-means clustering

Read the help file ?kmeans to understand what the argument nstart=20 does. Comparing an analysis with nstart=20 versus nstart=1 demonstrates how the cluster results can be improved if we allow more evaluations with different randomly chosen starting centroids.

Set a different random seed, say 3 (as long as it’s different from the one you used before), and run the analysis again. This time we use a different nstart.

-
# different starting centroids improve the clustering:
-set.seed(3)
-km.out <- kmeans(sd.data, centers = 4, nstart=1)
-km.out$cluster
+
# different starting centroids improve the clustering:
+set.seed(3)
+km.out <- kmeans(sd.data, centers = 4, nstart=1)
+km.out$cluster # cluster label
 V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
@@ -360,18 +547,14 @@ 

K-means clustering

V61 V62 V63 V64 2 2 2 2
-
km.out$tot.withinss
-
-
[1] 344566.9
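If you want a numeric summary of the difference, you can compare the objective that K-means minimizes, the total within-cluster sum of squares, between the two fits above (km.out used nstart=1, km.out4 used nstart=20); a small sketch:

# total within-cluster sum of squares: smaller is better
km.out$tot.withinss   # nstart = 1
km.out4$tot.withinss  # nstart = 20, typically at least as good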
-

Compare with hierarchical clustering

-
# we can directly compare the k-means result (along rows)
-# with the hierarchical clustering result (along columns)
-table(km.out4$cluster, hc.clusters[,"4"], deparse.level=2)
+
# we can directly compare the k-means result (along rows)
+# with the hierarchical clustering result (along columns)
+table(km.out4$cluster, hc.clusters[,"4"], deparse.level=2)
               hc.clusters[, "4"]
 km.out4$cluster  1  2  3  4
@@ -387,49 +570,49 @@ 

Compa

Visualize clusters

We can visualise the K-means clustering results of high-dimensional data by using PCA for dimension reduction. We plot the first two principal components and colour the data points (= individual cell lines) by their assigned cluster from K-means.

-
# first, run PCA again on the NCI60 data
-pr.out <- prcomp(nci.data, scale=TRUE)
-
-# more cluster options
-km.out2 <- kmeans(sd.data, 2, nstart=20)
-km.out3 <- kmeans(sd.data, 3, nstart=20)
-
-# we can now visualise the K-Means results by labelling the data points
-# in a plot of the scores of the first 2 principal components:
-par(mfrow=c(1,3))
-plot(pr.out$x[,1:2], col=(km.out2$cluster+1), main="K-Means with K=2",
-     xlab="PC 1", ylab="PC 2", pch=20)
-plot(pr.out$x[,1:2], col=(km.out3$cluster+1), main="K-Means with K=3",
-     xlab="PC 1",  ylab="PC 2", pch=20)
-plot(pr.out$x[,1:2], col=(km.out4$cluster+1), main="K-Means with K=4",
-     xlab="PC 1", ylab="PC 2", pch=20)
+
# first, run PCA again on the NCI60 data
+pr.out <- prcomp(nci.data, scale=TRUE)
+
+# more cluster options
+km.out2 <- kmeans(sd.data, 2, nstart=20)
+km.out3 <- kmeans(sd.data, 3, nstart=20)
+
+# we can now visualise the K-Means results by labelling the data points
+# in a plot of the scores of the first 2 principal components:
+par(mfrow=c(1,3))
+plot(pr.out$x[,1:2], col=(km.out2$cluster+1), main="K-Means with K=2",
+     xlab="PC 1", ylab="PC 2", pch=20)
+plot(pr.out$x[,1:2], col=(km.out3$cluster+1), main="K-Means with K=3",
+     xlab="PC 1",  ylab="PC 2", pch=20)
+plot(pr.out$x[,1:2], col=(km.out4$cluster+1), main="K-Means with K=4",
+     xlab="PC 1", ylab="PC 2", pch=20)

Compare with the plot from yesterday's Exercise 2 (left panel), where the points are coloured by cancer type. The clusters from K-means seem to partition the data into groups reasonably well.

-
par(mfrow=c(1,1))
-
-Cols=function(vec){
-  cols=rainbow(length(unique(vec)))
-  return(cols[as.numeric(as.factor(vec))])
-}
-plot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab="PC 1",ylab=" PC 2")
-legend('topleft', col=rainbow(length(unique(nci.labs))), legend=unique(nci.labs), bty='n', lwd=2, cex=.6)
+
par(mfrow=c(1,1))
+
+Cols=function(vec){
+  cols=rainbow(length(unique(vec)))
+  return(cols[as.numeric(as.factor(vec))])
+}
+plot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab="PC 1",ylab=" PC 2")
+legend('topleft', col=rainbow(length(unique(nci.labs))), legend=unique(nci.labs), bty='n', lwd=2, cex=.6)
-

+

-
-

Heatmap

+
+

Heatmap

A heatmap is another way to visualize the clusters in the data. We use the principal components rather than the raw data, as the PCs already explain a large amount of the variability in the more than 6000 features.

Similar values are presented with similar colors.

-
## We use the scores of the PCA on the NCI60 data, to reduce dimension
-scores <- pr.out$x
-scores[1:5, 1:5] # first 5 pc, first 5 measurements
+
## We use the scores of the PCA on the NCI60 data, to reduce dimension
+scores <- pr.out$x
+scores[1:5, 1:5] # first 5 pc, first 5 measurements
         PC1       PC2         PC3         PC4        PC5
 V1 -19.68245  3.527748  -9.7354382   0.8177816 -12.511081
@@ -438,33 +621,33 @@ 

Heatmap

V4 -42.48098 -9.691742 -0.8830921 -3.4180227 -41.938370 V5 -54.98387 -5.158121 -20.9291076 -15.7253986 -10.361364
-
#  default choices
-heatmap(pr.out$x)
+
#  default choices
+heatmap(pr.out$x)

You can remove the dendrogram for the PCs, keeping only the one for the cancer types. Now you see that the PCs keep their original order from 1 to 64.

-
# hc.corr is the result from hclust. check the section on hierarchical clustering
-heatmap(pr.out$x, Rowv = as.dendrogram(hc.corr), Colv = NA)
+
# hc.corr is the result from hclust. check the section on hierarchical clustering
+heatmap(pr.out$x, Rowv = as.dendrogram(hc.corr), Colv = NA)

You can also reduce the number of PCs, and add titles to the plot and the y-axis.

-
par(cex.main = .7)
-heatmap(pr.out$x[,1:40], Rowv = as.dendrogram(hc.corr), Colv = NA,
-        labRow = nci.labs, main = 'Heatmap of the scores of the first 40 PCs on the NCI60 data')
+
par(cex.main = .7)
+heatmap(pr.out$x[,1:40], Rowv = as.dendrogram(hc.corr), Colv = NA,
+        labRow = nci.labs, main = 'Heatmap of the scores of the first 40 PCs on the NCI60 data')

-
-

Exercise 2: Gene expression data

+
+

Exercise 3: Gene expression data

(CH12Ex13 from statistical learning)

We use the Ch12Ex13.csv data to repeat some of the clustering analysis we did.

The first 20 samples are from healthy patients, while the second 20 are from a diseased group.

@@ -472,36 +655,36 @@

Exercise 2

Carry out both hierarchical clustering and K-means clustering. You should choose the most meaningful number of clusters (think about how many groups of patients we have!). Compare the results.

Note: remember that the data has genes on the rows and patients on the columns. You need to transpose the data so that the patients are on the rows and the genes on the columns.

-
# load in the data using read.csv(). You will need to select header=F.
-data <- read.csv("data/Ch12Ex13.csv", header=FALSE)
-dim(data)
+
# load in the data using read.csv(). You will need to select header=F.
+data <- read.csv("data/Ch12Ex13.csv", header=FALSE)
+dim(data)
[1] 1000   40
-
# transpose the data, so that we have each row is one patient (subject)
-data <- t(data) 
+
# transpose the data, so that we have each row is one patient (subject)
+data <- t(data) 

Now the first 20 rows are measurements from healthy patients (group 0), and rows 21-40 are from the diseased patients (group 1). We can encode this information in a vector like this.

-
true.groups <- c( rep(0,20), rep(1,20))
+
true.groups <- c( rep(0,20), rep(1,20))
-
-

Hierarchical clustering

+
+

Hierarchical clustering

You can use different linkage options and distance metrics of your choosing. For example, with complete linkage the code looks like this.

-
data.dist <- dist(data) # need to compute the distance matrix
-hclust.df <- hclust(data.dist, method="complete" )
-#alternatives:
-#hclust.df <- hclust( D, method="average" )
-#hclust.df <- hclust( D, method="single" )
+
data.dist <- dist(data) # need to compute the distance matrix
+hclust.df <- hclust(data.dist, method="complete" )
+#alternatives:
+#hclust.df <- hclust( D, method="average" )
+#hclust.df <- hclust( D, method="single" )

We can cut the tree into 2 clusters with cutree(). Then do a cross tabulation of the true labels and the clustering results: how well do they correspond?

-
# find the clusters
-predicted <- cutree( hclust.df, k=2 )
-
-# How well does our clustering predict health vs. diseased
-table(predicted, true.groups )
+
# find the clusters
+predicted <- cutree( hclust.df, k=2 )
+
+# How well does our clustering predict health vs. diseased
+table(predicted, true.groups )
         true.groups
 predicted  0  1
@@ -514,9 +697,9 @@ 

Hierarchical clu

K-means

Now you can use K-means to identify 2 clusters.

-
predicted.kmean <- kmeans(data, 2, nstart=20)$cluster
-# agreement with true label
-table(predicted.kmean, true.groups )
+
predicted.kmean <- kmeans(data, 2, nstart=20)$cluster
+# agreement with true label
+table(predicted.kmean, true.groups )
               true.groups
 predicted.kmean  0  1
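You can also check how well the two clustering methods agree with each other (rather than with the true labels) by cross-tabulating their assignments; a small sketch using the objects defined above:

# agreement between hierarchical clustering and K-means cluster labels
table(predicted, predicted.kmean)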
@@ -817,386 +1000,565 @@ 

K-means

} }); diff --git a/docs/lab/lab_day4_clustering_files/figure-html/hc-food-complete-1.png b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-complete-1.png new file mode 100644 index 0000000..3a8a155 Binary files /dev/null and b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-complete-1.png differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/hc-food-cor-1.png b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-cor-1.png new file mode 100644 index 0000000..c6a0472 Binary files /dev/null and b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-cor-1.png differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/hc-food-heatmap1-1.png b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-heatmap1-1.png new file mode 100644 index 0000000..a5b73b3 Binary files /dev/null and b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-heatmap1-1.png differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/hc-food-heatmap2-1.png b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-heatmap2-1.png new file mode 100644 index 0000000..29197c8 Binary files /dev/null and b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-heatmap2-1.png differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/hc-food-heatmap3-1.png b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-heatmap3-1.png new file mode 100644 index 0000000..99dac6e Binary files /dev/null and b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-heatmap3-1.png differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/hc-food-linkage-1.png b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-linkage-1.png new file mode 100644 index 0000000..8d417d8 Binary files /dev/null and b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-linkage-1.png differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/hc-food-linkage-2.png b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-linkage-2.png new file mode 100644 index 0000000..f9a8fa2 Binary files /dev/null and b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-linkage-2.png differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/hc-food-unscaled-1.png b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-unscaled-1.png new file mode 100644 index 0000000..d0de0f0 Binary files /dev/null and b/docs/lab/lab_day4_clustering_files/figure-html/hc-food-unscaled-1.png differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-keeppc-1.png b/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-keeppc-1.png deleted file mode 100644 index 4278cb2..0000000 Binary files a/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-keeppc-1.png and /dev/null differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plot2-1.png b/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plot2-1.png deleted file mode 100644 index 960cf47..0000000 Binary files a/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plot2-1.png and /dev/null differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plotpc-1.png b/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plotpc-1.png deleted file mode 100644 index bfc6f02..0000000 Binary files a/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plotpc-1.png and /dev/null differ diff --git a/docs/lab/lab_day4_clustering_files/figure-html/clust-nci60-cutree-1.png b/docs/lab/lab_day4_clustering_files/figure-html/unnamed-chunk-18-1.png similarity index 100% rename from 
docs/lab/lab_day4_clustering_files/figure-html/clust-nci60-cutree-1.png rename to docs/lab/lab_day4_clustering_files/figure-html/unnamed-chunk-18-1.png diff --git a/docs/lab/lab_day4_clustering_files/figure-html/unnamed-chunk-14-1.png b/docs/lab/lab_day4_clustering_files/figure-html/unnamed-chunk-25-1.png similarity index 100% rename from docs/lab/lab_day4_clustering_files/figure-html/unnamed-chunk-14-1.png rename to docs/lab/lab_day4_clustering_files/figure-html/unnamed-chunk-25-1.png diff --git a/docs/lab/lab_day4_clustering_files/figure-html/unnamed-chunk-7-1.png b/docs/lab/lab_day4_clustering_files/figure-html/unnamed-chunk-7-1.png deleted file mode 100644 index 6b9de14..0000000 Binary files a/docs/lab/lab_day4_clustering_files/figure-html/unnamed-chunk-7-1.png and /dev/null differ diff --git a/docs/lab/overview.html b/docs/lab/overview.html index 7090ed8..0eef877 100644 --- a/docs/lab/overview.html +++ b/docs/lab/overview.html @@ -245,7 +245,7 @@

Lab notes and R sc Day 4 Day 4: Clustering Code - +Slides @@ -579,7 +579,7 @@

Lab notes and R sc | Day 2 | [Day 2: Multiple testing](lab_day2_testing.qmd) | [Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab1.R), [Code (solution)](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab1_exercise_solution.R) | | | Day 3 | [Day 3: Principal Component Analysis](lab_day3_pca.qmd) |[Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab2_pca.R) | [Slides](presentation/Lab_pca.pdf) | -| Day 4 | [Day 4: Clustering](lab_day4_clustering.qmd) |[Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab3_clustering.R) | | +| Day 4 | [Day 4: Clustering](lab_day4_clustering.qmd) |[Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab3_clustering.R) | [Slides](presentation/Lab_clustering.pdf)|

diff --git a/docs/lab/presentation/Lab_clustering.pdf b/docs/lab/presentation/Lab_clustering.pdf new file mode 100644 index 0000000..ea019d2 Binary files /dev/null and b/docs/lab/presentation/Lab_clustering.pdf differ diff --git a/docs/search.json b/docs/search.json index 0cb8c14..155b23f 100644 --- a/docs/search.json +++ b/docs/search.json @@ -46,7 +46,7 @@ "href": "index.html", "title": "MED3007: Statistical Principles in Genomics", "section": "", - "text": "Welcome!\nThis is the course website for MED3007: Statistical Principles in Genomics at the Faculty of Medicine, University of Oslo.\nThe course website is developed by the instructors of the course, hosted for free and public accessible on Github. Course material can be found in the github repository.\n\nStructure\n\nGet Started provides some information about software installation, data download and code.\nR Lab and Code hosts the lab session exercises and code.\n\n\n\n\n\n\n\nOfficial course webpage\n\n\n\nPlease refer to the official course page by University of Oslo for information related to application, evaluation and other administrative matters.\n\n\n\n\nPreparation before the course starts\nYou should have R installed on your computer before the course. More on preparation please read here.\nIf you have trouble setting it up with your own laptop, you can also use the PC in the PC lab.\n\n\n\nSchedule\nYou can find the time and place on the official course schedule at the University of Oslo course website.\nIf there is a conflict of information, please refer to the official schedule.\n\nDay 1\n\n\n\nTime\nTopic\nCourse material\n\n\n\n\n9:00 - 9:45\nIntroduction to the course\nLecture\n\n\n10:00 - 11:30\nLab: Introduction to R and Rstudio\nLab: intro to R, Code\n\n\n13:00 - 14:00\nData screening and multiple testing\nLecture\n\n\n14:00 - 16:00\nQ&A\n\n\n\n\n\n\nDay 2\n\n\n\nTime\nTopic\nCourse material\n\n\n\n\n9:00 - 9:45\nLab: Data screening and multiple testing\n\n\n\n10:00 - 11:30\nExercises\nLab: multiple testing, Code, Code (solution)\n\n\n13:00 - 14:00\nData visualization, dimensional reduction\nLecture\n\n\n14:00 - 16:00\nQ&A\n\n\n\n\n\n\nDay 3\n\n\n\nTime\nTopic\nCourse material\n\n\n\n\n9:00 - 9:45\nLab: Data visualization, dimensional reduction\n\n\n\n10:00 - 11:30\nExercises\nLab: PCA, Code\n\n\n13:00 - 14:00\nClustering\nLecture\n\n\n14:00 - 16:00\nQ&A\n\n\n\n\n\n\nDay 4\n\n\n\nTime\nTopic\nCourse material\n\n\n\n\n9:00 - 9:45\nLab: Clustering and heatmaps\n\n\n\n10:00 - 11:30\nExercises\nLab: clustering, Code\n\n\n13:00 - 14:00\nTake-home exam simulation\n\n\n\n14:00 - 16:00\nQ&A\n\n\n\n\nPapers\n\nPaper I: Cappelletti 2022\nPaper II: Ankill 2022" + "text": "Welcome!\nThis is the course website for MED3007: Statistical Principles in Genomics at the Faculty of Medicine, University of Oslo.\nThe course website is developed by the instructors of the course, hosted for free and public accessible on Github. Course material can be found in the github repository.\n\nStructure\n\nGet Started provides some information about software installation, data download and code.\nR Lab and Code hosts the lab session exercises and code.\n\n\n\n\n\n\n\nOfficial course webpage\n\n\n\nPlease refer to the official course page by University of Oslo for information related to application, evaluation and other administrative matters.\n\n\n\n\nPreparation before the course starts\nYou should have R installed on your computer before the course. 
More on preparation please read here.\nIf you have trouble setting it up with your own laptop, you can also use the PC in the PC lab.\n\n\n\nSchedule\nYou can find the time and place on the official course schedule at the University of Oslo course website.\nIf there is a conflict of information, please refer to the official schedule.\n\nDay 1\n\n\n\nTime\nTopic\nCourse material\n\n\n\n\n9:00 - 9:45\nIntroduction to the course\nLecture\n\n\n10:00 - 11:30\nLab: Introduction to R and Rstudio\nLab: intro to R, Code\n\n\n13:00 - 14:00\nData screening and multiple testing\nLecture\n\n\n14:00 - 16:00\nQ&A\n\n\n\n\n\n\nDay 2\n\n\n\nTime\nTopic\nCourse material\n\n\n\n\n9:00 - 9:45\nLab: Data screening and multiple testing\n\n\n\n10:00 - 11:30\nExercises\nLab: multiple testing, Code, Code (solution)\n\n\n13:00 - 14:00\nData visualization, dimensional reduction\nLecture\n\n\n14:00 - 16:00\nQ&A\n\n\n\n\n\n\nDay 3\n\n\n\nTime\nTopic\nCourse material\n\n\n\n\n9:00 - 9:45\nLab: Data visualization, dimensional reduction\n\n\n\n10:00 - 11:30\nExercises\nLab: PCA, Code\n\n\n13:00 - 14:00\nClustering\nLecture\n\n\n14:00 - 16:00\nQ&A\n\n\n\n\n\n\nDay 4\n\n\n\nTime\nTopic\nCourse material\n\n\n\n\n9:00 - 9:45\nLab: Clustering and heatmaps\n\n\n\n10:00 - 11:30\nExercises\nLab: clustering, Code\n\n\n13:00 - 14:00\nTake-home exam simulation\nExam\n\n\n14:00 - 16:00\nQ&A\n\n\n\n\nPapers\n\nPaper I: Cappelletti 2022\nPaper II: Ankill 2022" }, { "objectID": "about.html", @@ -186,14 +186,14 @@ "href": "lab/overview.html#lab-notes-and-r-scripts", "title": "R Lab: Overview", "section": "Lab notes and R scripts", - "text": "Lab notes and R scripts\n\n\n\nDay\nLab notes\nR script\nPresentation\n\n\n\n\nDay 1\nDay 1: Introduction to R\nCode\nSlides\n\n\n\n(supplement): Navigate RStudio and workspace\n\n\n\n\nDay 2\nDay 2: Multiple testing\nCode, Code (solution)\n\n\n\nDay 3\nDay 3: Principal Component Analysis\nCode\nSlides\n\n\n\n\n\n\n\n\nDay 4\nDay 4: Clustering\nCode" + "text": "Lab notes and R scripts\n\n\n\nDay\nLab notes\nR script\nPresentation\n\n\n\n\nDay 1\nDay 1: Introduction to R\nCode\nSlides\n\n\n\n(supplement): Navigate RStudio and workspace\n\n\n\n\nDay 2\nDay 2: Multiple testing\nCode, Code (solution)\n\n\n\nDay 3\nDay 3: Principal Component Analysis\nCode\nSlides\n\n\n\n\n\n\n\n\nDay 4\nDay 4: Clustering\nCode\nSlides" }, { "objectID": "lab/lab_day3_pca.html", "href": "lab/lab_day3_pca.html", "title": "R Lab (day 3): Data exploration, Principal Component Analysis", "section": "", - "text": "Download datasets here or from Canvas.\nR script: Code\nLab Lecture" + "text": "Download datasets here or from Canvas.\nR script: Code\nLab Lecture: Slides" }, { "objectID": "lab/lab_day3_pca.html#nci60", @@ -383,5 +383,26 @@ "title": "R Lab (day 4): Clustering", "section": "Exercise 2: Gene expression data", "text": "Exercise 2: Gene expression data\n(CH12Ex13 from statistical learning)\nWe use the Ch12Ex13.csv data to repeat some of the clustering analysis we did.\nThe first 20 samples are from healthy patients, while the second 20 are from a diseased group.\nLoad in the data using read.csv(). You will need to select header=F. Alternatively: load in the data using “Import dataset” in the upper right window, and click “no” on the “Heading” option.\nCarry out both hierarchical clustering and K-means clustering. You should choose the most meaningful number of clusters (think about how many groups of patients we have!). 
Compare the results.\nNote: remember that the data has genes on the rows and patients on the columns. You need to transpose the data so that the orders are reversed.\n\n# load in the data using read.csv(). You will need to select header=F.\ndata <- read.csv(\"data/Ch12Ex13.csv\", header=FALSE)\ndim(data)\n\n[1] 1000 40\n\n# transpose the data, so that we have each row is one patient (subject)\ndata <- t(data) \n\nNow the first 20 rows are measurements from healthy patients (group 0), and 21-50 rows are the disease patients (group 1). We can denote this information in a vector like this.\n\ntrue.groups <- c( rep(0,20), rep(1,20))\n\n\nHierarchical clustering\nYou can use different linkage options and distance metrics of your choosing. For example, with complete linkage the code is like this.\n\ndata.dist <- dist(data) # need to compute the distance matrix\nhclust.df <- hclust(data.dist, method=\"complete\" )\n#alternatives:\n#hclust.df <- hclust( D, method=\"average\" )\n#hclust.df <- hclust( D, method=\"single\" )\n\nWe can keep 2 clusters with cutree. Then do a cross tabulation of the true labels and clustered results: how well do they correspond?\n\n# find the clusters\npredicted <- cutree( hclust.df, k=2 )\n\n# How well does our clustering predict health vs. diseased\ntable(predicted, true.groups )\n\n true.groups\npredicted 0 1\n 1 20 0\n 2 0 20\n\n\n\n\nK-means\nNow you can use K-means to identify 2 clusters.\n\npredicted.kmean <- kmeans(data, 2, nstart=20)$cluster\n# agreement with true label\ntable(predicted.kmean, true.groups )\n\n true.groups\npredicted.kmean 0 1\n 1 20 0\n 2 0 20\n\n\nBoth methods seem to do work decently for the task." + }, + { + "objectID": "lab/lab_day4_clustering.html#exercise-1-food", + "href": "lab/lab_day4_clustering.html#exercise-1-food", + "title": "R Lab (day 4): Clustering", + "section": "Exercise 1: Food", + "text": "Exercise 1: Food\nWe use the same Food.txt data to illustrate two concepts: hierarchical clustering, and heatmap.\nThis is not a genomics dataset, but for the ease of interpretability, we use it for teaching purposes.\nLet us load the dataset.\n\nfood <- read.table('data/Food.txt', header=T)\n# we change the name from pulses to a more common name, legume\ncolnames(food)[7] <- 'Legume'\nhead(food) # print first 6 lines \n\n Meat Pigs Eggs Milk Fish Cereals Legume Fruit\nAlbania 10.1 1.4 0.5 8.9 0.2 42.3 5.5 1.7\nAustria 8.9 14.0 4.3 19.9 2.1 28.0 1.3 4.3\nBelg.Lux. 13.5 9.3 4.1 17.5 4.5 26.6 2.1 4.0\nBulgaria 7.8 6.0 1.6 8.3 1.2 56.7 3.7 4.2\nCzechoslovakia 9.7 11.4 2.8 12.5 2.0 34.3 1.1 4.0\nDenmark 10.6 10.8 3.7 25.0 9.9 21.9 0.7 2.4\n\n\nWe scale the data (also called standardize, or normalize sometimes) so that each column (feature, variable) has 0 mean and 1 variance. We call the scaled data food_s.\n\nfood_s <- scale(food)\nhead(food_s) # print first 6 lines\n\n Meat Pigs Eggs Milk Fish\nAlbania 0.08126490 -1.8299828 -2.2437259 -1.15570645 -1.20028213\nAustria -0.27725673 1.6636208 1.2335002 0.39161231 -0.64187467\nBelg.Lux. 1.09707621 0.3604512 1.0504883 0.05401549 0.06348211\nBulgaria -0.60590157 -0.5545403 -1.2371605 -1.24010566 -0.90638347\nCzechoslovakia -0.03824231 0.9427184 -0.1390890 -0.64931122 -0.67126454\nDenmark 0.23064892 0.7763564 0.6844645 1.10900556 1.65053488\n Cereals Legume Fruit\nAlbania 0.9159176 1.2227536 -1.35040507\nAustria -0.3870690 -0.8923886 0.09091397\nBelg.Lux. 
-0.5146342 -0.4895043 -0.07539207\nBulgaria 2.2280161 0.3162641 0.03547862\nCzechoslovakia 0.1869740 -0.9931096 -0.07539207\nDenmark -0.9428885 -1.1945517 -0.96235764\n\n\n\nDistances\nTo do hierarchical clustering, the most convenient command is hclust(). As input you would need a distance between the subjects (patients, or countries in this example). We do it on the scaled data.\nThe command to compute pair-wise distance is dist(). By default, the distance being computed is the Euclidean distance (details optional). Euclidean distance is possibly the most commonly used metric, but there are others. See ?dist() to find out more options.\nWe can present the pair-wise distances in a matrix format. You can see that this matrix is symmetric, with 0 on the diagonal - this should be intuitive: the distance between A - B is the same as B - A, and the distance between A and itself is 0.\n\n# compute distance\nfood_dist <- dist(food_s)\n# round(food_dist, digits = 2) # try this yourself to see what it does\n# alternatively, look at this as a matrix\nfood_dist_matrix <- as.matrix(food_dist)\nround(food_dist_matrix[1:5, 1:5], digits = 2) # first 5 row 5 col\n\n Albania Austria Belg.Lux. Bulgaria Czechoslovakia\nAlbania 0.00 5.95 5.13 2.77 4.44\nAustria 5.95 0.00 2.11 4.71 1.98\nBelg.Lux. 5.13 2.11 0.00 4.45 2.20\nBulgaria 2.77 4.71 4.45 0.00 3.17\nCzechoslovakia 4.44 1.98 2.20 3.17 0.00\n\n\n\n\n\n\n\n\nOptional: Euclidean disance\n\n\n\nYou can check the Euclidean distance between Albania and Austria is indeed 5.95. This distance is the square root of the sum of squared differences between two subjects in all their measurements.\n\n\n\nround(food_s[1:2,], digits = 2) # we only keep first 2 digits\n\n Meat Pigs Eggs Milk Fish Cereals Legume Fruit\nAlbania 0.08 -1.83 -2.24 -1.16 -1.20 0.92 1.22 -1.35\nAustria -0.28 1.66 1.23 0.39 -0.64 -0.39 -0.89 0.09\n\n# take the data for two countries each \nalbania <- round(food_s[1,], digits = 2)\naustria <- round(food_s[2,], digits = 2)\n# compute difference between each col\nd <- albania - austria\nd\n\n Meat Pigs Eggs Milk Fish Cereals Legume Fruit \n 0.36 -3.49 -3.47 -1.55 -0.56 1.31 2.11 -1.44 \n\n# euclidean distance: square each element, sum together, and take a square root\nsqrt(sum(d^2)) \n\n[1] 5.942096\n\n\n\n\nHierarchical clustering\nNow that we have computed the distance food_dist, we plug it in the clustering algorithm, hclust().\nWe try the complete linkage method, by specifying method = 'complete'. The result is saved as hc.complete. You can visualize it, and add label of the country names to make it easier to read.\n\nhc.complete <- hclust(food_dist, method=\"complete\")\nplot(hc.complete, labels=rownames(food), main=\"Complete Linkage\", xlab=\"\", sub=\"\")\n\n\n\n\n\n\nLinkage, dissimilarity, scaling\nHierarchical clustering is a class of methods, and there are a variety of options to set.\n\nLinkage (by seting method inside hclust()): complete, single, average\nDissimilarity: Euclidean, correlation, …\nScaling: scaled data (mean 0 variance 1) or unscaled, original data\n\nThere is no definite guide on which combination works the best, hence you can try them out and see what could make most sense. 
Again, in unsupervised learning data do not have outcome labels, so the interpretation is left for the domain experts to make.\n\n# single linkage\nhc.single <- hclust(food_dist, method=\"single\")\nplot(hc.single, labels=rownames(food), main=\"Single Linkage\", xlab=\"\", sub=\"\")\n\n\n\n# average linkage\nhc.average <- hclust(food_dist, method=\"average\")\nplot(hc.average, labels=rownames(food), main=\"Average Linkage\", xlab=\"\", sub=\"\")\n\n\n\n\n\n# unscaled data, complete linkage\nhc.unscaled <- hclust(dist(food), method=\"complete\")\nplot(hc.unscaled, labels=rownames(food), main=\"Complete linkage with unscaled features\", xlab=\"\", sub=\"\")\n\n\n\n\n\n# correlation as dissimiarity, rather than euclidean distance\ndd <- as.dist(1-cor(t(food_s))) # compute the metric\nhc.corr <- hclust(dd, method=\"complete\") # cluster\nplot(hc.corr, labels=rownames(food), main=\"Complete linkage with correlation-based distance\", xlab=\"\", sub=\"\")\n\n\n\n\n\n\nHeatmap\nHeatmap is a visualization tool to plot data of similar values in similar colors, so that you can identify visualy if there is any pattern. It can also be combined with hierarchical clustering - this is actually the default outcome: dendrograms are displayed for both rows and column.\n\n# make heatmap on the scaled data\nheatmap(food_s)\n\n\n\n\nTo preserve the original ordering of the columns and rows, you can specify Rowv = NA, Colv = NA.\n\n# no clustering for row or col, this preserves the original ordering\nheatmap(food_s, Rowv = NA, Colv = NA)\n\n\n\n\nCan also only do clustering for row only (or column only).\n\n# only clustering for row\nheatmap(food_s, Colv = NA)" + }, + { + "objectID": "lab/lab_day4_clustering.html#exercise-2-nci60", + "href": "lab/lab_day4_clustering.html#exercise-2-nci60", + "title": "R Lab (day 4): Clustering", + "section": "Exercise 2: NCI60", + "text": "Exercise 2: NCI60\nWe look at the NCI60 data again. First load the dataset.\n\nlibrary(ISLR)\n# or, load('data/NCI60.RData')\nnci.labs <- NCI60$labs # Sample labels (tissue type)\nnci.data <- NCI60$data # Gene expression data set\n\n\nHierarchical clustering\nWe start by scaling the data, and calculate the distance matrix (using the Euclidean distance), and then investigate different linkage methods.\n\n# Scale the data to zero mean and unit variance:\nsd.data <- scale(nci.data)\n\n# Calculate the distance matrix \n# equivalent: data.dist <- dist(sd.data, method=\"euclidean\")\ndata.dist <- dist(sd.data)\n\nNext we perform hierarchical clustering with distance matrix as input. The function we use is hclust(). We specify the linkage method to be complete.\nOnce the result is saved in hc.complete object, you can plot the dendrogram.\n\n# Perform clustering\nhc.complete <- hclust(data.dist, method=\"complete\")\n\n# names(hc.complete)\nplot(hc.complete, labels=nci.labs, main=\"Complete Linkage\", xlab=\"\", sub=\"\")\n\n\n\n\nThe object hc.complete contains a lot of information. To get the information, you can use the $ operator.\nYou should refer to the documentation for hclust() to see a complete list of output. 
Use ?hclust to get the documentation on how to use the function.\n\nhc.complete$dist.method # distance method\n\n[1] \"euclidean\"\n\n# hc.complete$merge # order of aggregations of samples / clusters\n# hc.complete$height # distance at which aggregations happen\n# hc.complete$labels # labels (numeric, since we don't know the original categories!)\n# hc.complete$method\n# hc.complete$call\n\nWe can try different linkage methods and see how the clustering results differ. Change the method argument in the function, and plot the results.\n\nhc.average <- hclust(data.dist, method=\"average\")\nhc.single <- hclust(data.dist, method=\"single\")\n\nplot(hc.average, labels=nci.labs, main=\"Average Linkage\", xlab=\"\", sub=\"\")\nplot(hc.single, labels=nci.labs, main=\"Single Linkage\", xlab=\"\", sub=\"\")\n\nNow we focus on complete linkage only.\nFirst, we use cutree() to compare the results when the data are separated into either 2 or 4 clusters.\n\n# Compare 2 clusters and 4 clusters:\nhc.clusters <- cutree(hc.complete, c(2, 4))\nhead(hc.clusters) # print first 6 results\n\n 2 4\nV1 1 1\nV2 1 1\nV3 1 1\nV4 1 1\nV5 1 2\nV6 1 2\n\n# cross tabulation\ntable(hc.clusters[,\"2\"], hc.clusters[,\"4\"])\n\n \n 1 2 3 4\n 1 40 7 0 0\n 2 0 0 8 9\n\n\nIt is more straightforward to check the results with a dendrogram.\n\n# visualize the cuts\n# how do you know where to draw the line? check height\nheights <- hc.complete$height\ntail(heights, 4) # print the last 4\n\n[1] 137.5633 141.2472 142.9218 162.2074\n\nplot(hc.complete, labels=nci.labs, main=\"Complete Linkage\", xlab=\"\", sub=\"\")\nabline(h=140, col=\"red\") # 4 clusters\nabline(h=150, col=\"blue\") # 2 clusters\n\n\n\n\nThe way to interpret the height variable is simple: it is where two clusters are merged into one. For example, the largest cluster corresponds to the last value of height (162.2) - if you check the figure, it is exactly where the horizontal line is merging the two groups. Similarly, 142.9 is where three groups became two, 141.2 is where four groups became three. If you draw a line at 140, it points out the four clusters.\nHow are the labels distributed between clusters? We can focus on 4 cluster situation, and use table() to list out which cancer types is merged in which of the four clusters.\nFor example, breast cancer appears in all but 3rd cluster; melanoma only appears in the first clsuter; so on so forth.\n\ntable(hc.clusters[,\"4\"], nci.labs)\n\n nci.labs\n BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro MCF7D-repro\n 1 2 3 2 0 0 0 0 0\n 2 3 2 0 0 0 0 0 0\n 3 0 0 0 1 1 6 0 0\n 4 2 0 5 0 0 0 1 1\n nci.labs\n MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN\n 1 8 8 6 2 8 1\n 2 0 1 0 0 1 0\n 3 0 0 0 0 0 0\n 4 0 0 0 0 0 0\n\n\nFinally, we see what happens if we use unscaled data instead of scaled data, or if we use a correlation-based distance metric instead of the Euclidean distance.\nCompare the dendrograms: How different are the resulting clusterings? 
Do you recognise subclusters that are consistent?\n\n# Compare scaled data versus non-scaled data:\nhc.unscaled <- hclust(dist(nci.data), method=\"complete\")\nplot(hc.unscaled, labels=nci.labs, main=\"Complete linkage with unscaled features\", xlab=\"\", sub=\"\")\n\n\n\n# Compare Euclidean distance with correlation-based distance:\ndd <- as.dist(1-cor(t(sd.data)))\nhc.corr <- hclust(dd, method=\"complete\")\nplot(hc.corr, labels=nci.labs, main=\"Complete linkage with correlation-based distance\", xlab=\"\", sub=\"\")\n\n\n\n\n\n\nK-means clustering\nIn this section we explore the K-means clustering on the same dataset.\nIn contrast to the hierarchical clustering which requires a distance as input, with K-means you would provide the data matrix. The data matrix can be scaled (centered and with unit variance), or unscaled.\nIn this example we use scaled data computed from before, sd.data.\n\nset.seed(4) # set random seed\nkm.out4 <- kmeans(sd.data, centers = 4, nstart=20)\nkm.out4$cluster\n\n V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 \n 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 \nV21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39 V40 \n 4 4 4 1 1 4 1 4 1 4 4 1 1 3 3 3 3 3 3 3 \nV41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V60 \n 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 \nV61 V62 V63 V64 \n 2 2 2 2 \n\n\nRead the help file ?kmeans to understand what the argument nstart=20 does. Comparing an analysis with nstart=20 versus nstart=1 demonstrates how the cluster results can be improved if we allow more evaluations with different randomly chosen starting centroids.\nSet a different random seed, say 3 (as long as it’s different from the one you used before), and run the analysis again. This time we use a different nstart\n\n# different starting centroids improve the clustering:\nset.seed(3)\nkm.out <- kmeans(sd.data, centers = 4, nstart=1)\nkm.out$cluster # cluster label\n\n V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 \n 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \nV21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39 V40 \n 1 1 1 3 3 1 3 1 3 1 1 3 3 4 4 4 4 4 4 4 \nV41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V60 \n 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 \nV61 V62 V63 V64 \n 2 2 2 2 \n\n\n\n\nCompare with hierarchical clustering\n\n# we can directly compare the k-means result (along rows)\n# with the hierarchical clustering result (along columns)\ntable(km.out4$cluster, hc.clusters[,\"4\"], deparse.level=2)\n\n hc.clusters[, \"4\"]\nkm.out4$cluster 1 2 3 4\n 1 11 0 0 9\n 2 9 0 0 0\n 3 0 0 8 0\n 4 20 7 0 0\n\n\nFrom the results, you can see that the results are slightly different between the two methods. Keep in mind that in unsupervised learning you do not have the real outcome label (such as the cancer types here), so you need to try a few different methods and compare the outputs, and make interpretations accordingly.\n\n\nVisualize clusters\nWe can visualise the K-means clustering results of high-dimensional data by using PCA for dimension reduction. 
We plot the first two principal components and colour the data points (= individual cell lines) by their assigned cluster from K-means.\n\n# first, run PCA again on the NCI60 data\npr.out <- prcomp(nci.data, scale=TRUE)\n\n# more cluster options\nkm.out2 <- kmeans(sd.data, 2, nstart=20)\nkm.out3 <- kmeans(sd.data, 3, nstart=20)\n\n# we can now visualise the K-Means results by labelling the data points\n# in a plot of the scores of the first 2 principal components:\npar(mfrow=c(1,3))\nplot(pr.out$x[,1:2], col=(km.out2$cluster+1), main=\"K-Means with K=2\",\n xlab=\"PC 1\", ylab=\"PC 2\", pch=20)\nplot(pr.out$x[,1:2], col=(km.out3$cluster+1), main=\"K-Means with K=3\",\n xlab=\"PC 1\", ylab=\"PC 2\", pch=20)\nplot(pr.out$x[,1:2], col=(km.out4$cluster+1), main=\"K-Means with K=4\",\n xlab=\"PC 1\", ylab=\"PC 2\", pch=20)\n\n\n\n\nCompare with the plot from Exercise 2 yesterday (left panel) along with the cancer type labels. The clusters from K-means seem to correspond decently to partition the data into groups.\n\npar(mfrow=c(1,1))\n\nCols=function(vec){\n cols=rainbow(length(unique(vec)))\n return(cols[as.numeric(as.factor(vec))])\n}\nplot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab=\"PC 1\",ylab=\" PC 2\")\nlegend('topleft', col=rainbow(length(unique(nci.labs))), legend=unique(nci.labs), bty='n', lwd=2, cex=.6)\n\n\n\n\n\n\nHeatmap\nA heatmap is another way to visualize the clusters from the data. We use the principal components rather than the raw data, as PCs are already explaining a large amount of variability in the over 6000 features.\nSimilar values are presented with similar colors.\n\n## We use the scores of the PCA on the NCI60 data, to reduce dimension\nscores <- pr.out$x\nscores[1:5, 1:5] # first 5 pc, first 5 measurements\n\n PC1 PC2 PC3 PC4 PC5\nV1 -19.68245 3.527748 -9.7354382 0.8177816 -12.511081\nV2 -22.90812 6.390938 -13.3725378 -5.5911088 -7.972471\nV3 -27.24077 2.445809 -3.5053437 1.3311502 -12.466296\nV4 -42.48098 -9.691742 -0.8830921 -3.4180227 -41.938370\nV5 -54.98387 -5.158121 -20.9291076 -15.7253986 -10.361364\n\n# default choices\nheatmap(pr.out$x)\n\n\n\n\nYou can remove the dendrogram on the PCs, only keeping the ones for cancer types. Now you see that the PCs have kept their original order from 1 to 64.\n\n# hc.corr is the result from hclust. check the section on hierarchical clustering\nheatmap(pr.out$x, Rowv = as.dendrogram(hc.corr), Colv = NA)\n\n\n\n\nYou can also reduce the number of PCs, and add titles to the plot annd y-axis.\n\npar(cex.main = .7)\nheatmap(pr.out$x[,1:40], Rowv = as.dendrogram(hc.corr), Colv = NA,\n labRow = nci.labs, main = 'Heatmap of the scores of the first 40 PCs on the NCI60 data')" + }, + { + "objectID": "lab/lab_day4_clustering.html#exercise-3-gene-expression-data", + "href": "lab/lab_day4_clustering.html#exercise-3-gene-expression-data", + "title": "R Lab (day 4): Clustering", + "section": "Exercise 3: Gene expression data", + "text": "Exercise 3: Gene expression data\n(CH12Ex13 from statistical learning)\nWe use the Ch12Ex13.csv data to repeat some of the clustering analysis we did.\nThe first 20 samples are from healthy patients, while the second 20 are from a diseased group.\nLoad in the data using read.csv(). You will need to select header=F. Alternatively: load in the data using “Import dataset” in the upper right window, and click “no” on the “Heading” option.\nCarry out both hierarchical clustering and K-means clustering. 
You should choose the most meaningful number of clusters (think about how many groups of patients we have!). Compare the results.\nNote: remember that the data has genes on the rows and patients on the columns. You need to transpose the data so that the orders are reversed.\n\n# load in the data using read.csv(). You will need to select header=F.\ndata <- read.csv(\"data/Ch12Ex13.csv\", header=FALSE)\ndim(data)\n\n[1] 1000 40\n\n# transpose the data, so that we have each row is one patient (subject)\ndata <- t(data) \n\nNow the first 20 rows are measurements from healthy patients (group 0), and 21-50 rows are the disease patients (group 1). We can denote this information in a vector like this.\n\ntrue.groups <- c( rep(0,20), rep(1,20))\n\n\nHierarchical clustering\nYou can use different linkage options and distance metrics of your choosing. For example, with complete linkage the code is like this.\n\ndata.dist <- dist(data) # need to compute the distance matrix\nhclust.df <- hclust(data.dist, method=\"complete\" )\n#alternatives:\n#hclust.df <- hclust( D, method=\"average\" )\n#hclust.df <- hclust( D, method=\"single\" )\n\nWe can keep 2 clusters with cutree. Then do a cross tabulation of the true labels and clustered results: how well do they correspond?\n\n# find the clusters\npredicted <- cutree( hclust.df, k=2 )\n\n# How well does our clustering predict health vs. diseased\ntable(predicted, true.groups )\n\n true.groups\npredicted 0 1\n 1 20 0\n 2 0 20\n\n\n\n\nK-means\nNow you can use K-means to identify 2 clusters.\n\npredicted.kmean <- kmeans(data, 2, nstart=20)$cluster\n# agreement with true label\ntable(predicted.kmean, true.groups )\n\n true.groups\npredicted.kmean 0 1\n 1 20 0\n 2 0 20\n\n\nBoth methods seem to do work decently for the task." } ] \ No newline at end of file diff --git a/exam/MED3007_exam_data_V21.Rdata b/exam/MED3007_exam_data_V21.Rdata new file mode 100644 index 0000000..dbd3fa8 Binary files /dev/null and b/exam/MED3007_exam_data_V21.Rdata differ diff --git a/exam/MED3007_exam_simulation.R b/exam/MED3007_exam_simulation.R new file mode 100644 index 0000000..4b53e4b --- /dev/null +++ b/exam/MED3007_exam_simulation.R @@ -0,0 +1,198 @@ +###------------------------------### +### MED3007: Exam simulation ### +###------------------------------### + +## Load exam data set using "load()" +load("data/MED3007_exam_data_V21.Rdata") + +dim(clin) +dim(expr) +head(clin) + +## Remember: we want to have the variables on the columns in our datasets, i.e. we want +## there to be 72 rows (the number of patients) for both datasets "expr" and "clin". +## For "expr" we should have 7129 columns, since they are the genes, a.k.a our variables. 
+ +# Use function t() to transpose the dataset so that it is on the correct form +data <- t(expr) # rename "expr" to "data" + +# Make group variable (here that is whether a patient has ALL or AML) +groups <- clin$ALL.AML + +## There are 2 groups: one possible analysis -> testing +##----------------------------------------------------- + +# Apply t-test to all genes in each of the groups using "apply" and "t.test" -> histogram +alpha <- 0.05 +pval.ttest <- apply(data, 2, function(x){t.test(x[which(groups=="ALL")], x[which(groups=="AML")])$p.value}) +hist(pval.ttest) + +# Simple way of calculation adjusted p-values (using p.adjust()): +pval.fwer <- p.adjust(pval.ttest, method = "bonferroni") +pval.fdr <- p.adjust(pval.ttest, method = "BH") + +# Number of significant p-values after Bonferroni correction +sum(pval.fwer < alpha) # conservative + +# Number of significant p-values after BH correction +sum(pval.fdr < alpha) # less conservative + +# Let us find the significant genes after correction +sign.genes.Bonf <- which(pval.fwer < alpha) +sign.genes.BH <- which(pval.fdr < alpha) + +# compute the group means using "apply" and "tapply" +genes.gr.means <- apply(data, 2, function(x){tapply(x, groups, mean)}) + +# Plot of the overall group means with significant genes +plot(genes.gr.means[1,], xlab = 'Genes', main = 'Significant genes, group means', + ylab = 'Mean expression', ylim = range(genes.gr.means)) +points(genes.gr.means[2,], col=2) +points(sign.genes.BH, genes.gr.means[1,sign.genes.BH], col=4, pch=4, lwd=2) +points(sign.genes.BH,genes.gr.means[2,sign.genes.BH], col=4, pch=4, lwd=2) +points(sign.genes.Bonf, genes.gr.means[1,sign.genes.Bonf], col=5, pch=4, lwd=2) +points(sign.genes.Bonf,genes.gr.means[2,sign.genes.Bonf], col=5, pch=4, lwd=2) +abline(h=0) +legend(6500, 25000, col=5:4, bty='n', lwd=2, pt.lwd=.5, legend = c('Bonferroni','B-H'), cex=.6) # add legend + +# a slightly better plot: standardized group mean differences, with significant genes +genes.gr.diff <- apply(genes.gr.means, 2, diff)/apply(data, 2, sd) +plot(genes.gr.diff, xlab = 'Genes', main = 'Significant genes, standardized group differences', + ylab = 'difference in group means / std dev') +points(sign.genes.BH,genes.gr.diff[sign.genes.BH], col=4, pch=4, lwd=2) +points(sign.genes.Bonf,genes.gr.diff[sign.genes.Bonf], col=5, pch=4, lwd=2) +abline(h=0) +legend(6500, 1.8, col=5:4, bty='n', lwd=2, pt.lwd=.5, legend = c('Bonferroni','B-H'), cex=.6) # add legend + + + + +## There are 2 groups: another possible analysis -> clustering +##-------------------------------------------------------- + +# Calculate the distance matrix, here with the Euclidean distance +data.dist <- dist(data, method = "euclidean") + +# Perform clustering with different linkage methods: +hc.complete <- hclust(data.dist, method="complete") +hc.average <- hclust(data.dist, method="average") +hc.single <- hclust(data.dist, method="single") + +## Compare the dendrograms and comment on the results in light of how +## the different linkage methods work. +par(mfrow=c(1,3)) +plot(hc.single, labels=groups, main="Single Linkage", xlab="", sub="") +plot(hc.complete, labels=groups, main="Complete Linkage", xlab="", sub="") +plot(hc.average, labels=groups, main="Average Linkage", xlab="", sub="") + +# How to decide between linkage methods? We look for the most "compact" clusters, where +# the division between the new branches are a bit "balanced". 
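+
+# (Optional sketch, added for illustration) Besides re-drawing the dendrograms with
+# rect.hclust() below, a quick numeric check is to cut each tree into 2 groups and
+# compare the cluster sizes; a linkage that puts almost everything into one cluster is
+# usually less informative than one that gives reasonably balanced groups.
+sapply(list(single = hc.single, complete = hc.complete, average = hc.average),
+       function(hc) table(cutree(hc, k = 2)))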
+par(mfrow=c(1,3)) +plot(hc.single, main='Euclidian single', xlab='', labels=F, sub='') +rect.hclust(hc.single, k=2) +rect.hclust(hc.single, k=3) +rect.hclust(hc.single, k=4) +plot(hc.complete, main='Euclidean complete',xlab='', labels=F, sub='') +rect.hclust(hc.complete, k=2) +rect.hclust(hc.complete, k=3) +rect.hclust(hc.complete, k=4) +plot(hc.average, main='Euclidean average',xlab='', labels=F, sub='') +rect.hclust(hc.average, k=2) +rect.hclust(hc.average, k=3) +rect.hclust(hc.average, k=4) + +## The different linkages seems to give very different results! +## Complete linkage seems to give 2 clusters/groups, which we know to be the truth. +## It is also reassuring to see that when we try to divide the patients into more than 2 groups, +## we do not gain anything. Although the other two linkage methods seems to say there +## are no groups at all, only one big group. +## Let's see if we detect the true groups for the clustering result from complete linkage. + +# We use the function "cutree" to "cut" the dendrogram into both k=2 and k=3 groups +cluster.ec <- cutree(hc.complete, k=c(2,3)) # complete linkage, euclidian + +# How are the true labels distributed between clusters: +table(cluster.ec[,"2"], groups) # look only at k=2 + +# Hierarchical clustering was not able to detect the groups very well.. Let's try K-Means. + +## K-means clustering +set.seed(4) +km.out2 <- kmeans(data, 2, nstart=20) # 2 clusters +km.out3 <- kmeans(data, 3, nstart=20) # 3 clusters + +# We can directly compare the k-means result (along rows) +# with the hierarchical clustering result (along columns) +table(km.out2$cluster, cluster.ec[,"2"], deparse.level=2) +table(km.out3$cluster, cluster.ec[,"3"], deparse.level=2) +# Not too bad, but still some disagreement + +# How are the true labels distributed between clusters: +table(km.out2$cluster, groups) + +# Not very good here either, maybe even worse than hierarchcial clustering..? + + +## Also dimensional reduction via PCA can be an option +##---------------------------------------------------- + +pr.out <- prcomp(data, scale=TRUE) + +# Plot also the proportion of variance explained +# Proportion of variance explained +pr.var <- pr.out$sdev^2 +pve <- pr.var/sum(pr.var) +pve <- 100*pve +par(mfrow=c(1,2)) +plot(pve, type="o", ylab="PVE", xlab="Principal Component", col="blue") +plot(cumsum(pve), type="o", ylab="Cumulative PVE", xlab="Principal Component", col="red") + +# Can also try to plot only the first 10 components in the first plot to zoom in a bit +par(mfrow=c(1,1)) +plot(1:10,pve[1:10], type="o", ylab="PVE", xlab="Principal Component", col="blue") +# -> the first two components seem to explain a lot +# alternative: selecting the number of components by having a threshold on pve, f.ex 80% or 70% + +# Helper function for colors (for the different groups) +Cols=function(vec){ + cols=rainbow(length(unique(vec))) + return(cols[as.numeric(as.factor(vec))]) +} + +# Plot the different components (color for each group) +par(mfrow=c(1,2)) +plot(pr.out$x[,1:2], col=Cols(groups), pch=19, xlab="PC 1", ylab=" PC 2") +legend('bottomright', col=rainbow(length(unique(groups))), legend=paste('group ',unique(groups),sep=''), + bty='n', lwd=1, pt.lwd=.3, cex=.6) +plot(pr.out$x[,c(1,3)], col=Cols(groups), pch=19,xlab="PC 1",ylab=" PC 3") + +# Still not very clear, but the sub-plot on the right seems to show a small tendency towards +# two groups..? 
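+
+# (Optional sketch, added for illustration) Before moving on: to put a rough number on
+# "how well did the clustering above recover the two groups", one can turn a 2x2
+# confusion table into an agreement rate. Cluster labels are arbitrary (cluster 1 can be
+# either ALL or AML), so we take the better of the two possible label matchings.
+agreement <- function(tab) max(sum(diag(tab)), tab[1,2] + tab[2,1]) / sum(tab)
+agreement(table(cluster.ec[,"2"], groups))  # hierarchical clustering vs. true labels
+agreement(table(km.out2$cluster, groups))   # k-means vs. true labels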
+ +# Extra: hierarchical clustering *after* a PCA + +# Hierarchical clustering on the SCORES (weights) of the first 3 PCs +data.dist.pca3 <- dist(pr.out$x[,1:3], method = 'manhattan') +hclust.df <- hclust(data.dist.pca3, method="complete" ) + +par(mfrow=c(1,1)) +plot(hclust.df, labels=groups) +rect.hclust(hclust.df, k=4) + +predicted <- cutree(hclust.df, k=2) +table(predicted, groups) + +# K-means clustering on the SCORES (weights) of the first 3 PCs +predicted.kmean <- kmeans(pr.out$x[,1:3], 2, nstart=20)$cluster +table(predicted.kmean, groups) + +## Possible comment / conclusion: +## -> this last clustering on the PCA is explaining +## the Leukemia groups much better! + +## Remember: you do NOT need to do all of these analyses on the exam! One is enough. + +## Final note: why are the methods not working so well on this dataset?? These types of results are +## actually quite common, since real datasets are often quite messy and does not necessarily have +## a clear grouping structures. So: report what you see, even if the results are not "pretty". + diff --git a/exam/MED3007_exam_v21.pdf b/exam/MED3007_exam_v21.pdf new file mode 100644 index 0000000..f314d93 Binary files /dev/null and b/exam/MED3007_exam_v21.pdf differ diff --git a/exam/MED3007_summary.R b/exam/MED3007_summary.R new file mode 100644 index 0000000..f9ccfbf --- /dev/null +++ b/exam/MED3007_summary.R @@ -0,0 +1,248 @@ +###----------------------------------------------### +### MED3007: Summary of methods with RStudio ### +###----------------------------------------------### + +### OUTLINE +###--------- + +### 0. Statistical testing +### 1. Multiple testing with correction (Lab 1) +### 2. Principal component analysis (Lab 2) +### 3. Clustering (Lab 3) +### 3.1. Hierarchical clustering +### 3.2. K-means clustering + +# Set the working directory: +setwd("~/Dropbox_UiO/Dropbox/MED3007_2023/day 5") + +###-------------------------### +### 0. STATISTICAL TESTING ### +###-------------------------### + +library(readxl) # reading from excel can be done using the "readxl" package, so we need to load it first +data_exc <- read_excel("data/Testfil_Rcourse.xlsx") +data_exc <- as.data.frame(data_exc) # remember to always force your excel-dataset to be a dataframe! + +# A single t-test can be performed the following way (given that it is normally distributed) +out <- t.test(vitD_v1 ~ gender, data_exc) +# ..where we test if vitD_v1 is different across gender groups +out$p.value # print out the p-value from the t-test +out$conf.int # can also get the confidence interval +out + +# However, there are other tests you can do with similar syntax: +?wilcox.test() # nonparametric (non-normal data) +?chisq.test() +# .. and so on + +# Summary statistics +summary(data_exc) + + +###----------------------### +### 1. MULTIPLE TESTING ### +###----------------------### + +## Steps: +## 1. Load data +## 2. Make sure you have the variables (that you want to test) column-wise (look at the data!!) +## 3. Compute the p-values for all the variables you want to test. Nice to visualize it with a histogram! +## 4. To do the correction: +## a) Bonferroni: compute new (stricter) significance level: alpha/k, k=number of tests. +## You keep the original p-values < alpha/k (these are considered significant under Bonf. correction) +## b) B-H: sort the p-values from small to large, compute the thresholds for each p-value. +## You keep the original p-values < threshold. 
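+
+# (Optional toy sketch, added for illustration) A tiny made-up example of step 4b above,
+# showing what p.adjust(..., method = "BH") does behind the scenes. The p-values below
+# are invented purely for illustration.
+p.toy  <- c(0.001, 0.008, 0.039, 0.041, 0.30)  # already sorted, k = 5 tests
+bh.thr <- (1:5) / 5 * 0.05                     # B-H thresholds: (i/k) * alpha
+p.toy <= bh.thr                                # strictly, B-H keeps all p-values up to
+                                               # the largest index that passes
+p.adjust(p.toy, method = "BH") < 0.05          # the built-in adjustment gives the same decision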
+ +# Load in the data +data_csv <- read.csv("data/Ch10Ex11.csv", header=F) # data contains 40 samples of 1000 genes + +# Always good to have a look at the data +View(data_csv) + +# Remember to check if the variables (i.e genes) are on the columns! +dim(data_csv) + +# Need to transform the data so we have the variables as columns +data_csv <- t(data_csv) + +# Grouping variable: the samples were grouped into healthy and diseased patients +groups <- c(rep(1,20), rep(2,20)) + +# Need to decide on a target significance level +alpha <- .05 + +# Apply t-test to all genes in each of the groups using "apply" and "t.test" -> histogram +pval.ttest <- apply(data_csv, 2, function(x){t.test(x[which(groups==1)], x[which(groups==2)])$p.value}) +hist(pval.ttest) + +# Simple way of calculation adjusted p-values (using p.adjust()): +pval.fwer <- p.adjust(pval.ttest, method = "bonferroni") +pval.fdr <- p.adjust(pval.ttest, method = "BH") + +# Number of significant p-values after Bonferroni correction +sum(pval.fwer < alpha) # conservative + +# Number of significant p-values after BH correction +sum(pval.fdr < alpha) + +# Let us find the significant genes after correction +sign.genes.Bonf <- which(pval.fwer < alpha) +sign.genes.BH <- which(pval.fdr < alpha) + +# Plot of the overall means with significant genes +genes.means <- apply(data_csv, 2, mean) # compute the means using "apply" +plot(genes.means, xlab = 'Genes', main = 'Mean across samples', ylab = 'Mean expression') +points(sign.genes.Bonf, genes.means[sign.genes.Bonf], col=3, pch=16) +points(sign.genes.BH, genes.means[sign.genes.BH], col=4, pch=16) +legend('topright', col=3:4, bty='n', lwd=2, legend = c('Bonferroni','B-H')) # add colorcode + + +###-------------------------### +### 2. PCA: GENOMIC EXAMPLE ### +###-------------------------### + +## Steps: +## 1. Load data +## 2. Make sure you have the variables column-wise (look at the data!!) +## 3. Do PCA with "prcomp", and we typically set "scale=TRUE" inside "prcomp" +## 4. Visualize the results: +## a) Compute and plot proportion of variance explained (PVE), try to decide on +## how many principal components you would choose. +## b) Plot the first principal components. What do you see? +## Extra: add a color for the group labels (if they exist). + +# Do PCA with "prcomp" +pr.out <- prcomp(data_csv, scale=TRUE) + +# Proportion of variance explained +pr.var <- pr.out$sdev^2 +pve <- pr.var/sum(pr.var) +pve <- 100*pve +par(mfrow=c(1,2)) +plot(pve, type="o", ylab="PVE", xlab="Principal Component", col="blue") +plot(cumsum(pve), type="o", ylab="Cumulative PVE", xlab="Principal Component", col="red") + +# How many principal components would you keep to achieve a good dimension reduction, +# while keeping most of the variability in the data set? +mysel80 <- which(cumsum(pve) > 80)[1] # explains 80% of the variability +mysel70 <- which(cumsum(pve) > 70)[1] # explains 70% of the variability + +# Setting a threshold at 80% PVE is very common, which would here result in 29 +# components (much more managable than 1000!) +# Now what? +# If we decide to keep 29 components, we can for ex. detect highly expressed genes by +# inspecting the different principal components (the very first principal components +# should typically have high weights (both positive and negative weight) for the most +# interesting variables/genes, and so on..). 
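+
+# (Optional sketch, added for illustration) One way to do such an inspection: list the
+# genes with the largest absolute weights (loadings) on the first principal component.
+# "pr.out" is the prcomp object already created above in this script.
+pc1.loadings <- pr.out$rotation[, 1]
+head(sort(abs(pc1.loadings), decreasing = TRUE), 10)  # the 10 genes contributing most to PC 1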
+ +# Visualize results +# Helper function for colors (for the different groups) +Cols=function(vec){ + cols=rainbow(length(unique(vec))) + return(cols[as.numeric(as.factor(vec))]) +} + +# Plot the different components (color for each group) +par(mfrow=c(1,2)) +plot(pr.out$x[,1:2], col=Cols(groups), pch=19, xlab="PC 1", ylab=" PC 2") +plot(pr.out$x[,c(1,3)], col=Cols(groups), pch=19,xlab="PC 1",ylab=" PC 3") +legend('topleft', col=rainbow(length(unique(groups))), legend=paste('group ',unique(groups),sep=''), bty='n', lwd=2, cex=.6) + + +###---------------### +### 3. CLUSTERING ### +###---------------### + +## Steps: +## 1. Load data +## 2. Make sure you have the variables column-wise (--> look at the data!!) +## 3.1 Hierarchical clustering: +## a) Compute the distance matrix for your dataset using "dist" with a chosen distance measure, +## for ex. "euclidian" (but also good to try several distances) +## b) Do the clustering with "hclust" with a linkage method, for ex. "complete" (but also here +## good to try several methods and chose the best at the end). +## c) Plot the dendograms to decide which method to chose, and how many clusters to chose. +## d) Use "cutree" to cut the dendogram so that you finally get the clustering of the data. +## Now you can f.ex compare the results from different methods, or, if you already have +## the group labels, you can now see if you were able to detect them! +## 3.2 K-means clustering: +## a) Set a random seed +## b) Run clustering with "kmeans" where you need to specify: the number of clusters, and number +## of initial starts. + + +## Hierarchical clustering +# We continue to use the same dataset, however with the samples shuffled +# (since they are now ordered by group) +myshuffle <- sample(dim(data_csv)[1]) +data_csv <- data_csv[myshuffle,] + +# Calculate the distance matrix (default = Euclidean): +data.dist <- dist(data_csv) + +# Alternatively, try all distances +data.dist.e <- dist(data_csv, method="euclidean") # this is now exactly the same as data.dist +data.dist.c <- dist(data_csv, method="canberra") +data.dist.m <- dist(data_csv, method="manhattan") + +# Perform clustering with different linkage methods: +hc.complete <- hclust(data.dist, method="complete") +hc.average <- hclust(data.dist, method="average") +hc.single <- hclust(data.dist, method="single") + +## Compare the dendrograms and comment on the results in light of how +## the different linkage methods work. +par(mfrow=c(1,3)) +plot(hc.single, labels=groups, main="Single Linkage", xlab="", sub="") +plot(hc.complete, labels=groups, main="Complete Linkage", xlab="", sub="") +plot(hc.average, labels=groups, main="Average Linkage", xlab="", sub="") + +# How to decide between linkage methods? We look for the most "compact" clusters. +par(mfrow=c(1,3)) +plot(hc.single, main='Euclidian single', xlab='', labels=F, sub='') +rect.hclust(hc.single, k=2) +rect.hclust(hc.single, k=3) +rect.hclust(hc.single, k=4) +plot(hc.complete, main='Euclidean complete',xlab='', labels=F, sub='') +rect.hclust(hc.complete, k=2) +rect.hclust(hc.complete, k=3) +rect.hclust(hc.complete, k=4) +plot(hc.average, main='Euclidean average',xlab='', labels=F, sub='') +rect.hclust(hc.average, k=2) +rect.hclust(hc.average, k=3) +rect.hclust(hc.average, k=4) + +# We can see that we do not gain very much by having 3 clusters, compared to 2.. But we +# can still compare the results. 
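+
+# (Optional sketch, added for illustration) A quick way to see what the extra cluster
+# "buys" us: compare the cluster sizes when the complete-linkage tree is cut into 2 vs.
+# 3 groups. If the third cluster only peels off a handful of samples, that supports k = 2.
+table(cutree(hc.complete, k = 2))
+table(cutree(hc.complete, k = 3))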
+ +# We use the function "cutree" to "cut" the dendrogram at a given number of groups +# Let's try with both k=2 and k=3 and see what is best +cluster.ec <- cutree(hc.complete, k=c(2,3)) # complete linkage, euclidian +cluster.ea <- cutree(hc.average, k=c(2,3)) # average linkage, euclidian +# Compare the two clustering results with each other with "table" +table(cluster.ec[,"2"], cluster.ea[,"2"]) # they completely agree! +table(cluster.ec[,"3"], cluster.ea[,"3"]) # here there are some minor disagreements --> conclusion: k=2!! + +# How are the true labels distributed between clusters: +table(cluster.ec[,"2"], groups[myshuffle]) # completely correct, the groups are detected! + +## K-means clustering +set.seed(4) +km.out2 <- kmeans(data_csv, 2, nstart=20) # 2 clusters +km.out3 <- kmeans(data_csv, 3, nstart=20) # 3 clusters + +# We can print the cluster values +km.out2$cluster + +# We can also directly compare the k-means result (along rows) +# with the hierarchical clustering result (along columns) +table(km.out2$cluster, cluster.ec[,"2"], deparse.level=2) # complete agreement! +table(km.out3$cluster, cluster.ec[,"3"], deparse.level=2) + +# How are the true labels distributed between clusters: +table(km.out2$cluster, groups[myshuffle]) # completely correct, the groups are detected! + +# Since there seems to be overall best agreement on the clusters when k=2 across all methods, we +# can we be fairly certain that this is the true clustering. However, in many other situations +# you will not be so certain. So: you should try several methods, and chose the result that seems +# to be most consistent across methods. \ No newline at end of file diff --git a/index.qmd b/index.qmd index b177db5..c8145eb 100644 --- a/index.qmd +++ b/index.qmd @@ -71,7 +71,7 @@ If there is a conflict of information, please refer to the official schedule. |:-------------:|:------------------------------------:|:-----------------:| | 9:00 - 9:45 | Lab: Clustering and heatmaps | | | 10:00 - 11:30 | Exercises | [Lab: clustering](https://ocbe-uio.github.io/course_med3007/lab/lab_day4_clustering.html), [Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab3_clustering.R) | -| 13:00 - 14:00 | Take-home exam simulation | | +| 13:00 - 14:00 | Take-home exam simulation | [Exam]() | | 14:00 - 16:00 | Q&A | | diff --git a/lab/code/MED3007_Lab3_clustering.R b/lab/code/MED3007_Lab3_clustering.R index a95f586..7f1c9e5 100644 --- a/lab/code/MED3007_Lab3_clustering.R +++ b/lab/code/MED3007_Lab3_clustering.R @@ -1,5 +1,84 @@ # code for clustering +# food ---- +# food example for clustering +# the clustered groups are easier to interpret, hence we use it for teaching + +# load data, and change the title of pulse to Legume +food <- read.table('./lab/data/Food.txt', header=T) +colnames(food)[7] <- 'Legume' +head(food) # first 6 lines of code + +# the food data has columns (features) that have quite big difference +# Scale the data to zero mean and unit variance, call it food_s +food_s <- scale(food) + + +## hierarchical clustering ---- + +# HC requires distances (not original data) as input +# Calculate the distance matrix on the scaled data +# (default method for distance = Euclidean) +food_dist <- dist(food_s) +# food_dist <- dist(food_s, method="euclidean") +# food_dist + +# food_dist is a vector of 300 elements +length(food_dist) # 300 + +# this is the number of pairs of data +# we have 25 data points, one for each country +# Albania vs Austria, Albania vs Belg.Lux, ... 
+# in total you have 25 * 24 / 2 pairs + + + +# Perform clustering with different linkage methods: +hc.complete <- hclust(food_dist, method="complete") +plot(hc.complete, labels=rownames(food), main="Complete Linkage", xlab="", sub="") + +# single linkage +hc.single <- hclust(food_dist, method="single") +plot(hc.single, labels=rownames(food), main="Single Linkage", xlab="", sub="") + +# average linkage +hc.average <- hclust(food_dist, method="average") +plot(hc.average, labels=rownames(food), main="Average Linkage", xlab="", sub="") + + +# unscaled data, complete linkage +hc.unscaled <- hclust(dist(food), method="complete") +plot(hc.unscaled, labels=rownames(food), main="Complete linkage with unscaled features", xlab="", sub="") + + +# correlation on scaled data, complete linkage +cor(food_s) # this is by col (food) +cor(t(food_s)) # by country + +# 1 minus makes them all positive +dd <- as.dist(1-cor(t(food_s))) +hc.corr <- hclust(dd, method="complete") +plot(hc.corr, labels=rownames(food), main="Complete linkage with correlation-based distance", xlab="", sub="") + + +## heatmap ---- +# heatmap (default) +# the default heatmap does clustering for both row and col variables +# dendrograms are also shown +heatmap(food_s) + +# you can specify the arguments so that no clustering is done +# the original order of col and row are kept +# heatmap with no clustering +heatmap(food_s, Rowv = NA, Colv = NA) + +# can also do clustering for only row (or col) +heatmap(food_s, Colv = NA) + + + + +# _________ ---- # NCI 60 ---- library(ISLR) nci.labs <- NCI60$labs # Sample labels (tissue type) @@ -148,45 +227,25 @@ plot(pr.out$x[,1:2], col=(km.out4$cluster+1), main="K-Means with K=4", # heatmap ---- - +# find out how to set argument for heatmap by reading the documentation ?heatmap -## let create a small heatmap, to fix ideas. -## We use the scores of the PCA on the NCI60 data, to reduce dimension +# We use the scores of the PCA on the NCI60 data, to reduce dimension -# default choices +# default heatmap (clustering done on both col and row) heatmap(pr.out$x) -# I use the previous dendrogram for better ordering of the patients, -# and I remove the dendrogram for the components +# clustering (on correlation) done for rows heatmap(pr.out$x, Rowv = as.dendrogram(hc.corr), Colv = NA) # I now plot less components for the sake of clarity, -# I add the patient's tumor type, and I give a title +# I add tumor type, and I give a title par(cex.main = .7) heatmap(pr.out$x[,1:40], Rowv = as.dendrogram(hc.corr), Colv = NA, labRow = nci.labs, main = 'Heatmap of the scores of the first 40 PCs on the NCI60 data') -## elbow plot - -# names(km.out2) -# # the within cluster sum-of-squares is within the object "withinss" -# # but we need to run much more k-means in order to decide... -# -# which.clust <- 1:15 -# within.clust.var <- NULL -# for(k in which.clust){ -# myresult <- mean(kmeans(sd.data, k, nstart=10)$withinss) -# within.clust.var <- c(within.clust.var, myresult) -# } -# # let's plot the values and look for the elbow -# par(mfrow=c(1,1)) -# plot(which.clust, within.clust.var, type = 'b', lwd = 2, -# xlab = 'number of clusters', -# ylab = 'within-cluster sum-of-squares', -# main = 'k-means clustering of NCI60 data') @@ -195,6 +254,7 @@ heatmap(pr.out$x[,1:40], Rowv = as.dendrogram(hc.corr), Colv = NA, # CH10Ex11 ---- # load in the data using read.csv(). You will need to select header=F. +# set the right path to load the data! 
 data <- read.csv("lab/data/Ch12Ex13.csv", header=FALSE)
 data <- t(data) # want each row to represent a sample ... should have n=40 samples/rows
@@ -216,6 +276,7 @@ table(predicted, true.groups )  # very well!!
 
 # kmeans ----
 
+
 predicted.kmean <- kmeans(data, 2, nstart=20)$cluster
 table(predicted.kmean, true.groups )  # also very well!
 
diff --git a/lab/lab_day4_clustering.qmd b/lab/lab_day4_clustering.qmd
index f94d130..ec09e09 100644
--- a/lab/lab_day4_clustering.qmd
+++ b/lab/lab_day4_clustering.qmd
@@ -10,10 +10,190 @@ Download datasets [here](https://github.com/ocbe-uio/course_med3007/tree/main/la
 
 R script: [Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab3_clustering.R)
 
-Presentation: [Slides]()
+Presentation: [Slides](presentation/Lab_clustering.pdf)
 
+## Exercise 1: Food
 
-## Exercise 1: NCI60
+We use the same `Food.txt` data as in the PCA lab to illustrate two concepts: hierarchical clustering and heatmaps.
+
+This is not a genomics dataset, but it is easy to interpret, so we use it for teaching purposes.
+
+Let us load the dataset.
+
+```{r}
+#| label: hc-food-loaddata
+#| warning: false
+#| echo: true
+
+food <- read.table('data/Food.txt', header=T)
+# we change the name from pulses to a more common name, legume
+colnames(food)[7] <- 'Legume'
+head(food) # print first 6 lines
+```
+
+We **scale** the data (sometimes also called standardizing or normalizing) so that each column (feature, variable) has mean 0 and variance 1. We call the scaled data `food_s`.
+
+```{r}
+#| label: hc-food-scale
+#| warning: false
+#| echo: true
+
+food_s <- scale(food)
+head(food_s) # print first 6 lines
+```
+
+
+
+### Distances
+
+To do hierarchical clustering, the most convenient command is `hclust()`. As input you need the pairwise **distances** between the subjects (patients, or countries in this example). We compute them on the scaled data.
+
+The command to compute pairwise distances is `dist()`. By default it computes the Euclidean distance (*details optional*). Euclidean distance is probably the most commonly used metric, but there are others. See `?dist()` for more options.
+
+We can present the pairwise distances in matrix format. You can see that this matrix is symmetric, with 0 on the diagonal. This should be intuitive: the distance between A and B is the same as between B and A, and the distance between A and itself is 0.
+
+
+```{r}
+#| label: hc-food-dist
+#| warning: false
+#| echo: true
+
+# compute distance
+food_dist <- dist(food_s)
+# round(food_dist, digits = 2) # try this yourself to see what it does
+# alternatively, look at this as a matrix
+food_dist_matrix <- as.matrix(food_dist)
+round(food_dist_matrix[1:5, 1:5], digits = 2) # first 5 rows and 5 columns
+```
+
+::: callout-note
+## Optional: Euclidean distance
+
+You can check that the Euclidean distance between Albania and Austria is indeed 5.95. This distance is the square root of the sum of squared differences between two subjects across all their measurements.
+
+:::
+
+```{r}
+#| label: hc-food-dist2
+#| warning: false
+#| echo: true
+
+round(food_s[1:2,], digits = 2) # we only keep the first 2 digits
+# take the data for the two countries
+albania <- round(food_s[1,], digits = 2)
+austria <- round(food_s[2,], digits = 2)
+# compute the difference for each column
+d <- albania - austria
+d
+# euclidean distance: square each element, sum them up, and take the square root
+sqrt(sum(d^2))
+```
+
+
+### Hierarchical clustering
+
+Now that we have computed the distances `food_dist`, we plug them into the clustering algorithm, `hclust()`.
+
+We try the complete linkage method by specifying `method = 'complete'`. The result is saved as `hc.complete`. You can visualize it, adding the country names as labels to make it easier to read.
+
+```{r}
+#| label: hc-food-complete
+#| warning: false
+#| echo: true
+
+hc.complete <- hclust(food_dist, method="complete")
+plot(hc.complete, labels=rownames(food), main="Complete Linkage", xlab="", sub="")
+```
+
+
+### Linkage, dissimilarity, scaling
+
+Hierarchical clustering is a class of methods, and there are a variety of options to set.
+
+* Linkage (by setting `method` inside `hclust()`): complete, single, average
+* Dissimilarity: Euclidean, correlation, ...
+* Scaling: scaled data (mean 0, variance 1) or unscaled, original data
+
+There is no definitive guide on which combination works best, so try them out and see which makes the most sense. Again, in unsupervised learning the data have no outcome labels, so the interpretation is left to the domain experts.
+
+
+```{r}
+#| label: hc-food-linkage
+#| warning: false
+#| echo: true
+
+# single linkage
+hc.single <- hclust(food_dist, method="single")
+plot(hc.single, labels=rownames(food), main="Single Linkage", xlab="", sub="")
+
+# average linkage
+hc.average <- hclust(food_dist, method="average")
+plot(hc.average, labels=rownames(food), main="Average Linkage", xlab="", sub="")
+```
+
+
+
+```{r}
+#| label: hc-food-unscaled
+#| warning: false
+#| echo: true
+
+# unscaled data, complete linkage
+hc.unscaled <- hclust(dist(food), method="complete")
+plot(hc.unscaled, labels=rownames(food), main="Complete linkage with unscaled features", xlab="", sub="")
+```
+
+
+```{r}
+#| label: hc-food-cor
+#| warning: false
+#| echo: true
+
+# correlation as dissimilarity, rather than Euclidean distance
+dd <- as.dist(1-cor(t(food_s))) # compute the metric
+hc.corr <- hclust(dd, method="complete") # cluster
+plot(hc.corr, labels=rownames(food), main="Complete linkage with correlation-based distance", xlab="", sub="")
+```
+
+
+### Heatmap
+
+A heatmap is a visualization tool that plots similar values in similar colors, so that you can visually identify whether there is any pattern. It can also be combined with hierarchical clustering - this is in fact the default behaviour: dendrograms are displayed for both rows and columns.
+
+```{r}
+#| label: hc-food-heatmap1
+#| warning: false
+#| echo: true
+
+# make heatmap on the scaled data
+heatmap(food_s)
+```
+
+To preserve the original ordering of the columns and rows, you can specify `Rowv = NA, Colv = NA`.
+
+```{r}
+#| label: hc-food-heatmap2
+#| warning: false
+#| echo: true
+
+# no clustering for row or col, this preserves the original ordering
+heatmap(food_s, Rowv = NA, Colv = NA)
+```
+
+You can also cluster only the rows (or only the columns).
+ +```{r} +#| label: hc-food-heatmap3 +#| warning: false +#| echo: true + +# only clustering for row +heatmap(food_s, Colv = NA) +``` + + + +## Exercise 2: NCI60 We look at the NCI60 data again. First load the dataset. @@ -190,8 +370,7 @@ Set a different random seed, say 3 (as long as it's different from the one you u # different starting centroids improve the clustering: set.seed(3) km.out <- kmeans(sd.data, centers = 4, nstart=1) -km.out$cluster -km.out$tot.withinss +km.out$cluster # cluster label ``` ### Compare with hierarchical clustering @@ -295,7 +474,7 @@ heatmap(pr.out$x[,1:40], Rowv = as.dendrogram(hc.corr), Colv = NA, -## Exercise 2: Gene expression data +## Exercise 3: Gene expression data (CH12Ex13 from statistical learning) diff --git a/lab/overview.qmd b/lab/overview.qmd index 56bab37..ef7f7f5 100644 --- a/lab/overview.qmd +++ b/lab/overview.qmd @@ -37,7 +37,7 @@ day 4 (clustering) | Day 2 | [Day 2: Multiple testing](lab_day2_testing.qmd) | [Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab1.R), [Code (solution)](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab1_exercise_solution.R) | | | Day 3 | [Day 3: Principal Component Analysis](lab_day3_pca.qmd) |[Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab2_pca.R) | [Slides](presentation/Lab_pca.pdf) | -| Day 4 | [Day 4: Clustering](lab_day4_clustering.qmd) |[Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab3_clustering.R) | | +| Day 4 | [Day 4: Clustering](lab_day4_clustering.qmd) |[Code](https://github.com/ocbe-uio/course_med3007/blob/main/lab/code/MED3007_Lab3_clustering.R) | [Slides](presentation/Lab_clustering.pdf)| diff --git a/lab/presentation/Lab_clustering.pdf b/lab/presentation/Lab_clustering.pdf new file mode 100644 index 0000000..ea019d2 Binary files /dev/null and b/lab/presentation/Lab_clustering.pdf differ