On the book website, www.StatLearning.com, there is a gene expression data set (Ch10Ex11.csv) that consists of 40 tissue samples with measurements on 1,000 genes. The first 20 samples are from healthy patients, while the second 20 are from a diseased group.
#library(ISLR)
#library(tidyverse)
# Load the expression data. The file has no header row, so read.csv()
# auto-names the columns V1, V2, ...
gene <- read.csv(file = "Ch10Ex11.csv", header = FALSE)

# Report the dimensions (rows, columns) of the loaded data.
dim(gene)
## [1] 1000   40

# Per-column summary statistics.
summary(gene)
##        V1                  V2                  V3                  V4          
##  Min.   :-3.056328   Min.   :-3.240490   Min.   :-3.527188   Min.   :-3.06553  
##  1st Qu.:-0.684539   1st Qu.:-0.703363   1st Qu.:-0.724498   1st Qu.:-0.70183  
##  Median : 0.032338   Median :-0.006061   Median : 0.000255   Median : 0.01735  
##  Mean   : 0.006397   Mean   :-0.020648   Mean   :-0.012895   Mean   :-0.00940  
##  3rd Qu.: 0.676673   3rd Qu.: 0.660186   3rd Qu.: 0.627725   3rd Qu.: 0.66906  
##  Max.   : 3.519299   Max.   : 3.084000   Max.   : 3.458551   Max.   : 3.22213  
##        V5                 V6                 V7                 V8          
##  Min.   :-2.99742   Min.   :-2.85389   Min.   :-2.74516   Min.   :-3.62753  
##  1st Qu.:-0.73211   1st Qu.:-0.68827   1st Qu.:-0.64695   1st Qu.:-0.66294  
##  Median :-0.04396   Median :-0.01104   Median : 0.03087   Median :-0.05012  
##  Mean   :-0.04554   Mean   :-0.04223   Mean   : 0.03408   Mean   :-0.01432  
##  3rd Qu.: 0.66127   3rd Qu.: 0.60505   3rd Qu.: 0.69415   3rd Qu.: 0.63980  
##  Max.   : 2.89747   Max.   : 3.06956   Max.   : 3.89075   Max.   : 3.35391  
##        V9                V10                V11                V12          
##  Min.   :-2.85224   Min.   :-2.57151   Min.   :-3.50807   Min.   :-2.46314  
##  1st Qu.:-0.66516   1st Qu.:-0.69711   1st Qu.:-0.66221   1st Qu.:-0.76172  
##  Median : 0.01229   Median :-0.02719   Median :-0.05554   Median :-0.06017  
##  Mean   : 0.02089   Mean   : 0.01119   Mean   :-0.03980   Mean   :-0.04690  
##  3rd Qu.: 0.70092   3rd Qu.: 0.69082   3rd Qu.: 0.63184   3rd Qu.: 0.62667  
##  Max.   : 2.76971   Max.   : 3.54802   Max.   : 3.24751   Max.   : 3.47430  
##       V13                V14                 V15                 V16           
##  Min.   :-3.31227   Min.   :-3.668497   Min.   :-3.143676   Min.   :-2.927187  
##  1st Qu.:-0.64411   1st Qu.:-0.701149   1st Qu.:-0.666224   1st Qu.:-0.692198  
##  Median : 0.01708   Median : 0.001063   Median :-0.002157   Median :-0.006049  
##  Mean   :-0.01055   Mean   :-0.008516   Mean   : 0.005718   Mean   : 0.001087  
##  3rd Qu.: 0.66782   3rd Qu.: 0.650463   3rd Qu.: 0.663221   3rd Qu.: 0.703088  
##  Max.   : 2.83636   Max.   : 3.507131   Max.   : 3.554124   Max.   : 4.088422  
##       V17                 V18                V19                 V20          
##  Min.   :-2.735982   Min.   :-2.89979   Min.   :-2.986620   Min.   :-3.64333  
##  1st Qu.:-0.802367   1st Qu.:-0.74117   1st Qu.:-0.689626   1st Qu.:-0.63972  
##  Median : 0.000495   Median :-0.04947   Median :-0.007034   Median :-0.01565  
##  Mean   : 0.003757   Mean   :-0.03978   Mean   :-0.040910   Mean   :-0.01158  
##  3rd Qu.: 0.701429   3rd Qu.: 0.62694   3rd Qu.: 0.669064   3rd Qu.: 0.62061  
##  Max.   : 3.183715   Max.   : 3.01382   Max.   : 3.517389   Max.   : 3.43399  
##       V21               V22                V23                V24         
##  Min.   :-2.9568   Min.   :-3.54902   Min.   :-3.10594   Min.   :-2.9732  
##  1st Qu.:-0.5812   1st Qu.:-0.63412   1st Qu.:-0.52197   1st Qu.:-0.5747  
##  Median : 0.1177   Median : 0.09129   Median : 0.09238   Median : 0.2228  
##  Mean   : 0.2203   Mean   : 0.18782   Mean   : 0.19267   Mean   : 0.2434  
##  3rd Qu.: 0.9504   3rd Qu.: 0.92581   3rd Qu.: 0.88226   3rd Qu.: 0.9720  
##  Max.   : 4.7516   Max.   : 4.93363   Max.   : 4.57264   Max.   : 4.2766  
##       V25               V26               V27               V28         
##  Min.   :-3.2182   Min.   :-2.9139   Min.   :-3.4260   Min.   :-2.9413  
##  1st Qu.:-0.6559   1st Qu.:-0.5064   1st Qu.:-0.5618   1st Qu.:-0.5467  
##  Median : 0.1708   Median : 0.1931   Median : 0.1609   Median : 0.1319  
##  Mean   : 0.2121   Mean   : 0.2674   Mean   : 0.2572   Mean   : 0.2316  
##  3rd Qu.: 1.0251   3rd Qu.: 0.9757   3rd Qu.: 0.9855   3rd Qu.: 0.9293  
##  Max.   : 4.9629   Max.   : 5.0694   Max.   : 4.5857   Max.   : 5.1242  
##       V29               V30               V31               V32         
##  Min.   :-3.1168   Min.   :-3.0350   Min.   :-3.4798   Min.   :-2.8842  
##  1st Qu.:-0.5876   1st Qu.:-0.6034   1st Qu.:-0.6183   1st Qu.:-0.5383  
##  Median : 0.1863   Median : 0.1731   Median : 0.1176   Median : 0.1881  
##  Mean   : 0.2289   Mean   : 0.2074   Mean   : 0.1894   Mean   : 0.2582  
##  3rd Qu.: 1.0060   3rd Qu.: 0.9279   3rd Qu.: 0.8815   3rd Qu.: 0.9578  
##  Max.   : 4.4779   Max.   : 4.0673   Max.   : 4.7855   Max.   : 5.6171  
##       V33               V34               V35               V36         
##  Min.   :-2.8979   Min.   :-3.3490   Min.   :-3.8558   Min.   :-2.9215  
##  1st Qu.:-0.6286   1st Qu.:-0.6064   1st Qu.:-0.6043   1st Qu.:-0.5791  
##  Median : 0.1394   Median : 0.1514   Median : 0.1079   Median : 0.1065  
##  Mean   : 0.2210   Mean   : 0.2101   Mean   : 0.1733   Mean   : 0.2284  
##  3rd Qu.: 0.9434   3rd Qu.: 0.9075   3rd Qu.: 0.8598   3rd Qu.: 0.8921  
##  Max.   : 5.3758   Max.   : 4.5026   Max.   : 4.2191   Max.   : 4.4853  
##       V37               V38               V39               V40         
##  Min.   :-2.8670   Min.   :-3.2340   Min.   :-2.7112   Min.   :-3.5816  
##  1st Qu.:-0.5063   1st Qu.:-0.5888   1st Qu.:-0.5610   1st Qu.:-0.5031  
##  Median : 0.2213   Median : 0.1619   Median : 0.1211   Median : 0.2064  
##  Mean   : 0.2823   Mean   : 0.2257   Mean   : 0.2215   Mean   : 0.2817  
##  3rd Qu.: 1.0003   3rd Qu.: 0.9847   3rd Qu.: 0.9576   3rd Qu.: 0.9878  
##  Max.   : 4.7904   Max.   : 4.6188   Max.   : 4.8149   Max.   : 5.2751
# Correlation-based dissimilarity between the columns of `gene`:
# 1 - correlation, converted to a "dist" object so hclust() can use it.
gene.corr.dist <- as.dist(1 - cor(gene))

# Fit hierarchical clustering with hclust()'s default linkage (no `method`
# is given) and draw the resulting dendrogram.
h.clust <- hclust(d = gene.corr.dist)
plot(h.clust)
COMMENTS: Based on the dendrogram, complete linkage appears to separate the samples into two groups.
# Refit the hierarchical clustering on the same dissimilarities using
# average linkage, then redraw the dendrogram.
h.clust <- hclust(d = gene.corr.dist, method = "average")
plot(h.clust)
COMMENTS: Average linkage does not separate the samples into two groups.
# Refit the hierarchical clustering on the same dissimilarities using
# single linkage, then redraw the dendrogram.
h.clust <- hclust(d = gene.corr.dist, method = "single")
plot(h.clust)
COMMENTS: Single linkage appears to separate the samples into two groups, but the resulting dendrogram looks odd.
# Refit using centroid linkage, draw the dendrogram, then list the
# components of the fitted object.
# NOTE(review): the hclust() documentation describes "centroid" as meant for
# squared Euclidean distances and warns that it can produce inversions in
# the dendrogram. The input here is a correlation-based dissimilarity, so
# confirm this linkage is appropriate before interpreting the tree.
h.clust <- hclust(d = gene.corr.dist, method = "centroid")
plot(h.clust)
summary(h.clust)
##             Length Class  Mode     
## merge       78     -none- numeric  
## height      39     -none- numeric  
## order       40     -none- numeric  
## labels      40     -none- character
## method       1     -none- character
## call         3     -none- call     
## dist.method  0     -none- NULL
# Print the fitted clustering object (call, linkage method, object count).
print(h.clust)
## 
## Call:
## hclust(d = gene.corr.dist, method = "centroid")
## 
## Cluster method   : centroid 
## Number of objects: 40