7.3 Clustering

7.3.1 Supervised vs unsupervised

7.3.2 Clustering goal

7.3.3 k-means algorithm

7.3.4 k-means clustering

# Fix the RNG seed so the random starting centres (and hence the fit) are
# reproducible.
set.seed(90210)
# k-means with 3 clusters on the standardised columns of hprice2; nstart=10
# runs 10 random starts and keeps the best one.
kmodel <- kmeans(scale(hprice2),centers=3,nstart=10)
# Cluster centres, expressed in standardised (z-score) units.
kmodel$centers
##        price      crime         nox      rooms        dist     radial
## 1 -0.2792677 -0.3600423 -0.07840445 -0.4240612 -0.01224813 -0.5916003
## 2  1.0115090 -0.3971467 -0.77766399  0.9666947  0.75041393 -0.5764852
## 3 -0.6792515  1.0413321  1.00563169 -0.3899504 -0.82541542  1.6253203
##      proptax    stratio    lowstat
## 1 -0.4577160  0.0607907  0.1269073
## 2 -0.6908616 -0.8015948 -0.9236065
## 3  1.5333981  0.8030303  0.8314782
# Number of observations assigned to each cluster.
table(kmodel$cluster)
## 
##   1   2   3 
## 221 151 134
# Cluster means in the original units (data.table syntax: .SD stands for every
# column, grouped by the cluster label).
hprice2[,lapply(.SD,mean),by=kmodel$cluster] 
##    kmodel    price      crime      nox    rooms     dist    radial
## 1:      2 31826.35  0.1999470 4.648940 6.963245 5.376225  4.529801
## 2:      1 19939.77  0.5186833 5.458959 5.986109 3.769955  4.398190
## 3:      3 16256.38 12.5568358 6.714701 6.010075 2.057313 23.701493
##     proptax  stratio   lowstat
## 1: 29.18013 16.72318  6.016358
## 2: 33.10950 18.59095 13.620045
## 3: 66.66716 20.19851 18.719776
# Scatter of price against nox, coloured by cluster assignment.
ggplot(hprice2,aes(x=nox,y=price,color=as.factor(kmodel$cluster))) + geom_point()

7.3.5 Within group sum of squares

# Total within-cluster sum of squares of the fitted kmodel.
kmodel$tot.withinss
## [1] 2183.669
# Within-group sum of squares profile from the helper kmeans.wss (defined
# outside this section). The printed vector has 10 entries, presumably one per
# number of clusters k = 1..10 -- TODO confirm against the helper.
kmeans.wss(scale(hprice2))
##  [1] 4545.000 2703.341 2183.948 1850.740 1621.996 1421.727 1293.463
##  [8] 1181.626 1099.631 1035.744
# Plot the profile with the helper plot.wss (also defined outside this section).
# NOTE(review): this recomputes the profile instead of reusing the result
# printed above.
plot.wss(kmeans.wss(scale(hprice2)))

eratio <- function(wss) { # USE MINUS 1 FOR PCA
  # Eigenvalue-ratio (ER) and growth-ratio (GR) estimators of the number of
  # clusters, computed from a within-group sum of squares (WSS) profile.
  #
  # Args:
  #   wss: numeric vector of WSS values with at least two entries; presumably
  #        ordered by increasing number of clusters starting at one, as
  #        returned by the kmeans.wss / hclust.wss helpers -- confirm there.
  #
  # Returns:
  #   Integer vector of length two: the position that maximises the ER
  #   statistic, then the position that maximises the GR statistic.
  #
  # Raises:
  #   An error when `wss` has fewer than two values, because no ratio of
  #   successive differences can be formed.
  n <- NROW(wss)
  if (n < 2L) {
    stop("`wss` must contain at least two values", call. = FALSE)
  }
  # Decrease in WSS between successive entries (plays the role of eigenvalues)
  dss <- -diff(wss)
  # Prepend a mock "zero case" so that a ratio exists at position 1
  dss <- c(wss[1] / log(n), dss)
  # Positions 1..(n - 1); seq_len avoids the 1:0 trap of `1:(n - 1)`
  lead <- seq_len(n - 1L)
  # Eigenvalue ratio statistic: each difference over the next one
  erat <- dss[lead] / dss[lead + 1L]
  # Growth rates of WSS
  gss <- log(1 + dss / wss)
  # Growth ratio statistic: each growth rate over the next one
  grat <- gss[lead] / gss[lead + 1L]
  # Position of the maximum for each estimator
  c(which.max(erat), which.max(grat))
}
# Apply both estimators to the k-means WSS profile; the two numbers printed are
# the maximising positions of the eigenvalue-ratio and growth-ratio statistics.
eratio(kmeans.wss(scale(hprice2)))
## [1] 2 2

7.3.6 Eigenvalue ratio based estimators

7.3.7 Hierarchical clustering algorithm

# Model dendrogram: agglomerative clustering on the pairwise distances between
# standardised rows (dist() and hclust() defaults: Euclidean distance,
# complete linkage).
model <- hclust(dist(scale(hprice2)))
# Cut the tree into 2 groups.
# NOTE(review): summary() of the integer labels only reports their quantiles;
# the mean of 1.073 implies that few observations carry label 2. table() would
# report the group sizes directly.
summary(cutree(model,k=2))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.073   1.000   2.000
# Column means in the original units, grouped by the 3-group cut of the tree.
hprice2[,lapply(.SD,mean),by=cutree(model,k=3)]
##    cutree     price     crime      nox    rooms     dist    radial
## 1:      1 23474.439  1.929987 5.448635 6.349190 3.968465  8.409382
## 2:      2 10503.294 20.348294 6.833824 5.392647 1.611471 24.000000
## 3:      3  8066.667 76.810333 6.810000 6.203333 1.550000 24.000000
##     proptax  stratio  lowstat
## 1: 38.79019 18.32196 11.65881
## 2: 66.60000 20.20000 26.41647
## 3: 66.60000 20.20000 20.27000
# WSS profile for the hierarchical solutions via the helper hclust.wss (defined
# outside this section), plotted with plot.wss.
plot.wss(hclust.wss(data.table(scale(hprice2))))

# Eigenvalue-ratio and growth-ratio estimates from the hierarchical WSS profile.
eratio(hclust.wss(data.table(scale(hprice2))))
## [1] 2 2

7.3.8 Algorithmic complexity

7.3.9 Dendrogram

# Draw the dendrogram, then outline the groups obtained by cutting the tree
# into 7 down to 2 clusters, using one border colour per cut.
plot(model)
invisible(Map(
  function(groups, colour) rect.hclust(model, k = groups, border = colour),
  7:2,
  c("red", "purple", "blue", "green", "yellow", "orange")
))

7.3.10 Applying clustering

set.seed(90210)
# Standardise the features once, before any cluster label is attached to
# hprice2. The earlier version re-scaled hprice2 after `kmean` had been added,
# so the k-means label entered the hierarchical clustering as an extra feature.
# Label columns are excluded by name so the chunk can also be re-run safely.
feature_cols <- setdiff(names(hprice2), c("kmean", "hier"))
features <- scale(as.data.frame(hprice2)[, feature_cols, drop = FALSE])
# 2-cluster k-means labels (10 random starts, best fit kept).
hprice2$kmean <- kmeans(features, centers = 2, nstart = 10)$cluster
# 4-group cut of the hierarchical tree built on the same features.
hprice2$hier <- cutree(hclust(dist(features)), k = 4)
# NOTE: the two-way table and the hierarchical-cluster regression output
# recorded further down were produced before this fix; re-run to refresh them.
table(hprice2$hier, hprice2$kmean) # Two-way table to see relationship
##    
##       1   2
##   1 235   0
##   2 111 123
##   3   0  34
##   4   0   3
# k-means clustering models
# Pooled: a single intercept and common slopes for all observations; the
# cluster labels are ignored. Serves as the baseline for the model below.
summary(lm(log(price)~log(nox)+rooms+stratio,data=hprice2))
## 
## Call:
## lm(formula = log(price) ~ log(nox) + rooms + stratio, data = hprice2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02357 -0.13576  0.02406  0.13258  1.40083 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.359458   0.219069  47.288   <2e-16 ***
## log(nox)    -0.645329   0.062580 -10.312   <2e-16 ***
## rooms        0.256727   0.018677  13.746   <2e-16 ***
## stratio     -0.050873   0.005926  -8.585   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2673 on 502 degrees of freedom
## Multiple R-squared:  0.576,  Adjusted R-squared:  0.5734 
## F-statistic: 227.3 on 3 and 502 DF,  p-value: < 2.2e-16
# Within: one intercept per k-means cluster with common slopes. The `-1` drops
# the common intercept so that a separate level is estimated for each cluster.
# NOTE(review): without an intercept, summary.lm measures R-squared around zero
# instead of around the mean of the response, so the R-squared reported for
# this fit is not comparable with the pooled model.
summary(lm(log(price)~as.factor(kmean)+log(nox)+rooms+stratio-1,data=hprice2))
## 
## Call:
## lm(formula = log(price) ~ as.factor(kmean) + log(nox) + rooms + 
##     stratio - 1, data = hprice2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.98976 -0.12245  0.00579  0.11441  1.45933 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## as.factor(kmean)1  9.787719   0.254704  38.428  < 2e-16 ***
## as.factor(kmean)2  9.611210   0.279304  34.411  < 2e-16 ***
## log(nox)          -0.357760   0.091938  -3.891 0.000113 ***
## rooms              0.253544   0.018389  13.788  < 2e-16 ***
## stratio           -0.042169   0.006185  -6.818 2.67e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2629 on 501 degrees of freedom
## Multiple R-squared:  0.9993, Adjusted R-squared:  0.9993 
## F-statistic: 1.448e+05 on 5 and 501 DF,  p-value: < 2.2e-16
# hierarchical clustering models
# Pooled: the same specification as the pooled k-means baseline, repeated here
# as the reference for the hierarchical-cluster model.
summary(lm(log(price)~log(nox)+rooms+stratio,data=hprice2))
## 
## Call:
## lm(formula = log(price) ~ log(nox) + rooms + stratio, data = hprice2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02357 -0.13576  0.02406  0.13258  1.40083 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.359458   0.219069  47.288   <2e-16 ***
## log(nox)    -0.645329   0.062580 -10.312   <2e-16 ***
## rooms        0.256727   0.018677  13.746   <2e-16 ***
## stratio     -0.050873   0.005926  -8.585   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2673 on 502 degrees of freedom
## Multiple R-squared:  0.576,  Adjusted R-squared:  0.5734 
## F-statistic: 227.3 on 3 and 502 DF,  p-value: < 2.2e-16
# Within: cluster-specific intercepts from the hierarchical labels with common
# slopes. The common intercept is kept here, so each as.factor(hier) term is a
# shift relative to cluster 1 (the reference level).
summary(lm(log(price)~as.factor(hier)+log(nox)+rooms+stratio,data=hprice2))
## 
## Call:
## lm(formula = log(price) ~ as.factor(hier) + log(nox) + rooms + 
##     stratio, data = hprice2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.95937 -0.13982  0.01699  0.11803  1.30258 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.10932    0.22968  44.015  < 2e-16 ***
## as.factor(hier)2 -0.06596    0.04174  -1.580    0.115    
## as.factor(hier)3 -0.42904    0.06856  -6.258 8.41e-10 ***
## as.factor(hier)4 -0.87087    0.15113  -5.762 1.45e-08 ***
## log(nox)         -0.42487    0.08843  -4.805 2.05e-06 ***
## rooms             0.21876    0.01895  11.542  < 2e-16 ***
## stratio          -0.04112    0.00634  -6.487 2.11e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2484 on 499 degrees of freedom
## Multiple R-squared:  0.6361, Adjusted R-squared:  0.6317 
## F-statistic: 145.4 on 6 and 499 DF,  p-value: < 2.2e-16