Search code examples
r for-loop stat

R crashing when running a for loop


I am an R beginner and have some code in which I am trying to apply the average silhouette method. When I run the for loop, R stops with the error "R for Windows GUI front-end has stopped working". The code that I use is:

# Average-silhouette scan: run k-means for k = 2..k_max, record the mean
# silhouette width of each clustering, then plot the widths against k.

# Read a CSV chosen in an interactive file dialog; the first column supplies
# the row names.
data_no_demographics<-read.csv(file.choose(),row.names=1)
# Drop every row that contains a missing value.
data_no_demographics <- na.omit(data_no_demographics)
# Provides silhouette().
library(cluster)
# Largest number of clusters to try.
k_max<-20
# sil[k] will hold the average silhouette width for k clusters; the loop
# starts at 2, so sil[1] keeps its initial value of 0.
sil<-rep(0,k_max)
for(i in 2:k_max){
  # Cluster the standardized data into i groups, using 50 random starts.
  # NOTE(review): scale() divides each column by its standard deviation, so a
  # constant column (the sample rows show several all-zero columns) would
  # become NaN -- TODO confirm no such columns exist in the full data.
  km.res <- kmeans(scale(data_no_demographics), centers = i, nstart = 50)
  # NOTE(review): silhouette() documents its second argument as a
  # dissimilarity object, but the scaled data matrix itself is passed here.
  # This looks like the cause of the reported failure -- confirm by passing
  # dist(scale(data_no_demographics)) instead.
  ss <- silhouette(km.res$cluster, scale(data_no_demographics))
  # Column 3 of the silhouette result is the per-observation width; store
  # its mean for this number of clusters.
  sil[i] <- mean(ss[, 3])
}
# Plot the average silhouette width against k, then mark the k with the
# largest width using a dashed red vertical line.
plot(1:k_max, sil, type = "b", pch = 20,col="black",frame = FALSE, xlab ="Number of clusters (k)", ylab="Average silhouette width (fit of data within its cluster)", main="Optimal number of clusters")
abline(v = which.max(sil), lty = 2, col="red")

Any help will be appreciated.

# Print an R-parsable text representation of the first four rows so the
# sample can be pasted into another session.
dput(data_no_demographics[1:4,])

Output: structure(list(DRY.FOOD.MEAL.PREPARATION = c(0, 0, 0, 0), BASMATI.RICE = c(0.106274747, 0.086781763, 0.066892377, 0.039564525), JASMINE.RICE = c(0.037947215, 0.101672855, 0.025094901, 0.026354222), ITALIAN.RICE = c(0, 0.006335413, 0, 0), ORG.SIDE.DISHES = c(0, 0, 0, 0), COUSCOUS = c(0.014091056, 0.013493856, 0.009541397, 0.006704727), QUINOA = c(0.013921964, 0.020977683, 0.011593311, 0.006638343), FLAVOURED.RICE = c(0.322163823, 0.225682349, 0.378639581, 0.340580191), INSTANT.RICE = c(0.063184297, 0.044711557, 0.092459218, 0.152615507), PARBOILED.RICE = c(0.065396593, 0.089652796, 0.048548271, 0.032295539), RTH.SPECIALTY = c(0.010371018, 0.011771236, 0.009418283, 0.007833245), RTH.RICE = c(0.307790945, 0.286375991, 0.29621422, 0.336331652), OTHER.RICE = c(0.013020136, 0.019484745, 0.010382682, 0.006737918), REGULAR.DRY.RICE = c(0.045838206, 0.093059756, 0.051215759, 0.044344132), READY.TO.EAT = c(0, 0, 0, 0)), .Names = c("DRY.FOOD.MEAL.PREPARATION", "BASMATI.RICE", "JASMINE.RICE", "ITALIAN.RICE", "ORG.SIDE.DISHES", "COUSCOUS", "QUINOA", "FLAVOURED.RICE", "INSTANT.RICE", "PARBOILED.RICE", "RTH.SPECIALTY", "RTH.RICE", "OTHER.RICE", "REGULAR.DRY.RICE", "READY.TO.EAT"), row.names = c(1000L, 1004L, 1007L, 1008L), class = "data.frame")


Solution

  • I did a test run with the data you provided and got some warnings caused by the way silhouette() is called: its second argument has to be a dissimilarity object (such as the result of dist()), not the data matrix itself. Changing silhouette(km.res$cluster, scale(data_no_demographics)) to silhouette(km.res$cluster, dist(data_no_demographics)) removed the warnings.

    # Reproducible example: pick the number of k-means clusters by the average
    # silhouette width, using the four sample rows posted in the question.
    data_no_demographics <- data.frame(
      DRY.FOOD.MEAL.PREPARATION = c(0, 0, 0, 0),
      BASMATI.RICE = c(0.106274747, 0.086781763, 0.066892377, 0.039564525),
      JASMINE.RICE = c(0.037947215, 0.101672855, 0.025094901, 0.026354222),
      ITALIAN.RICE = c(0, 0.006335413, 0, 0),
      ORG.SIDE.DISHES = c(0, 0, 0, 0),
      COUSCOUS = c(0.014091056, 0.013493856, 0.009541397, 0.006704727),
      QUINOA = c(0.013921964, 0.020977683, 0.011593311, 0.006638343),
      FLAVOURED.RICE = c(0.322163823, 0.225682349, 0.378639581, 0.340580191),
      INSTANT.RICE = c(0.063184297, 0.044711557, 0.092459218, 0.152615507),
      PARBOILED.RICE = c(0.065396593, 0.089652796, 0.048548271, 0.032295539),
      RTH.SPECIALTY = c(0.010371018, 0.011771236, 0.009418283, 0.007833245),
      RTH.RICE = c(0.307790945, 0.286375991, 0.29621422, 0.336331652),
      OTHER.RICE = c(0.013020136, 0.019484745, 0.010382682, 0.006737918),
      REGULAR.DRY.RICE = c(0.045838206, 0.093059756, 0.051215759, 0.044344132),
      READY.TO.EAT = c(0, 0, 0, 0),
      row.names = c(1000L, 1004L, 1007L, 1008L)
    )

    # Drop constant columns (here: the three that are just zeros). scale()
    # divides each column by its standard deviation, so a constant column would
    # turn into NaN and break kmeans(). Detecting them by content instead of by
    # position keeps this step valid for the full data set as well.
    is_informative <- vapply(
      data_no_demographics,
      function(column) length(unique(column)) > 1L,
      logical(1)
    )
    data_no_demographics <- data_no_demographics[, is_informative, drop = FALSE]

    # Provides silhouette().
    library(cluster)

    # Standardize once, outside the loop, and derive the distances from the
    # same standardized values that kmeans() clusters, so that the silhouette
    # widths are measured in the space the clusters were built in.
    scaled_data <- scale(data_no_demographics)
    distance_matrix <- dist(scaled_data)

    # Largest number of clusters to try. The scan needs at least two clusters
    # and fewer clusters than observations; otherwise there is no silhouette
    # matrix to average.
    k_max <- 3
    stopifnot(k_max >= 2, k_max < nrow(scaled_data))

    # kmeans() starts from randomly chosen centers; fix the seed so that the
    # plot is reproducible.
    set.seed(42)

    # sil[k] holds the average silhouette width for k clusters; the scan starts
    # at k = 2, so sil[1] keeps its initial value of 0.
    sil <- rep(0, k_max)
    for (k in 2:k_max) {
      km.res <- kmeans(scaled_data, centers = k, nstart = 1)
      ss <- silhouette(km.res$cluster, distance_matrix)
      sil[k] <- mean(ss[, "sil_width"])
    }

    # Average silhouette width per k; the dashed red line marks the best k.
    plot(
      seq_len(k_max), sil,
      type = "b", pch = 20, col = "black", frame.plot = FALSE,
      xlab = "Number of clusters (k)",
      ylab = "Average silhouette width (fit of data within its cluster)",
      main = "Optimal number of clusters"
    )
    abline(v = which.max(sil), lty = 2, col = "red")
    

    enter image description here

    EDIT

    I just moved the computation of the distance matrix (dist) outside of the loop. This should speed up computations if more rows are used.