Search code examples
Tags: r, machine-learning, distance, data-mining, r-daisy

R - Different results gower.dist and daisy(...,metric="gower")


I want to calculate the distances (dissimilarities) between the rows of two data frames in order to find the closest cluster for each observation. Because I have factors and numerical variables, I'm using Gower distance. As I want to compare two data frames (and not the dissimilarities between the rows of one matrix), gower.dist would be the function I need. However, when I implemented it, I realized that the results differ from the ones I get when I use daisy's gower, having the rows bound together and looking at the part of the dissimilarity matrix of interest.

I only provide a sample of my data here, but when I calculated the dissimilarities with all data, gower.dist often returned dissimilarities of zero although the corresponding rows were not equal to each other. Why? And what could be the reason for the different results? In my opinion, daisy's gower is working correctly and gower.dist isn't (in this example).

library(cluster)
library(StatMatch)

# Calculate distance using daisy's gower: stack the observations (df) on top
# of the centres (cent) and compute all pairwise dissimilarities.
daisyDist <- daisy(rbind(df, cent), metric = "gower")
daisyDist <- as.matrix(daisyDist)
# Keep only the part where the rows of cent (matrix rows) are compared to the
# rows of df (matrix columns). seq_len() stays correct when a data frame has
# zero rows, and drop = FALSE keeps a matrix even for a single row or column.
dfRows <- seq_len(nrow(df))
centRows <- nrow(df) + seq_len(nrow(cent))
daisyDist <- daisyDist[centRows, dfRows, drop = FALSE]

# Calculate distance using gower.dist (rows of cent vs. rows of df)
gowerDist <- gower.dist(cent, df)

with the following data

# Sample observations: 5 rows, 11 variables (factors and numerics). NAs occur
# in searchType, roomMax, priceMin, longitude, latitude and objectType.
df <- structure(list(searchType = structure(c(NA, 1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor"), roomMin = structure(c(4L, 1L, 1L, 6L, 6L), .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60", "70", "Missing[NoInput]"), class = "factor"), roomMax = structure(c(8L, 8L, NA, 10L, 9L), .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40", "50", "60", "70", "80", "90", "Missing[NoInput]"), class = "factor"), priceMin = c(NA, 73, 60, 29, 11), priceMax = c(35, 11, 1, 62, 23), sizeMin = structure(c(5L, 5L, 5L, 6L, 6L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), sizeMax = structure(c(1L, 6L, 5L, 3L, 1L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), longitude = c(6.6306, 7.47195, 8.5562, NA, 8.569), latitude = c(46.52425, 46.9512, 47.37515, NA, 47.3929), specificSearch = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), objectType = structure(c(NA, 2L, 2L, 2L, 2L), .Label = c("1", "2", "3", "Missing[]"), class = "factor")), .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax", "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch", "objectType"), row.names = c(112457L,  94601L, 78273L, 59172L, 117425L), class = "data.frame")
# Cluster centres: 3 rows with the same 11 variables and factor levels as df;
# no NAs.
cent <- structure(list(searchType = structure(c(1L, 1L, 1L), .Label = c("1", "2"), class = "factor"), roomMin = structure(c(1L, 4L, 4L), .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60", "70", "Missing[NoInput]"), class = "factor"), roomMax = structure(c(6L, 9L, 8L), .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40", "50", "60", "70", "80", "90", "Missing[NoInput]"), class = "factor"), priceMin = c(60, 33, 73), priceMax = c(103, 46, 23), sizeMin = structure(c(1L, 5L, 5L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), sizeMax = structure(c(1L, 2L, 1L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), longitude = c(8.3015, 7.42765, 7.6104), latitude = c(47.05485, 46.9469, 46.75125), specificSearch = structure(c(1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), objectType = structure(c(2L, 2L, 2L), .Label = c("1", "2", "3", "Missing[]"), class = "factor")), .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax", "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch", "objectType"), row.names = c(60656L, 66897L, 130650L), class = "data.frame")

Thank you!

EDIT: it seems that the error/difference occurs because there are NAs in the numeric columns and they seem to be treated differently. How can I adapt daisy's treatment of NAs to gower.dist?


Solution

  • It's due to the NA values in the numeric columns of your data frame. Consider the following code to see how the two functions behave completely differently on a numeric column with NA values (daisy is more robust than gower.dist):

    # Stack the observations and the centres into one data frame, then
    # preview its first six rows.
    df1 <- rbind(df, cent)
    head(df1, n = 6L)
           searchType roomMin roomMax priceMin priceMax sizeMin sizeMax longitude latitude specificSearch objectType
    112457       <NA>      20      30       NA       35      50     100   6.63060 46.52425              0       <NA>
    94601           1      10      30       73       11      50      75   7.47195 46.95120              0          2
    78273           1      10    <NA>       60        1      50      50   8.55620 47.37515              0          2
    59172           1      30      50       29       62      75     150        NA       NA              0          2
    117425          1      30      40       11       23      75     100   8.56900 47.39290              0          2
    60656           1      10      20       60      103     100     100   8.30150 47.05485              0          2
    
    # Restrict the comparison to the numeric column priceMin (the 4th column)
    class(df1[["priceMin"]])
    # [1] "numeric"
    # Single-bracket indexing keeps a one-column data frame
    df2 <- df1["priceMin"]
    
    # daisy output: Gower dissimilarities on the single priceMin column,
    # shown as a full square matrix
    as.matrix(
      cluster::daisy(df2, metric = "gower")
    )
            112457     94601     78273      59172    117425     60656      66897    130650
    112457      0        NA        NA         NA        NA        NA         NA        NA
    94601      NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
    78273      NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
    59172      NA 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
    117425     NA 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
    60656      NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
    66897      NA 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
    130650     NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
    
    # gower.dist output for the same one-column data frame, default ranges
    StatMatch::gower.dist(data.x = df2)
         [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
    [1,]  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
    [2,]  NaN    0    0    0    0    0    0    0
    [3,]  NaN    0    0    0    0    0    0    0
    [4,]  NaN    0    0    0    0    0    0    0
    [5,]  NaN    0    0    0    0    0    0    0
    [6,]  NaN    0    0    0    0    0    0    0
    [7,]  NaN    0    0    0    0    0    0    0
    [8,]  NaN    0    0    0    0    0    0    0
    

    Fix this by passing the range explicitly through the rngs parameter of the gower.dist function:

    # Supply the range (max - min, NAs removed) of the column explicitly
    gower.dist(df2, rngs = diff(range(df2, na.rm = TRUE)))
         [,1]      [,2]      [,3]       [,4]      [,5]      [,6]       [,7]      [,8]
    [1,]  NaN       NaN       NaN        NaN       NaN       NaN        NaN       NaN
    [2,]  NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
    [3,]  NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
    [4,]  NaN 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
    [5,]  NaN 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
    [6,]  NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
    [7,]  NaN 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
    [8,]  NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
    

    Hence, one way to make gower.dist behave like daisy when NAs are present in numeric variables is the following:

    # Stack the observations and the cluster centres so that both functions
    # are given exactly the same rows.
    df1 <- rbind(df, cent)
    
    # Compute the range (max - min, NAs removed) of every numeric variable.
    # Non-numeric columns keep a placeholder of 1; gower.dist presumably does
    # not use the range for factors -- confirm against the StatMatch docs.
    # vapply() always returns a typed vector, even with no numeric columns.
    cols <- which(vapply(df1, is.numeric, logical(1)))
    rngs <- rep(1, ncol(df1))
    rngs[cols] <- vapply(
      df1[cols],
      function(x) diff(range(x, na.rm = TRUE)),
      numeric(1)
    )
    
    daisyDist <- as.matrix(daisy(df1, metric = "gower"))
    # Pass the NA-safe ranges explicitly; without the rngs argument the
    # ranges computed above would never be used by gower.dist.
    gowerDist <- gower.dist(df1, rngs = rngs)
    
    daisyDist
              112457     94601     78273     59172    117425     60656     66897    130650
    112457 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
    94601  0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
    78273  0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
    59172  0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
    117425 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
    60656  0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
    66897  0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
    130650 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000
    
    gowerDist
              [,1]      [,2]      [,3]      [,4]      [,5]      [,6]      [,7]      [,8]
    [1,] 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
    [2,] 0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
    [3,] 0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
    [4,] 0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
    [5,] 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
    [6,] 0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
    [7,] 0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
    [8,] 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000