I want to calculate the distances (dissimilarities) between the rows of two data frames in order to find the closest cluster for each observation. Because I have factors and numerical variables, I'm using Gower distance. As I want to compare two data frames (and not the dissimilarities between the rows of one matrix), gower.dist would be the function I need. However, when I implemented it, I realized that the results differ from the ones I get when I use daisy's gower, having the rows bound together and looking at the part of the dissimilarity matrix of interest.
I only provide a sample of my data here, but when I calculated the dissimilarities with all the data, gower.dist often returned dissimilarities of zero even though the corresponding rows were not equal to each other. Why? And what could be the reason for the different results? In my opinion, daisy's gower is working correctly and gower.dist isn't (in this example).
library(cluster)
library(StatMatch)
# Calculate distance using daisy's gower on the stacked rows of df and cent
daisyDist <- as.matrix(daisy(rbind(df, cent), metric = "gower"))
# Only look at the part where rows of cent (matrix rows) are compared to rows
# of df (matrix columns). seq_len() is safe for empty inputs, and drop = FALSE
# keeps the result a matrix even when df or cent has a single row.
daisyDist <- daisyDist[nrow(df) + seq_len(nrow(cent)), seq_len(nrow(df)),
                       drop = FALSE]
# Calculate distance using gower.dist
gowerDist <- gower.dist(cent, df)
with the following data
# Sample observations: 5 rows, 11 columns (7 factors, 4 numeric), with NAs in
# searchType, roomMax, priceMin, longitude, latitude and objectType.
df <- structure(list(searchType = structure(c(NA, 1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor"), roomMin = structure(c(4L, 1L, 1L, 6L, 6L), .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60", "70", "Missing[NoInput]"), class = "factor"), roomMax = structure(c(8L, 8L, NA, 10L, 9L), .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40", "50", "60", "70", "80", "90", "Missing[NoInput]"), class = "factor"), priceMin = c(NA, 73, 60, 29, 11), priceMax = c(35, 11, 1, 62, 23), sizeMin = structure(c(5L, 5L, 5L, 6L, 6L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), sizeMax = structure(c(1L, 6L, 5L, 3L, 1L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), longitude = c(6.6306, 7.47195, 8.5562, NA, 8.569), latitude = c(46.52425, 46.9512, 47.37515, NA, 47.3929), specificSearch = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), objectType = structure(c(NA, 2L, 2L, 2L, 2L), .Label = c("1", "2", "3", "Missing[]"), class = "factor")), .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax", "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch", "objectType"), row.names = c(112457L, 94601L, 78273L, 59172L, 117425L), class = "data.frame")
# Cluster centres: 3 rows with the same 11 columns and factor levels as df;
# no NA values.
cent <- structure(list(searchType = structure(c(1L, 1L, 1L), .Label = c("1", "2"), class = "factor"), roomMin = structure(c(1L, 4L, 4L), .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60", "70", "Missing[NoInput]"), class = "factor"), roomMax = structure(c(6L, 9L, 8L), .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40", "50", "60", "70", "80", "90", "Missing[NoInput]"), class = "factor"), priceMin = c(60, 33, 73), priceMax = c(103, 46, 23), sizeMin = structure(c(1L, 5L, 5L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), sizeMax = structure(c(1L, 2L, 1L), .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"), class = "factor"), longitude = c(8.3015, 7.42765, 7.6104), latitude = c(47.05485, 46.9469, 46.75125), specificSearch = structure(c(1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), objectType = structure(c(2L, 2L, 2L), .Label = c("1", "2", "3", "Missing[]"), class = "factor")), .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax", "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch", "objectType"), row.names = c(60656L, 66897L, 130650L), class = "data.frame")
Thank you!
EDIT: it seems that the error/difference occurs because there are NAs in the numeric columns and they seem to be treated differently. How can I adapt daisy's treatment of NAs to gower.dist?
It's due to the NA values in the numeric columns of your data frame. Consider the following code to see how the two functions behave completely differently on a numeric column containing NA values (daisy is more robust than gower.dist):
# stack the observations on top of the cluster centres and preview the result
df1 <- rbind(df, cent)
head(df1, n = 6L)
searchType roomMin roomMax priceMin priceMax sizeMin sizeMax longitude latitude specificSearch objectType
112457 <NA> 20 30 NA 35 50 100 6.63060 46.52425 0 <NA>
94601 1 10 30 73 11 50 75 7.47195 46.95120 0 2
78273 1 10 <NA> 60 1 50 50 8.55620 47.37515 0 2
59172 1 30 50 29 62 75 150 NA NA 0 2
117425 1 30 40 11 23 75 100 8.56900 47.39290 0 2
60656 1 10 20 60 103 100 100 8.30150 47.05485 0 2
# restrict the comparison to the single numeric column priceMin (column 4)
class(df1[[4]])
# [1] "numeric"
df2 <- df1[, 4, drop = FALSE]
# Gower dissimilarities computed by daisy on that one column
as.matrix(daisy(df2, metric = "gower"))
112457 94601 78273 59172 117425 60656 66897 130650
112457 0 NA NA NA NA NA NA NA
94601 NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
78273 NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
59172 NA 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
117425 NA 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
60656 NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
66897 NA 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
130650 NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
# gower.dist output on the same single-column data frame df2, called with
# its default arguments (no rngs supplied)
gower.dist(df2)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] NaN NaN NaN NaN NaN NaN NaN NaN
[2,] NaN 0 0 0 0 0 0 0
[3,] NaN 0 0 0 0 0 0 0
[4,] NaN 0 0 0 0 0 0 0
[5,] NaN 0 0 0 0 0 0 0
[6,] NaN 0 0 0 0 0 0 0
[7,] NaN 0 0 0 0 0 0 0
[8,] NaN 0 0 0 0 0 0 0
Fix this by passing the range explicitly through the rngs parameter of the gower.dist function:
# supply the column range explicitly, computed with NAs removed
gower.dist(df2, rngs = diff(range(df2, na.rm = TRUE)))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] NaN NaN NaN NaN NaN NaN NaN NaN
[2,] NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
[3,] NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
[4,] NaN 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
[5,] NaN 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
[6,] NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
[7,] NaN 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
[8,] NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
Hence, one way to make the gower.dist function behave like daisy when NAs are present in numeric variables is the following:
df1 <- rbind(df, cent)
# compute the ranges of the numeric variables correctly (ignoring NAs);
# non-numeric columns keep a placeholder range of 1
cols <- which(vapply(df1, is.numeric, logical(1)))
rngs <- rep(1, ncol(df1))
rngs[cols] <- vapply(
  df1[cols],
  function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
  numeric(1)
)
daisyDist <- as.matrix(daisy(df1, metric = "gower"))
# the NA-safe ranges must actually be passed to gower.dist; without rngs the
# numeric columns containing NAs are scaled incorrectly
gowerDist <- gower.dist(df1, rngs = rngs)
daisyDist
112457 94601 78273 59172 117425 60656 66897 130650
112457 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
94601 0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
78273 0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
59172 0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
117425 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
60656 0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
66897 0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
130650 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000
# print the gower.dist matrix for comparison with daisyDist
gowerDist
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
[2,] 0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
[3,] 0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
[4,] 0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
[5,] 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
[6,] 0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
[7,] 0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
[8,] 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000