Search code examples
rtime-seriesdplyrhierarchical-clustering

Distance & cluster with dynamic time warping


I am using dtw to calculate distances between several series and getting strange results. Notice that in the sample data below the first 9 customers are identical sets (A==B==C, D==E==F, and G==H==I). The remaining rows are only for noise to allow me to make 8 clusters.

I expect that the first sets would be clustered with their identical partners. This happens when I calculate distance on the original data, but when I scale the data before distance/clustering I get different results.

The distances between identical rows in original data is 0.0 (as expected), but with scaled data the distances is not 0.0 (not even close). Any ideas why they are not the same?

library(TSdist)
library(dplyr)
library(tidyr)



   mydata = as_data_frame(read.table(textConnection("
cust   P1   P2   P3   P4  P5   P6   P7   P8   P9  P10
1     A  1.1  1.0  1.0  1.0 1.0  1.0  1.0  1.0  1.0  1.0
2     B  1.1  1.0  1.0  1.0 1.0  1.0  1.0  1.0  1.0  1.0
3     C  1.1  1.0  1.0  1.0 1.0  1.0  1.0  1.0  1.0  1.0
4     D  0.0  1.0  2.0  1.0 0.0  1.0  2.0  1.0  0.0  1.0
5     E  0.0  1.0  2.0  1.0 0.0  1.0  2.0  1.0  0.0  1.0
6     F  0.0  1.0  2.0  1.0 0.0  1.0  2.0  1.0  0.0  1.0
7     G  2.0  1.5  1.0  0.5 0.0  0.5  1.0  1.5  2.0  1.5
8     H  2.0  1.5  1.0  0.5 0.0  0.5  1.0  1.5  2.0  1.5
9     I  2.0  1.5  1.0  0.5 0.0  0.5  1.0  1.5  2.0  1.5
10   D2  1.0  2.0  1.0  0.0 1.0  2.0  1.0  0.0  1.0  2.0
11   E2  5.0  6.0  5.0  4.0 5.0  6.0  5.0  4.0  5.0  6.0
12   F2  9.0 10.0  9.0  8.0 9.0 10.0  9.0  8.0  9.0 10.0
13   G2  1.5  1.0  0.5  0.0 0.5  1.0  1.5  2.0  1.5  1.0
14   H2  5.5  5.0  4.5  4.0 4.5  5.0  5.5  6.0  5.5  5.0
15   I2  9.5  9.0  8.5  8.0 8.5  9.0  9.5 10.0  9.5  9.0
16   A3  1.0  1.0  0.0  2.0 1.0  1.0  1.0  1.0  1.0  1.0
17   B3  5.0  5.0  5.0  5.0 5.0  3.0  8.0  5.0  5.0  5.0
18   C3  9.0  9.0  9.0  9.0 9.0  5.4 14.4  9.0  9.0  9.0
19   D3  0.0  1.0  2.0  1.0 0.0  1.0  1.0  2.0  0.0  1.0
20   E3  4.0  5.0  5.0  6.0 4.0  5.0  6.0  5.0  4.0  5.0
21   F3  8.0  9.0 10.0  9.0 9.0  9.0  9.0  9.0  8.0  9.0
22   G3  2.0  1.5  1.0  0.5 0.0  0.5  1.0  2.0  1.5  1.5
23   H3  6.0  5.5  5.0  4.5 4.0  5.0  4.5  5.5  6.0  5.5
24   I3 10.0  9.5  9.0  9.0 8.0  8.5  9.0  9.5 10.0  9.5
25   D4  0.0  3.0  6.0  3.0 0.0  3.0  6.0  3.0  0.0  5.0
26   E4  3.0  6.0  9.0  6.0 3.0  6.0  9.0  6.0  3.0  6.0
27   F4  4.0  6.0 10.0  7.0 5.0  6.0 11.0  8.0  5.0  7.0
28   D5  5.0  0.0  3.0  6.0 3.0  0.0  3.0  6.0  3.0  0.0
29   D6  9.0  6.0  3.0  6.0 9.0  6.0  3.0  6.0  9.0  6.0
30   D7  9.0 11.0  5.0  4.0 6.0 10.0  7.0  5.0  6.0 11.0
31   Dw  0.0  0.8  1.4  2.0 1.0  0.0  2.0  0.0  1.0  2.0
32   Ew  4.0  4.8  5.4  6.0 5.0  4.0  6.0  4.0  5.0  6.0
33   Fw  8.0  8.8  9.4 10.0 9.0  8.0 10.0  8.0  9.0 10.0
34   Gw  2.0  1.5  1.0  0.5 0.0  1.0  2.0  1.5  1.3  1.1
35   Hw  6.0  5.5  5.0  4.5 4.0  5.0  6.0  5.5  5.3  5.1
36   Iw 10.0  9.5  9.0  8.5 8.0  9.0 10.0  9.5  9.3  9.1"),
                           header = TRUE, stringsAsFactors = FALSE))

k=8
# create a scale version of mydata (raw data - mean) / std dev
      mydata_long = mydata %>%
            mutate (mean = apply(mydata[,2:ncol(mydata)],1,mean,na.rm = T)) %>%
            mutate (sd = apply(mydata[,2:(ncol(mydata))],1,sd,na.rm = T))%>%
            gather (period,value,-cust,-mean,-sd) %>%
            mutate (sc = (value-mean)/sd)
      mydata_sc = mydata_long[,-c(2,3,5)] %>%
        spread(period,sc)
  # dtw
        dtw_dist = TSDatabaseDistances(mydata[2:ncol(mydata)], distance = "dtw",lag.max= 2) #distance
        dtw_clus = hclust(dtw_dist, method="ward.D2") # Cluster 
        dtw_res = data.frame(cutree(dtw_clus, k)) # cut dendrogram into 9 clusters
  # dtw (w scaled data)
        dtw_sc_dist = TSDatabaseDistances(mydata_sc[2:ncol(mydata_sc)], distance = "dtw",lag.max= 2) #distance
        dtw_sc_clus = hclust(dtw_sc_dist, method="ward.D2") # Cluster      
        dtw_sc_res = data.frame(cutree(dtw_sc_clus, k)) # cut dendrogram into 9 clusters

results = cbind (dtw_res,dtw_sc_res)
  names(results) = c("dtw", "dtw_scaled")

  print(results)

   dtw dtw_scaled
1    1          1
2    1          2
3    1          1
4    1          2
5    1          1
6    1          2
7    1          3
8    1          4
9    1          3
10   1          3
11   2          3
12   3          4
13   1          5
14   2          6
15   3          3
16   1          4
17   2          3
18   4          3
19   1          6
20   2          3
21   3          4
22   1          3
23   2          3
24   3          6
25   5          7
26   6          8
27   7          7
28   5          7
29   6          7
30   8          8
31   1          7
32   2          7
33   3          7
34   1          8
35   2          7
36   3          7

Solution

  • A couple issues

    1. You are scaling rowwise, not columnwise (take a look at the intermediate results of your dplyr chain -- do they make sense?)

    2. The data manipulations you used to produce the scaled data changed the rows ordering of your data frame to alphabetical:

      > mydata_sc %>% head
      
           cust          P1          P2          P3          P4          P5         P6         P7          P8          P9         P10
        (chr)       (dbl)       (dbl)       (dbl)       (dbl)       (dbl)      (dbl)      (dbl)       (dbl)       (dbl)       (dbl)
        1     A  2.84604989 -0.31622777 -0.31622777 -0.31622777 -0.31622777 -0.3162278 -0.3162278 -0.31622777 -0.31622777 -0.31622777
        2    A3  0.00000000  0.00000000 -2.12132034  2.12132034  0.00000000  0.0000000  0.0000000  0.00000000  0.00000000  0.00000000
        3     B  2.84604989 -0.31622777 -0.31622777 -0.31622777 -0.31622777 -0.3162278 -0.3162278 -0.31622777 -0.31622777 -0.31622777
      

      vs.

        > mydata %>% head
        Source: local data frame [6 x 11]
      
               cust    P1    P2    P3    P4    P5    P6    P7    P8    P9   P10
        (chr) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
        1     A   1.1     1     1     1     1     1     1     1     1     1
        2     B   1.1     1     1     1     1     1     1     1     1     1
      

    (check the cust variable ordering!)

    Here's my approach, and how I think you can avoid similar mistakes in the future:

    1. scale with built-in scale function

      mydata_sc <- mydata %>% select(-cust) %>% scale %>% as.data.frame %>% cbind(cust =mydata$cust,.) %>% as.tbl
      
    2. assert that your scaled dataframe is equivalent to a scaled version of your original dataframe:

      > (scale(mydata_sc %>% select(-cust)) - scale(mydata %>% select(-cust))) 
          %>% colSums %>% sum
      
      [1] 0.000000000000005353357 
      
    3. Create one single function to perform your desired manipulations:

      return_dtw <- function(df) {
        res_2 = TSDatabaseDistances(df[2:ncol(df)],distance="dtw",lag.max=2) %>%
           hclust(.,method="ward.D2")
          return(data.frame(cutree(res_2,k)))
      }
      
    4. execute function:

        > mydata %>% return_dtw %>% cbind(mydata_sc %>% return_dtw)
        cutree.res_2..k. cutree.res_2..k.
        1                 1                1
        2                 1                1
        3                 1                1
        4                 1                1
        5                 1                1
        6                 1                1
        7                 1                1
        8                 1                1
        9                 1                1
        10                1                1
        11                2                2
        12                3                3
        13                1                1
        14                2                2
        15                3                3
        16                1                1
        17                2                2
        18                4                3
        19                1                1
        20                2                2
        21                3                3
        22                1                1
        23                2                2
        24                3                3
        25                5                4
        26                6                5
        27                7                5
        28                5                6
        29                6                7
        30                8                8
        31                1                1
        32                2                2
        33                3                3
        34                1                1
        35                2                2
        36                3                3
      

    Some of the later customers are not grouped similarly, but that's for another question!