Search code examples
rdplyrapply

how to do the cluster wise summary statistics dynamically


Suppose i have this data with cluster mapped to each row as below:

head(movies.imp)
##   num_voted_users num_user_for_reviews num_critic_for_reviews
## 1          886204                 3054                    723
## 2          471220                 1238                    302
## 3          275868                  994                    602
## 4         1144337                 2701                    813
## 5               8                  127                     37
## 6          212204                  738                    462
##   movie_facebook_likes cast_total_facebook_likes cluster
## 1                33000                      4834       1
## 2                    0                     48350       1
## 3                85000                     11700       1
## 4               164000                    106759       1
## 5                    0                       143       2
## 6                24000                      1873       1

To get mean cluster wise i do:

a<-as.data.frame(aggregate( .~ cluster, FUN=mean, data=movies.imp))
a
##   cluster num_voted_users num_user_for_reviews num_critic_for_reviews
## 1       1       316561.46            831.60526              326.61773
## 2       2        44934.26            180.05922              109.69265
## 3       3        29020.10             80.20408               28.57143
##   movie_facebook_likes cast_total_facebook_likes
## 1            33647.263                 28282.450
## 2             3119.099                  6641.746
## 3             6843.327                  2426.755

Then i convert to long:

library(tidyr)
long_df.a <- gather(data=a, value = mean, key=variablenames,  num_voted_users,num_user_for_reviews,num_critic_for_reviews,movie_facebook_likes,cast_total_facebook_likes)
long_df.a
long_df.a[,1]<-as.factor(long_df.a[,1]) # converting into a factor
long_df.a[,2]<-as.factor(long_df.a[,2])# converting into a factor

Then i do the same process to get median,min,max & std clusterwise

creating one dataframe of all the descriptives stats calc above

  dflong<-cbind(long_df.a,long_df.b[,3],long_df.c[,3],long_df.d[,3],long_df.e[,3])

 dflong<-dflong%>% set_names(c("cluster","variablenames","mean","median","min","max","sd")) # renaming the columns
 head(dflong)  

 ##   cluster        variablenames         mean   median      min     max
 ## 1       1         num_voted_users   316561.45706 263332.5 246 1689764
 ## 2       2      num_voted_users      44934.26451  25256.5   5  469561
 ## 3       3      num_voted_users      29020.10204  9277.0    15  213483
 ## 4       1     num_user_for_reviews  831.60526    642.0     1    5060
 ## 5       2 num_user_for_reviews    180.05922      129.0     1    1690
 ## 6       3 num_user_for_reviews     80.20408      42.0      1     394
 ##            sd
 ## 1 231350.9509
 ## 2  53613.7994
 ## 3  48491.2638
## 4    659.0410
## 5    181.3630
## 6    105.1669   

subsetting the data clusterwise

 dflong.1<-dflong %>% filter(cluster==1)
 dflong.2<-dflong %>% filter(cluster==2)
 dflong.3<-dflong %>% filter(cluster==3)

 dflong.combined<-rbind(dflong.1,dflong.2,dflong.3)
 head(dflong.combined)  # final required output

##    cluster             variablenames         mean   median   min     max
## 1        1           num_voted_users   316561.45706 263332.5 246 1689764
## 2        1      num_user_for_reviews    831.60526    642.0    1    5060
## 3        1      num_critic_for_reviews    326.61773    307.5  2     813
## 4        1      movie_facebook_likes  33647.26316  23000.0    0  349000
## 5        1 cast_total_facebook_likes  28282.45014  21095.0    44  656730
## 6        2           num_voted_users  44934.26451  25256.5     5  469561

##              sd
## 1  231350.95086
## 2     659.04103
## 3     142.47953
## 4   37698.06583
## 5   37395.59205
## 6   53613.79942

So i am doing things in a non optimal way to get clusterwise summary stats....need help on how to sort of dynamically use loops or apply functions to get the final output in lesser lines of codes.....


Solution

  • I use the mtcars dataset as an example. Suppose the cyl variable is our equivalent of cluster. You can get all your summary statistics in one line of code:

    d <- mtcars
    s <- d %>% group_by(cyl) %>% 
      summarise_all(c("mean", "median", "min", "max", "sd"))
    #     cyl mpg_mean disp_mean   hp_mean drat_mean  wt_mean qsec_mean   vs_mean
    #   <dbl>    <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
    # 1     4 26.66364  105.1364  82.63636  4.070909 2.285727  19.13727 0.9090909
    # 2     6 19.74286  183.3143 122.28571  3.585714 3.117143  17.97714 0.5714286
    # 3     8 15.10000  353.1000 209.21429  3.229286 3.999214  16.77214 0.0000000
    # # ... with 43 more variables: ...
    

    It just remains for us to reshape the dataframe to get it into our desired form:

    s <- gather(s, key, value, -cyl)
    s <- separate(s, key, c("variable", "stat"))
    d.combined <- spread(s, key = stat, value = value)
    # # A tibble: 30 × 7
    #      cyl variable    max        mean median    min         sd
    # *  <dbl>    <chr>  <dbl>       <dbl>  <dbl>  <dbl>      <dbl>
    # 1      4       am   1.00   0.7272727   1.00  0.000  0.4670994
    # 2      4     carb   2.00   1.5454545   2.00  1.000  0.5222330
    # 3      4     disp 146.70 105.1363636 108.00 71.100 26.8715937
    # 4      4     drat   4.93   4.0709091   4.08  3.690  0.3654711
    # 5      4     gear   5.00   4.0909091   4.00  3.000  0.5393599
    # 6      4       hp 113.00  82.6363636  91.00 52.000 20.9345300
    # 7      4      mpg  33.90  26.6636364  26.00 21.400  4.5098277
    # 8      4     qsec  22.90  19.1372727  18.90 16.700  1.6824452
    # 9      4       vs   1.00   0.9090909   1.00  0.000  0.3015113
    # 10     4       wt   3.19   2.2857273   2.20  1.513  0.5695637
    # # ... with 20 more rows