I am working with a longitudinal user event generated data set and I am trying to cluster the user ID's in the data at a Month-Year level using k-means. The idea is that I want to see how users disappear from or move into different cluster archetypes over the different timepoints.
Here is code I have so far, which contains a mock dataframe and the clustering process.
library(Pareto)
library(uuid)
library(ggplot2)
library(data.table)
library(zoo)
#generating the user ID variable
set.seed(1)
n_users <- 1300
n_rows <- 365000
relative_probs <- rPareto(n = n_users, t = 1, alpha = 0.3, truncation = 500)
unique_ids <- UUIDgenerate(n = n_users)
id_sample <- sample(unique_ids, size = n_rows, prob = relative_probs, replace = TRUE)
id_sample
df<-data.frame(id_sample)
#creating the date variable
df$Date<-sample(seq(as.Date("2015-01-01"), as.Date("2017-12-31"), by = "1 day"),
size = n_rows,replace = T)
#creating a numeric value called Duration; this will be a feature in the clustering
df$Duration<-sample(0:3000, nrow(df), replace = T)
df<-df%>%arrange(Date)
#getting Month-Year
df$MonthYear<-as.Date(as.yearmon(df$Date, "%m/%Y"))
MonthYear<-unique(df$MonthYear)
#empty results df
resultsdf<-data.frame()
for (i in MonthYear) {
#getting variables for clustering. I need to cluster based on the number of times the User appears in the data i.e. "Count"
#the second variable is the mean duration for each User ID i.e. "MeanDur"
#the third and final variable is the standard deviation of duration i.e. "SDDur"
df_filtered<-df%>%
filter(MonthYear<=i)
callerData<-df_filtered%>%
group_by(id_sample)%>%
summarise(Count=n(),MeanDur=mean(Duration),SDDur=sd(Duration))
#convert NA to zero's
callerData$SDDur[is.na(callerData$SDDur)]<-0.0
#scale data
scaledData<-scale(callerData[,2:4])
set.seed(20)
clust<-kmeans(scaledData, centers= 5,nstart = 15)
#pinning cluster number back onto callerData
callerData$Cluster<-clust$cluster
#getting cluster means and creating a rank order based on "Count"
callerData_centers<-callerData%>%
group_by(Cluster)%>%
summarise(Count=mean(Count),MeanDur=mean(MeanDur),SDDur=mean(SDDur))%>%
arrange(Count)
callerDate_centers$Rank<-c(1:5)
#Once the new ranking variable is created, I then use the code below to consistently name the clusters based on their rank
setDT(callerData_centers)[Rank==1,ClusName:="Cluster 1"]
callerData_centers[Rank==2,ClusName:="Cluster 2"]
callerData_centers[Rank==3,ClusName:="Cluster 3"]
callerData_centers[Rank==4,ClusName:="Cluster 4"]
callerData_centers[Rank==5,ClusName:="Cluster 5"]
#get the ClusName variable and the Cluster; this is then used to merge the new name back onto callerData
callerData_vars<-callerData_centers%>%select(Cluster,ClusName)
callerData<-merge(callerData,callerData_vars, by="Cluster")
newVars<-callerData%>%
select(CallerId,ClusterName)%>%
mutate(MonthYear=i)
resultsdf<-rbind(resultsdf,newVars)
}
head(resultsdf)
So the code filters backward from MonthYear<=i
and clusters it. But what I would like to know is there a way for me to detect if certain users move from one cluster to another or disappear? Say for instance, id_sample=abcdef
is in cluster 1 for 7 months but then moves to cluster 4 after the 7 months. How could I detect this?
I had to change your code a bit to make it run. CallerId
and ClusterName
are not part of callerData
. So first run this:
library(Pareto)
library(uuid)
library(ggplot2)
library(data.table)
library(zoo)
library(dplyr)
#generating the user ID variable
#generating the user ID variable
set.seed(1)
n_users <- 1300
n_rows <- 365000
relative_probs <-
rPareto(
n = n_users,
t = 1,
alpha = 0.3,
truncation = 500
)
unique_ids <- UUIDgenerate(n = n_users)
id_sample <-
sample(unique_ids,
size = n_rows,
prob = relative_probs,
replace = TRUE)
id_sample
df <- data.frame(id_sample)
#creating the date variable
df$Date <-
sample(seq(as.Date("2015-01-01"), as.Date("2017-12-31"), by = "1 day"),
size = n_rows,
replace = T)
#creating a numeric value called Duration; this will be a feature in the clustering
df$Duration <- sample(0:3000, nrow(df), replace = T)
df <- df %>% arrange(Date)
#getting Month-Year
df$MonthYear <- as.Date(as.yearmon(df$Date, "%m/%Y"))
MonthYear <- unique(df$MonthYear)
#empty results df
resultsdf <- data.frame()
for (i in MonthYear) {
#getting variables for clustering. I need to cluster based on the number of times the User appears in the data i.e. "Count"
#the second variable is the mean duration for each User ID i.e. "MeanDur"
#the third and final variable is the standard deviation of duration i.e. "SDDur"
df_filtered <- df %>%
filter(MonthYear <= i)
callerData <- df_filtered %>%
group_by(id_sample) %>%
summarise(
Count = n(),
MeanDur = mean(Duration),
SDDur = sd(Duration)
)
#convert NA to zero's
callerData$SDDur[is.na(callerData$SDDur)] <- 0.0
#scale data
scaledData <- scale(callerData[, 2:4])
set.seed(20)
clust <- kmeans(scaledData, centers = 5, nstart = 15)
#pinning cluster number back onto callerData
callerData$Cluster <- clust$cluster
#getting cluster means and creating a rank order based on "Count"
callerData_centers <- callerData %>%
group_by(Cluster) %>%
summarise(
Count = mean(Count),
MeanDur = mean(MeanDur),
SDDur = mean(SDDur)
) %>%
arrange(Count)
callerData_centers$Rank <- c(1:5)
#Once the new ranking variable is created, I then use the code below to consistently name the clusters based on their rank
setDT(callerData_centers)[Rank == 1, ClusName := "Cluster 1"]
callerData_centers[Rank == 2, ClusName := "Cluster 2"]
callerData_centers[Rank == 3, ClusName := "Cluster 3"]
callerData_centers[Rank == 4, ClusName := "Cluster 4"]
callerData_centers[Rank == 5, ClusName := "Cluster 5"]
#get the ClusName variable and the Cluster; this is then used to merge the new name back onto callerData
callerData_vars <-
callerData_centers %>% select(Cluster, ClusName)
callerData <- merge(callerData, callerData_vars, by = "Cluster")
newVars <- callerData %>%
select(id_sample, ClusName) %>%
mutate(MonthYear = i)
resultsdf <- rbind(resultsdf, newVars)
}
Then, you can reshape your dataframe to wide format and see that some of the ids will jump from cluster to cluster (e.g. in row 5 from column 4 to 5)
# long to wide
dfwide <- data.table::dcast(resultsdf, formula = id_sample ~ MonthYear, value.var = 'ClusName')
colnames(dfwide)[2:37] <- paste0('Date_', MonthYear)
> dfwide[1:5, 1:5]
id_sample Date_2015-01-01 Date_2015-02-01 Date_2015-03-01 Date_2015-04-01
1 0025a4ba-d620-4ffc-a82d-660354c2b21d <NA> Cluster 3 Cluster 3 Cluster 3
2 00403759-46d8-4c60-b298-a57e6299b2ca Cluster 3 Cluster 3 Cluster 3 Cluster 3
3 005e6e19-8e02-4326-993d-1fffa0b70c67 Cluster 4 Cluster 4 Cluster 4 Cluster 4
4 007c99ef-7e37-42c0-8883-90c275c03eab Cluster 3 Cluster 3 Cluster 3 Cluster 3
5 007e70f9-d960-4679-8088-a1065ea9835c Cluster 3 Cluster 3 Cluster 3 Cluster 2
You can then check which IDs did not stay within one cluster:
# check if they stay in their cluster:
dfwide_subset <- na.omit(dfwide) # drop rows with NAs
res <- dfwide_subset[, c(2:5)] == dfwide_subset[, 2]
res <- data.frame('id_sample' = dfwide_subset$id_sample,
'switches_cluster' = rowSums(!res))
> head(res)
id_sample switches_cluster
2 00403759-46d8-4c60-b298-a57e6299b2ca 0
3 005e6e19-8e02-4326-993d-1fffa0b70c67 0
4 007c99ef-7e37-42c0-8883-90c275c03eab 0
5 007e70f9-d960-4679-8088-a1065ea9835c 1
6 00b90528-5ee1-40f2-a2ba-5b6fc4b3707f 0
8 00d5cf0c-e2b4-4ed6-a69c-776b83ff8697 0