r loops conditional-statements rules cycle

create a rule in R to count number of consultations per patient per day

I have created the following dataset with key scenarios that I have in my actual dataset:

df <- data.frame (organisation_id  = c("1","1","2","2","2","2","2","2","3","3","3","3","3","4","4","4","4","4","4","4","4","4","4","4","4","4","4","4","4","4","4","4","4","4"),
                  patient_id = c("1230","1230","1222","1222","1244","1244","987","987","2223","2223","2247","2247","2247","1234","1234","1234","1234","1234","1234","1234","1234","1239","1239","1239","3322","3322","3322","5434","5434","4488","4488","4488","1250","1250"),
                  date = c("08-02-2018","08-02-2018","12-01-2018","12-01-2018","12-01-2018","22-02-2018","12-01-2018","22-02-2018","01-03-2019","01-03-2019","01-03-2019","01-03-2019","01-03-2019","12-07-2020","12-07-2020","12-07-2020","12-07-2020","12-07-2020","12-07-2020","12-07-2020","12-07-2020","13-07-2020","13-07-2020","13-07-2020","16-06-2021","16-06-2021","16-06-2021","14-05-2019","14-05-2019","17-03-2020","17-03-2020","17-03-2020","03-02-2019","03-02-2019"),
                  consultation_mode = c("Telephone","Face-to-Face","Telephone","Telephone","Face-to-Face","Face-to-Face","Telephone","Telephone","Home visit","Home visit","Face-to-Face","Face-to-Face","Face-to-Face","Telephone","Telephone","Telephone","Telephone","Face-to-Face","Face-to-Face","Face-to-Face","Face-to-Face","Home visit","Home visit","Home visit","Face-to-Face","Telephone","Face-to-Face","Telephone","Face-to-Face","Face-to-Face","Telephone","Telephone","Face-to-Face","Face-to-Face"),
                  professional_id = c("24","11","123","110","123","110","123","333","444","444","444","444","444","1133","12","25","26","12","34","35","38","44","44","5556","443","443","445","29","29","555","5556","12","1133","113663"),
                  professional_role = c("Doctor","Support","Doctor","Support","Doctor","Support","Doctor","Nurse","Doctor","Doctor","Doctor","Doctor","Doctor","Support","Support","Nurse","Nurse","Support","Doctor","Doctor","Nurse","Nurse","Nurse","Doctor","Doctor","Doctor","Doctor","Doctor","Doctor","Doctor","Doctor","Support","Support","Support"),
                  professional_name = c("Dr John Taylor","Mary Wright","Dr Patricia Jones","James Davies","Dr Patricia Jones","James Davies","Dr Patricia Jones","Peter Hall","Dr Mary Wilson","Dr Mary Wilson","Dr Mary Wilson","Dr Mary Wilson","Dr Mary Wilson","Mary Wright","Anthony Patel","Jennifer Walker","Jennifer Walker","Anthony Patel","Dr Carol Bell","Dr Carol Bell","Deborah Dixon","Kevin R Collins","Kevin Collins","Dr Robert Brown","Dr Mary Wilson","Dr Mary Wilson","Dr John Snow","Dr John Taylor","Dr John Taylor","Dr James Smith","Dr Robert Brown","Anthony Patel","Mary Wright","Mary TEST Wright")
)

df$organisation_id <- as.factor(df$organisation_id)
df$patient_id <- as.factor(df$patient_id)
df$date <- as.Date(df$date, "%d-%m-%Y")
df$consultation_mode <- as.factor(df$consultation_mode)
df$professional_id <- as.factor(df$professional_id)
df$professional_role <- as.factor(df$professional_role)

I want to create two extra columns (include? and Nr_consultations_per_Pt_day) as per the below:

For each organisation_id, patient_id, date and consultation_mode check:

1- If there is only 1 row, include? = 1 and Nr_consultations_per_Pt_day = 1 for that professional_role.

2- If there is more than 1 row, include? = 1 for each different professional_id and professional_name with consultation_role = 'Doctor' or 'Nurse'.

Note: if there are 2+ entries for ‘Doctor’ or ‘Nurse’ with different professional_id but same professional_name, the first row gets include? = 1 and the following rows include? = 0. E.g. 25 / 26 IDs for Jennifer Walker. Similarly, if there are 2+ entries for ‘Doctor’ or ‘Nurse’ with same professional_id but different professional_name, the first row gets include? = 1 and the following rows include? = 0. E.g. 44 ID for Kevin R Collins / Kevin Collins.

2.1- If there is 0 'Doctor' or 'Nurse' (all ‘Support’), then the first row gets include? = 1 and the following rows include? = 0, with Nr_consultations_per_Pt_day = 1 for that professional_role.

Intermediate dataset:

organisation_id	patient_id	date	consultation_mode	professional_id	professional_role	professional_name	include?
1	1230	08-02-2018	Telephone	24	Doctor	Dr John Taylor	1
1	1230	08-02-2018	Face-to-Face	11	Support	Mary Wright	1
2	1222	12-01-2018	Telephone	123	Doctor	Dr Patricia Jones	1
2	1222	12-01-2018	Telephone	110	Support	James Davies	0
2	1244	12-01-2018	Face-to-Face	123	Doctor	Dr Patricia Jones	1
2	1244	22-02-2018	Face-to-Face	110	Support	James Davies	1
2	987	12-01-2018	Telephone	123	Doctor	Dr Patricia Jones	1
2	987	22-02-2018	Telephone	333	Nurse	Peter Hall	1
3	2223	01-03-2019	Home visit	444	Doctor	Dr Mary Wilson	1
3	2223	01-03-2019	Home visit	444	Doctor	Dr Mary Wilson	0
3	2247	01-03-2019	Face-to-Face	444	Doctor	Dr Mary Wilson	1
3	2247	01-03-2019	Face-to-Face	444	Doctor	Dr Mary Wilson	0
3	2247	01-03-2019	Face-to-Face	444	Doctor	Dr Mary Wilson	0
4	1234	12-07-2020	Telephone	1133	Support	Mary Wright	0
4	1234	12-07-2020	Telephone	12	Support	Anthony Patel	0
4	1234	12-07-2020	Telephone	25	Nurse	Jennifer Walker	1
4	1234	12-07-2020	Telephone	26	Nurse	Jennifer Walker	0
4	1234	12-07-2020	Face-to-Face	12	Support	Anthony Patel	0
4	1234	12-07-2020	Face-to-Face	34	Doctor	Dr Carol Bell	1
4	1234	12-07-2020	Face-to-Face	35	Doctor	Dr Carol Bell	0
4	1234	12-07-2020	Face-to-Face	38	Nurse	Deborah Dixon	1
4	1239	13-07-2020	Home visit	44	Nurse	Kevin R Collins	1
4	1239	13-07-2020	Home visit	44	Nurse	Kevin Collins	0
4	1239	13-07-2020	Home visit	5556	Doctor	Dr Robert Brown	1
4	3322	16-06-2021	Face-to-Face	443	Doctor	Dr Mary Wilson	1
4	3322	16-06-2021	Telephone	443	Doctor	Dr Mary Wilson	1
4	3322	16-06-2021	Face-to-Face	445	Doctor	Dr John Snow	1
4	5434	14-05-2019	Telephone	29	Doctor	Dr John Taylor	1
4	5434	14-05-2019	Face-to-Face	29	Doctor	Dr John Taylor	1
4	4488	17-03-2020	Face-to-Face	555	Doctor	Dr James Smith	1
4	4488	17-03-2020	Telephone	5556	Doctor	Dr Robert Brown	1
4	4488	17-03-2020	Telephone	12	Support	Anthony Patel	0
4	1250	03-02-2019	Face-to-Face	1133	Support	Mary Wright	1
4	1250	03-02-2019	Face-to-Face	113663	Support	Mary TEST Wright	0

Final dataset: Example for one organisation_id,patient_id,date and for each category of consultation_mode and professional_role.

organisation_id	patient_id	date	consultation_mode	professional_role	Nr_consultations_per_Pt_day
1	1230	08-02-2018	Face-to-Face	Doctor	0
1	1230	08-02-2018	Face-to-Face	Nurse	0
1	1230	08-02-2018	Face-to-Face	Support	1
1	1230	08-02-2018	Telephone	Doctor	1
1	1230	08-02-2018	Telephone	Nurse	0
1	1230	08-02-2018	Telephone	Support	0
1	1230	08-02-2018	Home visit	Doctor	0
1	1230	08-02-2018	Home visit	Nurse	0
1	1230	08-02-2018	Home visit	Support	0

etc.

Any ideas on how to do this in R in an efficient way?

Solution

If I understand your description correctly, for each row we want to evaluate the following conditions to decide whether include? = 1:

The row's group size for organisation_id-patient_id-date-consultation_mode is 1
The row's group size for organisation_id-patient_id-date-consultation_mode is greater than 1 AND the row corresponds to a:
1. Doctor AND is the first among doctors with the same id/name
2. Nurse AND is the first among nurses with the same id/name
3. Support AND is the first among support AND is part of a organisation_id-patient_id-date-consultation_mode group that has no doctor or nurse

This logic will create the "intermediate" table. To create the "final" table, we go through each category of consultation_mode and professional_role and set Nr_consultations_per_Pt_day = 1 if there's a corresponding entry with include? = 1.

Based on the above expectation, here's how I'd do it:

library(tidyverse)

# For each row, add the size of its 
# organisation_id-patient_id-date-consultation_mode group
df2 <- df %>% group_by(organisation_id, patient_id, date, consultation_mode) %>% 
    mutate(group_size = n()) %>% ungroup()

# For each row, indicate whether it's the first entry of 
# organisation_id-patient_id-date-consultation_mode-professional_role group 
# of people with the SAME NAME but possiblly different ID
df3 <- df2 %>% group_by(organisation_id, patient_id, date, consultation_mode, 
        professional_role, professional_name) %>% 
    mutate(first_by_name = row_number()==1) %>% 
    ungroup()

# For each row, indicate whether it's the first entry of 
# organisation_id-patient_id-date-consultation_mode-professional_role group 
# of people with the SAME ID but possiblly different name
df4 <- df3 %>% group_by(organisation_id, patient_id, date, consultation_mode, 
        professional_role, professional_id) %>% 
    mutate(first_by_id = row_number()==1) %>% 
    ungroup()

# For each row, indicate whether there's no doctor/nurse in its 
# organisation_id-patient_id-date-consultation_mode
# and indicate the first entry in such support-only group
df5 <- df4 %>% group_by(organisation_id, patient_id, date, consultation_mode) %>% 
    mutate(support_only_group = length(intersect(professional_role, c("Doctor", "Nurse"))) == 0) %>% 
    mutate(first_in_support_only = row_number()==1 & support_only_group) %>% 
    ungroup()

# Apply rules to determine the inclusion status of each row
df6 <- df5 %>% mutate(`include?` = if_else(
        group_size == 1 | 
        (professional_role %in% c("Doctor","Nurse") & (first_by_name & first_by_id)) |
        first_in_support_only, 1, 0)) 
df6

Convert into the final table:

# Convert into the final table
df7 <- df6 %>% 
    select(-c(group_size, first_by_name, first_by_id, support_only_group, first_in_support_only)) %>% 
    group_by(organisation_id, patient_id, date) %>% 
    expand(consultation_mode, professional_role) %>%
    left_join(df6) %>%
    mutate(Nr_consultations_per_Pt_day = replace_na(`include?`,0)) %>%
    select(-c(professional_id, professional_name, `include?`)) %>%
    group_by(organisation_id, patient_id, date, consultation_mode, professional_role) %>%
    summarise(Nr_consultations_per_Pt_day = sum(Nr_consultations_per_Pt_day))

df7 %>% filter(patient_id %in% c(2223, 1250, 1230))