I have a very simple problem: I want to check which persons in DF1 are contained in DF2. I want to do so based on their
I want to keep only those rows of DF1 and DF2 that are correct matches.
DF1 looks like this
(Edit: "XXX" to "Joe")
DF1 <- data.frame(row_ID = 1:13,
first_name = c("Jovana", "Jovana", "Jovana", "Joe", "Jovana", "Jovana", "Zuhair", "Jackson", "James", "Alexandria", "Nicole", "Carl", "Matthew"),
middle_name = c("Cole", "", "Joe", "Cole", "Cole", "Cole", "Beate", "Milhouse", "", "Macy", "Riley", "", ""),
last_name = c("Tossie", "Tossie", "Tossie", "Tossie", "Tossie", "Joe", "Biddison", "Beck", "Baker", "Maya", "Grinstead", "Domenico", "Hosler"),
date_of_birth = as.Date(c("1930-07-05","1930-07-05", "1930-07-05", "1930-07-05", "2000-01-01", "1930-07-05", "1939-04-18", "1936-11-11", "1933-02-18"," 1942-10-18", "1945-03-24", "1948-01-25", "1951-02-03")),
var_difference = c("none", "no middle name", "diff middle name", "first name", "date of birth", "last name", "middle name not abbr", "middle name incl", "no title", "middle name column", "columns", "columns", "columns"),
var_should_be_found = c("yes", "yes", "no", "no", "no", "no", "yes", "yes", "yes", "yes", "yes", "yes", "yes"))
DF2 looks like this:
(edit: BD of Zuhair Biddison from 1933-02-18 to 1939-04-18)
DF2 <- data.frame(row_ID = 1:20,
first_name = c("Jovana","Zuhair","Jackson","Dr. James","Alexandria Macy","Nicole Riley Grinstead","","","Isaiah","Wyatt","Rayyana","Dhaahir","Lauren",
"Tony","Aziza","Cody","Paige","Jasmine","Kawkab","Pedro"),
middle_name = c("Cole","B.","", "","","","","","Kyrie","","Zachary", "","Tyler","", "Brian",
"","Amy", "","Robert",""),
last_name = c("Tossie","Biddison","Beck","Baker","Maya","", "Carl Domenico","Hosler, Matthew","Bishop","Ericson","Leptich","Franks","Pummer","Neves","Ferguson","Jennings",
"Phillips","Wyatt","Caisse","Laplante"),
date_of_birth = as.Date(c("1930-07-05", "1939-04-18", "1936-11-11", "1939-04-18",
"1942-10-18", "1945-03-24", "1948-01-25", "1951-02-03",
"1954-05-27", "1957-08-05", "1960-08-01", "1963-11-26",
"1966-05-25", "1969-11-19", "1972-01-28", "1975-06-17",
"1978-07-24", "1981-07-11", "1984-10-28", "1987-09-14")),
var_other = sample(colours(), 20)
)
There are a number of flaws in DF2
Sometimes:
As said, in the end, I would like to keep just the rows of persons that appear in DF1 and DF2, throw away the rest of the rows, and merge the columns of DF1 and DF2.
Firstly, is there any convenient and quick function for this? (the problem seems simple enough, but I did not find any)
If not, here is what I have done. It works but it is too slow for my purposes. For DF1 (approx. 74000 obs) and one of several DF2 (beyond 100000 obs), it takes hours
I would be very thankful for any help!
My approach:
DF1$all_names <- paste(DF1$first_name,
DF1$middle_name,
DF1$last_name,
sep = " ")
DF2$all_names <- paste(DF2$first_name,
DF2$middle_name,
DF2$last_name,
sep = " ")
##########################
# FUNCTION: BD MATCH log #
##########################
BD_MATCH <- function(the_data, birthday){
not_precise_date <- T
not_found <- T
bd_found <- F
while(not_precise_date & !bd_found & nrow(the_data)> 1){
# check half
half_of_df <- ceiling(nrow(the_data)/2)
# is bd at half?
bd_found <- the_data[half_of_df, "date_of_birth"] == birthday
if(bd_found){bd_row_id <- the_data[half_of_df, "row_ID"]; break} # else{bd_row_id <- NULL}
# is the bd above or below
in_upper_half <- the_data[half_of_df, "date_of_birth"] >= birthday
# subset accordingly
if(in_upper_half){the_data <- the_data[1:half_of_df, ]
} else{the_data <- the_data[(half_of_df+1):nrow(the_data), ]}
}
if(bd_found){return(bd_row_id)} else{return(NA)}
}
###########################
# FUNCTION: BD MATCH tree #
###########################
# search above and below for duplicate bds
TREE_FUN <- function(the_bd_vec, the_row){
birthday <- the_bd_vec[the_row]
# search above
i <- the_row
bd_criterion <- T
while(bd_criterion & i>1){
i <- i-1
bd_criterion <- the_bd_vec[i] == birthday
}
begin <- ifelse(bd_criterion, 1, i+1)
# search below
i <- the_row
bd_criterion <- T
while(bd_criterion & i <= length(the_bd_vec)){
i <- i+1
bd_criterion <- the_bd_vec[i] == birthday
}
if(is.na(bd_criterion)|bd_criterion == F){
end <- i-1
} else{
end <- i
}
return(begin:end)
}
(this matches, i.a. persons who, for example, differ in their last names, but have matching first names, middle names and birthdays. This is incorrect, but very rare.)
##########
# SEARCH #
##########
res_list <- list()
for(j in 1:nrow(DF1)){
birthday <- DF1$date_of_birth[j]
DF1_name <- strsplit(DF1$all_names[j], split = " ")
# SEARCH BIRTHDAY
bd_row_id <- BD_MATCH(DF2, birthday)
# SEARCH NAME
if(is.na(bd_row_id)){
res_list[[j]] <- NA
} else{
the_row <- which(DF2$row_ID == bd_row_id)
the_bd_vec <- DF2$date_of_birth
begin_end <- TREE_FUN(the_bd_vec, the_row)
BD_subset <- DF2[begin_end, ]
##############
# NAME CHECK #
##############
DF2_name <- strsplit(BD_subset$all_names, split = " ")
the_vec <- NULL
nest <- list()
for(k in seq(DF2_name)){
if(sum(DF2_name[[k]] %in% DF1_name[[1]]) >= 2) {
the_vec <- c(the_vec, k)
nest[[k]] <- BD_subset[the_vec, ]
} else {
nest[[k]] <- NA
}
}
if(sum(is.na(nest)) == length(nest)){
res_list[[j]] <- NA
}
else{
res_list[[j]] <- bind_rows(nest[!is.na(nest)])
}
}
print(j)
}
found_DF1 <- DF1[which(!is.na(res_list)), ]
found_DF2 <- res_list[!is.na(res_list)]
for(i in seq(found_DF2)){
found_DF2[[i]] <- cbind(found_DF2[[i]], found_DF1[i , ])
}
found_DF2 <- bind_rows(found_DF2)
Avoid cleaning and calculation loops. Instead, consider cleaning up both data frames by vectorized operations to properly normalize names with each name column holding a single identifier. Then, run an rbind
of two merge
s first by all three names and second by first and last. Then run unique()
to de-duplicate rows.
within
(clean up with strsplit
and ifelse
)
Note: Below solution accommodates posted data and may need to be expanded for other data issues.
DF1_clean <- within(
DF1, {
first_name <- gsub("XXX", "", first_name)
middle_name <- gsub("XXX", "", middle_name)
last_name <- gsub("XXX", "", last_name)
}
)
DF2_clean <- within(
DF2, {
# FIRST NAME CLEANUP
first_temp <- trimws(gsub("Dr.|Mr.|Ms.|Mrs.", "", first_name))
first_name_ <- trimws(sapply(strsplit(first_temp, " "), `[`, 1))
middle_name_ <- trimws(sapply(strsplit(first_temp, " "), `[`, 2))
last_name_ <- trimws(sapply(strsplit(first_temp, " "), `[`, 3))
first_name <- ifelse(is.na(first_name_), first_name, first_name_)
middle_name <- ifelse(is.na(middle_name_), middle_name, middle_name_)
last_name <- ifelse(is.na(last_name_), last_name, last_name_)
# LAST NAME CLEANUP
last_temp <- trimws(gsub("Jr|Sr|III", "", last_name))
first_name_ <- ifelse(
grepl(",", last_temp),
sapply(strsplit(last_temp, ","), `[`, 2),
sapply(strsplit(last_temp, " "), `[`, 1)
)
last_name_ <- ifelse(
grepl(",", last_temp),
sapply(strsplit(last_temp, ","), `[`, 1),
sapply(strsplit(last_temp, " "), `[`, 2)
)
first_temp <- trimws(first_name)
first_name <- trimws(ifelse(first_temp=="", first_name_, first_name))
last_name <- trimws(ifelse(first_temp=="", last_name_, last_name))
# REMOVE HELPER TEMP COLUMNS
rm(first_temp, last_temp, first_name_, middle_name_, last_name_)
}
)
merge
+ rbind
final_df <- rbind.data.frame(
merge(
DF1_clean, DF2_clean,
by=c("first_name", "middle_name", "last_name", "date_of_birth"),
suffixes=c("_DF1", "_DF2")
),
merge(
DF1_clean, transform(DF2_clean, middle_name=NULL),
by=c("first_name", "last_name", "date_of_birth"),
suffixes=c("_DF1", "_DF2")
)
) |> unique()
Output
Note: Inverted birth dates of Zuhair Biddison and James Baker were fixed in OP's input data to match both data frames.
final_df
first_name middle_name last_name date_of_birth row_ID_DF1 var_difference var_should_be_found row_ID_DF2 var_other
1 Alexandria Macy Maya 1942-10-18 10 middle name column yes 5 gray58
2 Carl Domenico 1948-01-25 12 columns yes 7 mediumvioletred
3 James Baker 1933-02-18 9 no title yes 4 grey94
4 Jovana Cole Tossie 1930-07-05 1 none yes 1 bisque1
5 Matthew Hosler 1951-02-03 13 columns yes 8 wheat
6 Nicole Riley Grinstead 1945-03-24 11 columns yes 6 yellow1
9 Jackson Milhouse Beck 1936-11-11 8 middle name incl yes 3 steelblue1
11 Jovana Tossie 1930-07-05 3 diff middle name no 1 bisque1
13 Jovana Tossie 1930-07-05 2 no middle name yes 1 bisque1
16 Zuhair Beate Biddison 1939-04-18 7 middle name not abbr yes 2 orange2