I have a string with only three words like this:
first_string <- c("self", "funny", "nymph")
As you can see the words of this vector can all be put together to one word because there is some overlap in letters, i.e. we get selfunnymph. Let`s call this a word train.
Besides, I have another vector with many words. Let the second vector be:
second_string <- c("house", "garden", "duck", "evil", "fluff")
I want to know what words of the second string can be added to the word train. In this case this is house
and fluff
(house
can be added in the end of selfunnymph and fluff
can be put between self
and funny
). So the expected output here would be:
expected <- data.frame(word= c("house", "fluff"), word_train= c("selfunnymphouse", "selfluffunnymph"))
The overlap can be of any length, i.e. self and funny overlap only with one character but funny and nymph overlap in two characters.
EDIT
The new word can change the word order of the first word train. For example, if the second vector contains the word hugs
we can make the word train nymphugselfunny, which puts nymph
before self
and funny
.
I'm wondering why you asked this, but it was a fun exercise regardless. Here's my implementation:
library('dplyr')
# define cars -------------------------------------------------------------
original_cars <- c("self", "funny", "nymph")
new_cars <- c("house", "garden", "duck", "evil", "fluff")
cars <- c(original_cars, new_cars)
# get all possible connections ('parts') per car --------------------------
car_parts <- lapply(seq_along(cars), \(car_id) {
car = cars[car_id]
n = nchar(car)
ids <- rep(car_id, n)
names <- rep(car, n)
left <- vapply(seq_len(n), \(i) substr(car, 1, i), "")
right <- vapply(seq_len(n), \(i) substr(car, n-i+1, n), "")
overlap <- nchar(left)
data.frame(car.id = ids, car.name = names, left = left, right = right, overlap = overlap)
}) |> do.call(rbind, args=_)
# > car_parts
# car.id car.name left right overlap
# 1 1 self s f 1
# 2 1 self se lf 2
# 3 1 self sel elf 3
# 4 1 self self self 4
# 5 2 funny f y 1
# 6 2 funny fu ny 2
# 7 2 funny fun nny 3
# 8 2 funny funn unny 4
# 9 2 funny funny funny 5
# 10 3 nymph n h 1
# [...]
# get all possible connections between two cars ---------------------------
connections <- inner_join(car_parts |> select(-left),
car_parts |> select(-right),
by = c('overlap', 'right' = 'left'),
suffix = c('.left', '.right')) |>
filter(car.id.left != car.id.right) |>
mutate(connection.id = row_number()) |>
select(connection.id, car.id.left, car.id.right, car.name.left, car.name.right, coupling = right)
rm(car_parts)
# > connections
# connection.id car.id.left car.id.right car.name.left car.name.right coupling
# 1 1 1 2 self funny f
# 2 2 1 8 self fluff f
# 3 3 2 3 funny nymph ny
# 4 4 3 4 nymph house h
# 5 5 4 7 house evil e
# 6 6 4 1 house self se
# 7 7 5 3 garden nymph n
# 8 8 8 2 fluff funny f
# function to store valid trains ------------------------------------------
# example:
# valid_trains <- list()
# valid_trains <- add_valid_train( valid_trains, c(1, 8), c(2) )
add_valid_train <- function(valid_trains, train_cars, train_connections) {
names = c(cars[train_cars[1]],
vapply(train_connections, \(x) connections$car.name.right[x], "") )
couplings = vapply(train_connections, \(x) connections$coupling[x], "")
append(valid_trains, list(list(cars = train_cars, names = names, couplings = couplings)))
}
# function to recursively find next cars to add to train ------------------
# example:
# add_car(9, 5, c(1,2,3), c(1,3,5))
add_car <- function(valid_trains, new_car, new_connection = NULL, train_cars = c(), train_connections = c(), depth = 0) {
cat(strrep(' ',depth), cars[new_car],'\n', sep='')
# store current train as valid
train_cars <- c(train_cars, new_car)
train_connections <- c(train_connections, new_connection)
# find next possible cars to add; save train if no more options, otherwise add all options
options <- connections |> filter(car.id.left == new_car, ! car.id.right %in% train_cars)
if(nrow(options) == 0) valid_trains <- add_valid_train(valid_trains, train_cars, train_connections) # save only the longest options
for(i in seq_len(nrow(options))) valid_trains <- add_car(valid_trains, options$car.id.right[i], options$connection.id[i], train_cars, train_connections, depth+1)
return(valid_trains)
}
# get all valid trains ----------------------------------------------------
valid_trains <- list()
for(i in seq_along(cars)) add_car(valid_trains, i) -> valid_trains
# filter valid trains that have all cars from `original_cars` -------------
mask <- vapply(valid_trains, \(x) all(seq_along(original_cars) %in% x$cars), T)
new_trains <- lapply(valid_trains[mask], \(x) {
x$newcars <- setdiff(x$cars, seq_along(original_cars))
x$newnames <- cars[x$newcars]
x
})
# print names of all trains that contain all 'original' cars:
#
# > sapply(new_trains, \(x) x$names)
# [[1]] "self" "funny" "nymph" "house" "evil"
# [[2]] "self" "fluff" "funny" "nymph" "house" "evil"
# [[3]] "funny" "nymph" "house" "self" "fluff"
# [[4]] "nymph" "house" "self" "funny"
# [[5]] "nymph" "house" "self" "fluff" "funny"
# [[6]] "house" "self" "funny" "nymph"
# [[7]] "house" "self" "fluff" "funny" "nymph"
# [[8]] "garden" "nymph" "house" "self" "funny"
# [[9]] "garden" "nymph" "house" "self" "fluff" "funny"
# [[10]] "fluff" "funny" "nymph" "house" "self"
## All possible trains are in `valid_trains`, all of those where *all* the original cars are used are in `new_trains`.
##
## It is possible that some trains are subsets of others.
edit: When I looked at your own implementation, I thought you were interested in the longest possible trains. Now you explained the purpose, I adapted the algorithm to take the original cars, and see which of the new cars could be added individually to the original set. With the previous code, a long list of potential new names would have created some huge trains that would be very unfeasible for naming a family.
library('dplyr')
# define cars -------------------------------------------------------------
original_cars <- c("self", "funny", "nymph")
new_cars <- c("house", "garden", "duck", "evil", "fluff")
# function to get all possible connections between a set of cars ----------
# example:
# cars <- c("self", "funny", "nymph", "house")
# get_connections(cars)
#
# > get_connections(c("self", "funny", "nymph", "house"))
# connection.id car.id.left car.id.right car.name.left car.name.right coupling
# 1 1 1 2 self funny f
# 2 2 2 3 funny nymph ny
# 3 3 3 4 nymph house h
# 4 4 4 1 house self se
get_connections <- function(cars) {
# get all connections the cars can make
car_parts <- lapply(seq_along(cars), \(car_id) {
car = cars[car_id]
n = nchar(car)
ids <- rep(car_id, n)
names <- rep(car, n)
left <- vapply(seq_len(n), \(i) substr(car, 1, i), "")
right <- vapply(seq_len(n), \(i) substr(car, n-i+1, n), "")
overlap <- nchar(left)
data.frame(car.id = ids, car.name = names, left = left, right = right, overlap = overlap)
}) |> do.call(rbind, args=_)
# > car_parts
# car.id car.name left right overlap
# 1 1 self s f 1
# 2 1 self se lf 2
# 3 1 self sel elf 3
# 4 1 self self self 4
# 5 2 funny f y 1
# 6 2 funny fu ny 2
# [...]
# return all possible connections between two cars
inner_join(car_parts |> select(-left),
car_parts |> select(-right),
by = c('overlap', 'right' = 'left'),
suffix = c('.left', '.right')) |>
filter(car.id.left != car.id.right) |>
mutate(connection.id = row_number()) |>
select(connection.id, car.id.left, car.id.right, car.name.left, car.name.right, coupling = right)
}
# function to store valid trains ------------------------------------------
# example:
# cars <- c("self", "funny", "nymph", "house")
# connections <- get_connections(cars)
# valid_trains <- list()
# valid_trains <- add_valid_train( cars, connections, valid_trains, c(2, 3), c(2) )
add_valid_train <- function(cars, connections, valid_trains, train_cars, train_connections) {
names = c(cars[train_cars[1]],
vapply(train_connections, \(x) connections$car.name.right[x], "") )
couplings = vapply(train_connections, \(x) connections$coupling[x], "")
append(valid_trains, list(list(cars = train_cars, names = names, couplings = couplings)))
}
# function to recursively find next cars to add to train ------------------
# example:
# cars <- c("self", "funny", "nymph", "house")
# connections <- get_connections(cars)
# valid_trains <- list()
# add_car(cars, connections, valid_trains, 2)
add_car <- function(cars, connections, valid_trains, new_car, new_connection = NULL, train_cars = c(), train_connections = c(), depth = 0) {
cat(strrep(' ',depth), cars[new_car], '\n', sep='')
# store current train as valid
train_cars <- c(train_cars, new_car)
train_connections <- c(train_connections, new_connection)
# find next possible cars to add
options <- connections |> filter(car.id.left == new_car, ! car.id.right %in% train_cars)
for(i in seq_len(nrow(options))) valid_trains <- add_car(cars, connections, valid_trains, options$car.id.right[i], options$connection.id[i], train_cars, train_connections, depth+1)
# save train if no more options
if(nrow(options) == 0) valid_trains <- add_valid_train(cars, connections, valid_trains, train_cars, train_connections)
return(valid_trains)
}
# find individual new cars that can be added to existing cars --------------
results <- lapply(new_cars, function(new_car) {
cat('adding "',new_car,'":\n', sep='')
cars <- c(original_cars, new_car)
connections <- get_connections(cars)
# get all possible trains
valid_trains <- list()
for(i in seq_along(cars)) add_car(cars, connections, valid_trains, i) -> valid_trains
cat('\n')
# return only trains where all cars are used
valid_trains <- valid_trains[ sapply(valid_trains, \(x) length(x$cars)) == length(cars) ]
return(list(new_car = new_car, options = length(valid_trains), trains = valid_trains))
})
for(result in results) {
cat('\n', result$new_car, ': ', result$options, ' options ', sep='')
for(train in result$trains) {
cat('[',train$names,'] ')
}
}
# detailed results are in `results`
house: 4 options [ self funny nymph house ] [ funny nymph house self ] [ nymph house self funny ] [ house self funny nymph ]
garden: 0 options
duck: 0 options
evil: 0 options
fluff: 1 options [ self fluff funny nymph ]