Search code examples
rstringcharactercombinationsoverlap

Find overlapping letters in words


I have a string with only three words like this:

first_string <- c("self", "funny", "nymph")

As you can see the words of this vector can all be put together to one word because there is some overlap in letters, i.e. we get selfunnymph. Let`s call this a word train.

Besides, I have another vector with many words. Let the second vector be:

second_string <- c("house", "garden", "duck", "evil", "fluff")

I want to know what words of the second string can be added to the word train. In this case this is house and fluff (house can be added in the end of selfunnymph and fluff can be put between self and funny). So the expected output here would be:

expected <- data.frame(word= c("house", "fluff"), word_train= c("selfunnymphouse", "selfluffunnymph"))

The overlap can be of any length, i.e. self and funny overlap only with one character but funny and nymph overlap in two characters.

EDIT

The new word can change the word order of the first word train. For example, if the second vector contains the word hugs we can make the word train nymphugselfunny, which puts nymph before self and funny.


Solution

  • I'm wondering why you asked this, but it was a fun exercise regardless. Here's my implementation:

    library('dplyr')
    
    
    # define cars -------------------------------------------------------------
    
    original_cars <- c("self", "funny", "nymph")
    new_cars <- c("house", "garden", "duck", "evil", "fluff")
    cars <- c(original_cars, new_cars)
    
    
    # get all possible connections ('parts') per car --------------------------
    
    car_parts <- lapply(seq_along(cars), \(car_id) {
      
      car = cars[car_id]
      n = nchar(car)
      
      ids <- rep(car_id, n)
      names <- rep(car, n)
      left <- vapply(seq_len(n), \(i) substr(car, 1, i), "")
      right <- vapply(seq_len(n), \(i) substr(car, n-i+1, n), "")
      overlap <- nchar(left)
      
      data.frame(car.id = ids, car.name = names, left = left, right = right, overlap = overlap)
      
    }) |> do.call(rbind, args=_)
    
    # > car_parts
    #    car.id car.name   left  right overlap
    # 1       1     self      s      f       1
    # 2       1     self     se     lf       2
    # 3       1     self    sel    elf       3
    # 4       1     self   self   self       4
    # 5       2    funny      f      y       1
    # 6       2    funny     fu     ny       2
    # 7       2    funny    fun    nny       3
    # 8       2    funny   funn   unny       4
    # 9       2    funny  funny  funny       5
    # 10      3    nymph      n      h       1
    # [...]
    
    
    # get all possible connections between two cars ---------------------------
    
    connections <- inner_join(car_parts |> select(-left),
               car_parts |> select(-right),
               by = c('overlap', 'right' = 'left'),
               suffix = c('.left', '.right')) |>
      filter(car.id.left != car.id.right) |>
      mutate(connection.id = row_number()) |>
      select(connection.id, car.id.left, car.id.right, car.name.left, car.name.right, coupling = right)
    
    rm(car_parts)
    
    # > connections
    #   connection.id car.id.left car.id.right car.name.left car.name.right coupling
    # 1             1           1            2          self          funny        f
    # 2             2           1            8          self          fluff        f
    # 3             3           2            3         funny          nymph       ny
    # 4             4           3            4         nymph          house        h
    # 5             5           4            7         house           evil        e
    # 6             6           4            1         house           self       se
    # 7             7           5            3        garden          nymph        n
    # 8             8           8            2         fluff          funny        f
    
    
    # function to store valid trains ------------------------------------------
    
    # example:
    # valid_trains <- list()
    # valid_trains <- add_valid_train( valid_trains, c(1, 8), c(2) )
    
    add_valid_train <- function(valid_trains, train_cars, train_connections) {
      
      names = c(cars[train_cars[1]],
                vapply(train_connections, \(x) connections$car.name.right[x], "") )
      
      couplings = vapply(train_connections, \(x) connections$coupling[x], "")
      
      append(valid_trains, list(list(cars = train_cars, names = names, couplings = couplings)))
      
    }
    
    
    # function to recursively find next cars to add to train ------------------
    
    # example:
    # add_car(9, 5, c(1,2,3), c(1,3,5))
    
    add_car <- function(valid_trains, new_car, new_connection = NULL, train_cars = c(), train_connections = c(), depth = 0) {
      
      cat(strrep('   ',depth), cars[new_car],'\n', sep='')
      
      # store current train as valid
      train_cars <- c(train_cars, new_car)
      train_connections <- c(train_connections, new_connection)
      
      # find next possible cars to add; save train if no more options, otherwise add all options
      options <- connections |> filter(car.id.left == new_car, ! car.id.right %in% train_cars)
      if(nrow(options) == 0) valid_trains <- add_valid_train(valid_trains, train_cars, train_connections) # save only the longest options
      for(i in seq_len(nrow(options))) valid_trains <- add_car(valid_trains, options$car.id.right[i], options$connection.id[i], train_cars, train_connections, depth+1)
      
      return(valid_trains)
      
    }
    
    
    # get all valid trains ----------------------------------------------------
    
    valid_trains <- list()
    for(i in seq_along(cars)) add_car(valid_trains, i) -> valid_trains
    
    # filter valid trains that have all cars from `original_cars` -------------
    
    mask <- vapply(valid_trains, \(x) all(seq_along(original_cars) %in% x$cars), T)
    
    new_trains <- lapply(valid_trains[mask], \(x) {
      x$newcars <- setdiff(x$cars, seq_along(original_cars))
      x$newnames <- cars[x$newcars]
      x
    })
    
    # print names of all trains that contain all 'original' cars:
    #
    # > sapply(new_trains, \(x) x$names)
    # [[1]] "self"  "funny" "nymph" "house" "evil" 
    # [[2]] "self"  "fluff" "funny" "nymph" "house" "evil" 
    # [[3]] "funny" "nymph" "house" "self"  "fluff"
    # [[4]] "nymph" "house" "self"  "funny"
    # [[5]] "nymph" "house" "self"  "fluff" "funny"
    # [[6]] "house" "self"  "funny" "nymph"
    # [[7]] "house" "self"  "fluff" "funny" "nymph"
    # [[8]] "garden" "nymph"  "house"  "self"   "funny" 
    # [[9]] "garden" "nymph"  "house"  "self"   "fluff"  "funny" 
    # [[10]] "fluff" "funny" "nymph" "house" "self" 
    
    ## All possible trains are in `valid_trains`, all of those where *all* the original cars are used are in `new_trains`.
    ## 
    ## It is possible that some trains are subsets of others.
    

    edit: When I looked at your own implementation, I thought you were interested in the longest possible trains. Now you explained the purpose, I adapted the algorithm to take the original cars, and see which of the new cars could be added individually to the original set. With the previous code, a long list of potential new names would have created some huge trains that would be very unfeasible for naming a family.

    library('dplyr')
    
    
    # define cars -------------------------------------------------------------
    
    original_cars <- c("self", "funny", "nymph")
    new_cars <- c("house", "garden", "duck", "evil", "fluff")
    
    
    # function to get all possible connections between a set of cars ----------
    
    # example:
    # cars <- c("self", "funny", "nymph", "house")
    # get_connections(cars)
    #
    # > get_connections(c("self", "funny", "nymph", "house"))
    #   connection.id car.id.left car.id.right car.name.left car.name.right coupling
    # 1             1           1            2          self          funny        f
    # 2             2           2            3         funny          nymph       ny
    # 3             3           3            4         nymph          house        h
    # 4             4           4            1         house           self       se
    
    get_connections <- function(cars) {
      
      # get all connections the cars can make
      car_parts <- lapply(seq_along(cars), \(car_id) {
        
        car = cars[car_id]
        n = nchar(car)
        
        ids <- rep(car_id, n)
        names <- rep(car, n)
        left <- vapply(seq_len(n), \(i) substr(car, 1, i), "")
        right <- vapply(seq_len(n), \(i) substr(car, n-i+1, n), "")
        overlap <- nchar(left)
        
        data.frame(car.id = ids, car.name = names, left = left, right = right, overlap = overlap)
        
      }) |> do.call(rbind, args=_)
      
      # > car_parts
      #    car.id car.name   left  right overlap
      # 1       1     self      s      f       1
      # 2       1     self     se     lf       2
      # 3       1     self    sel    elf       3
      # 4       1     self   self   self       4
      # 5       2    funny      f      y       1
      # 6       2    funny     fu     ny       2
      # [...]
      
      # return all possible connections between two cars
      
      inner_join(car_parts |> select(-left),
                                car_parts |> select(-right),
                                by = c('overlap', 'right' = 'left'),
                                suffix = c('.left', '.right')) |>
        filter(car.id.left != car.id.right) |>
        mutate(connection.id = row_number()) |>
        select(connection.id, car.id.left, car.id.right, car.name.left, car.name.right, coupling = right)
      
    }
    
    
    # function to store valid trains ------------------------------------------
    
    # example:
    # cars <- c("self", "funny", "nymph", "house")
    # connections <- get_connections(cars)
    # valid_trains <- list()
    # valid_trains <- add_valid_train( cars, connections, valid_trains, c(2, 3), c(2) )
    
    add_valid_train <- function(cars, connections, valid_trains, train_cars, train_connections) {
      
      names = c(cars[train_cars[1]],
                vapply(train_connections, \(x) connections$car.name.right[x], "") )
      
      couplings = vapply(train_connections, \(x) connections$coupling[x], "")
      
      append(valid_trains, list(list(cars = train_cars, names = names, couplings = couplings)))
      
    }
    
    
    # function to recursively find next cars to add to train ------------------
    
    # example:
    # cars <- c("self", "funny", "nymph", "house")
    # connections <- get_connections(cars)
    # valid_trains <- list()
    # add_car(cars, connections, valid_trains, 2)
    
    add_car <- function(cars, connections, valid_trains, new_car, new_connection = NULL, train_cars = c(), train_connections = c(), depth = 0) {
      
      cat(strrep('   ',depth), cars[new_car], '\n', sep='')
      
      # store current train as valid
      train_cars <- c(train_cars, new_car)
      train_connections <- c(train_connections, new_connection)
      
      # find next possible cars to add
      options <- connections |> filter(car.id.left == new_car, ! car.id.right %in% train_cars)
      for(i in seq_len(nrow(options))) valid_trains <- add_car(cars, connections, valid_trains, options$car.id.right[i], options$connection.id[i], train_cars, train_connections, depth+1)
      
      # save train if no more options
      if(nrow(options) == 0) valid_trains <- add_valid_train(cars, connections, valid_trains, train_cars, train_connections)
      
      return(valid_trains)
      
    }
    
    
    # find individual new cars that can be added to existing cars --------------
    
    results <- lapply(new_cars, function(new_car) {
      
      cat('adding "',new_car,'":\n', sep='')
      cars <- c(original_cars, new_car)
      connections <- get_connections(cars)
      
      # get all possible trains
      valid_trains <- list()
      for(i in seq_along(cars)) add_car(cars, connections, valid_trains, i) -> valid_trains
      
      cat('\n')
      
      # return only trains where all cars are used
      valid_trains <- valid_trains[ sapply(valid_trains, \(x) length(x$cars)) == length(cars) ]
      return(list(new_car = new_car, options = length(valid_trains), trains = valid_trains))
    })
    
    for(result in results) {
      cat('\n', result$new_car, ': ', result$options, ' options ', sep='')
      for(train in result$trains) {
        cat('[',train$names,'] ')
      }
    }
    # detailed results are in `results`
    
    house: 4 options [ self funny nymph house ] [ funny nymph house self ] [ nymph house self funny ] [ house self funny nymph ] 
    garden: 0 options 
    duck: 0 options 
    evil: 0 options 
    fluff: 1 options [ self fluff funny nymph ]