Search code examples
rrename

Assign number for a column depending on its columns names in R


I have columns from May 2021 to April 2022. They have different values.

   2021-05-01.y 2021-06-01.y 2021-07-01.y 2021-08-01.y 2021-09-01.y 2021-10-01.y 2021-11-01.y 2021-12-01.y 2022-01-01.y 2022-02-01.y 2022-03-01.y 2022-04-01.y 2022-05-01.y
            0            0         5000            0         3000            0            0            0            0            0            0            0            0
            0            0            0            0            0            0            0            0            0            0            0            0            0
         1000            0         4000        12000        10000            0            0            0            0            0            0            0            0
            0            0            0            0            0            0            0            0            0            0            0            0            0
            0            0            0            0            0            0            0            0            0            0            0            0            0
            0            0            0            0            0            0            0            0            0            0            0            0            0

With this, you can reproduce the dataset.

structure(
  list(
    `2021-05-01.y` = c(0, 0, 1000, 0, 0, 0, 0, 0,
                       0, 0),
    `2021-06-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
    `2021-07-01.y` = c(5000,
                       0, 4000, 0, 0, 0, 0, 0, 0, 0),
    `2021-08-01.y` = c(0, 0, 12000,
                       0, 0, 0, 0, 0, 0, 0),
    `2021-09-01.y` = c(3000, 0, 10000, 0, 0,
                       0, 0, 0, 0, 0),
    `2021-10-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
                       0),
    `2021-11-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
    `2021-12-01.y` = c(0,
                       0, 0, 0, 0, 0, 0, 0, 0, 0),
    `2022-01-01.y` = c(0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0),
    `2022-02-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
                       0),
    `2022-03-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
    `2022-04-01.y` = c(0,
                       0, 0, 0, 0, 0, 0, 0, 0, 0),
    `2022-05-01.y` = c(0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0)
  ),
  row.names = c(NA, -10L),
  class = c("data.table",
            "data.frame"),
  .internal.selfref = < pointer:0x56460a868160 >
)

First I rename them:

df <- df %>%
  mutate(
    reaload_05 = `2021-05-01.y`,
    reaload_06 = `2021-06-01.y`,
    reaload_07 = `2021-07-01.y`,
    reaload_08 = `2021-08-01.y`,
    reaload_09 = `2021-09-01.y`,
    reaload_10 = `2021-10-01.y`,
    reaload_11 = `2021-11-01.y`,
    reaload_12 = `2021-12-01.y`,
    reaload_01 = `2022-01-01.y`,
    reaload_02 = `2022-02-01.y`,
    reaload_03 = `2022-03-01.y`,
    reaload_04 = `2022-04-01.y`
  ) %>%
  select(
    -c(
      `2021-05-01.y`,
      `2021-06-01.y`,
      `2021-07-01.y`,
      `2022-02-01.y`,
      `2022-03-01.y`,
      `2022-04-01.y`,
      `2021-08-01.y`,
      `2021-09-01.y`,
      `2021-10-01.y`,
      `2021-11-01.y`,
      `2021-12-01.y`,
      `2022-01-01.y`
    )
  ) 

Here I assign a number for each column.

I take the most current date and check if it is zero or not; if it's 0, I check the next most current month and so on until the start date:

df  <- df %>%
  mutate(months_before_reloading =
           ifelse(reaload_04 != 0, 0,
                  ifelse(
                    reaload_03 != 0, 1,
                    ifelse(reaload_02 != 0, 2,
                           ifelse(
                             reaload_01 != 0, 3,
                             ifelse(reaload_12 != 0, 4,
                                    ifelse(
                                      reaload_11 != 0, 5,
                                      ifelse(reaload_10 != 0, 6,
                                             ifelse(
                                               reaload_09 != 0, 7,
                                               ifelse(reaload_08 != 0, 8,
                                                      ifelse(
                                                        reaload_07 != 0, 9,
                                                        ifelse(reaload_06 != 0, 10,
                                                               ifelse(reaload_05 != 0, 11, "no reaload"))
                                                      ))
                                             ))
                                    ))
                           ))
                  )))

I was thinking about using rename. Something like this:

df %>%
  rename_at(vars(ends_with('.y')), funs(paste0('reaload_', .)))

Where I would have the entire date on its name. But i don't have any idea for the second part of this.

I have new months and I don't want to keep on doing it so manually


Solution

  • Here are a few suggestions:

    1. Try to use more parameters.

    I guess there is a hidden parameter somewhere that tells you your starting date for the analysis. Use that parameter (or create it) to define your column names. Mix that with another parameter that defines how many months you want to analyze (even if it's always the same amount; it might not be in the future).

    2. Use functions that do what you want to do as a whole instead of breaking up the process, if possible.

    Instead of using mutate() and then select(), you can just use rename().

    3. Try to post everything in English.

    That way it's easier for more people to understand. I do speak Spanish, so I do understand your context, but not everyone will.

    4. Avoid using ifelse(ifelse(ifelse(...))).

    Try to think: "What would happen if I needed to do this for 1000 months?". Always try to write your code in a way that could be easily scalable in the future. Also, avoid code duplication.

    It's very easy to make a mistake if you write a lot of ifelse()'s one by one. Prefer matrix calculations or parametric iterations.

    Context and changes for the proposed solution.

    With will use your dataset to define first_month, amount_of_month_for_analysis and last_month. After that, I will create the dataset I THINK you want to create. But if you do this, make sure to first define the parameters first_month and amount_of_month_for_analysis at the beginning of your code/project/pipeline.

    Also, I will change the names of the columns, because as Francesco Grossetti already said, those names can create problems if you want to use more months. Also, the order of your columns will change from month to month. I think it's better to use an indicator for relative order instead. Let's say our month 0 is the last month available, our month 1 is 1 month before month 0, and so on. That way you can analyze 24 months if you want (2 full years).

    I took the liberty to slightly change your sample dataset because it had too many zeroes and I wanted to test more cases for your last reload month.

    Proposed solution

    
    # Loading libraries -------------------------------------------------------
    
    library(tidyverse)
    library(lubridate)
    
    # Dample dataset definition -----------------------------------------------
    
    df = structure(
      list(
        `2021-05-01.y` = c(0, 0, 1000, 0, 0, 0, 0, 0,
                           0, 0),
        `2021-06-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        `2021-07-01.y` = c(5000,
                           0, 4000, 0, 0, 0, 0, 0, 0, 0),
        `2021-08-01.y` = c(0, 0, 12000,
                           0, 0, 0, 0, 0, 0, 0),
        `2021-09-01.y` = c(3000, 0, 10000, 0, 0,
                           0, 0, 0, 0, 0),
        `2021-10-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
                           5000),
        `2021-11-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        `2021-12-01.y` = c(0,
                           0, 0, 0, 0, 0, 0, 0, 5000, 0),
        `2022-01-01.y` = c(0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0),
        `2022-02-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0),
        `2022-03-01.y` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        `2022-04-01.y` = c(0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0),
        `2022-05-01.y` = c(0, 0, 0, 5000, 0,
                           0, 0, 0, 0, 0)
      ),
      row.names = c(NA, -10L),
      class = c("data.table",
                "data.frame")
    )
    
    # Primary and secondary parameter definition ------------------------------
    
    df_names =
      df |>
      names()
    
    df_names
    
    first_month =
      df_names |>
      first() |>
      str_remove(".y") |>
      ymd()
    
    amount_of_months_for_analysis =
      length(df_names)
    
    last_month = first_month + months(amount_of_months_for_analysis - 1)
    
    months_for_analysis =
      first_month +
      (0:(amount_of_months_for_analysis - 1) |>
         months())
    
    # Columns renaming --------------------------------------------------------
    
    new_names = paste0("reload_on_previous_month_",
                       (amount_of_months_for_analysis - 1):0)
    names(df) = new_names
    
    # Determining month before last reload ------------------------------------
    
    months_before_last_reload =
      df |>
      mutate_all(
        .funs = function(column) {
          is_reload_0 = if_else(condition = column == 0,
                                true = T,
                                false = F)
          return(is_reload_0) # I know this return is not necessary, but I like to explicitly write it to avoid future problems if the function evolves and to very clearly state what's my output
        }
      ) |>
      rev() |> # I reversed and transposed to more easily run column calculations
      t() |>
      as.data.frame() |>
      mutate_all(
        .funs = function(column) {
          is_reload_0_up_until_N_months_before = cumall(column)
          return(is_reload_0_up_until_N_months_before)
        }
      ) |>
      summarise_all(
        .funs = function(column) {
          months_before_last_reload = sum(column)
          return(months_before_last_reload)
        }
      ) |>
      t() |>
      as.data.frame() |>
      rename(months_before_last_reload = V1) |>
      mutate(
        months_before_last_reload =
          ifelse(
            test = months_before_last_reload == amount_of_months_for_analysis,
            yes = paste0("No reload in ", months_before_last_reload, " months"),
            no = months_before_last_reload
          )
      )
    
    # Adding new column to original dataset -----------------------------------
    
    df_output =
      df |>
      bind_cols(months_before_last_reload)
    
    # Print output ------------------------------------------------------------
    
    print(df_output)
    
    

    Extra tip

    Note that you can also use the "bang bang" operator (!!) in conjunction with rlang::sym() (or with quo() and enquo()) to easily parametrize a dplyr pipeline.

    More on that here:

    Advanced R: Quasiquotation - Hadley Wickham

    Bang Bang – How to program with dplyr