Search code examples
rdataframematrixrows

Adding rows with changing variable values in R


I have the following extract of my dataset about the occupancy of a football match:

example <- data.frame(Date <- c("2019-03-21", "2019-03-30", "2019-04-07", 
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21",  "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30",
                                "2019-03-21", "2019-03-30",
                                "2019-03-21", "2019-03-30",
                                "2019-03-21"),
                         Block <- c("43L","43L", "43L", "15B", "15B", "15B", "43L", "43L",
                                    "15B", "15B", "15B",
                                    "15B", "15B", "15B",
                                    "15B", "15B", "15B",
                                    "15B", "15B", "15B",
                                    "15B", "15B", "15B",
                                    "15B", "15B",
                                    "15B", "15B",
                                    "15B", "15B",
                                    "15B"),
                         Preis <- as.numeric(c("24", "35", "30", "35", "45", 
                                    "40", "26", "30",
                                    "35", "45", "40",
                                    "34", "43", "42",
                                    "35", "42", "45",
                                    "36", "45", "43",
                                    "36", "43", "40",
                                    "35", "41",
                                    "32", "42",
                                    "30", "42",
                                    "35")),
                         Max  <- c("3", "3", "3", "10", "10","10","3", "3",
                                   "10", "10","10",
                                   "10", "10","10",
                                   "10", "10","10",
                                   "10", "10","10",
                                   "10", "10","10",
                                   "10", "10",
                                   "10", "10",
                                   "10", "10",
                                   "10"),
                         Actual <- c("2", "1", "2", "10", "9", "6","2", "2",
                                     "10", "9", "6",
                                     "10", "9", "6",
                                     "10", "9", "6",
                                     "10", "9", "6",
                                     "10", "9", "6",
                                     "10", "9",
                                     "10", "9", 
                                     "10", "9",
                                     "10"),
                         Temperatur <- c("15", "20", "18","15", "20", "18", "15",  "18",
                                         "15", "20", "18",
                                         "15", "20", "18",
                                         "15", "20", "18",
                                         "15", "20", "18",
                                         "15", "20", "18",
                                         "15", "20", 
                                         "15", "20", 
                                         "15", "20", 
                                         "15"),
                      Placesold <- c("1", "1", "1", "1", "1","1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", 
                                     "1", "1",
                                     "1", "1",
                                     "1") )
colnames(example) <- c("Date", "Block", "Price", "Max", "Actual", "Temprature", "Placesold")

In reality, the dataset contains over 100 blocks and 46 different dates. If you take a closer look at the data, you can see that different numbers of seats are sold out in block 15B and 43L on different days.

table(example$Date, example$Block)
table(example$Placesold)

           15B 43L
  2019-03-21  10   2
  2019-03-30   9   1
  2019-4-07    6   2
> table(example$Placesold)

 1 
30 

My goal is to add the seats that were not sold to the data set. The variable Placesold should be 0 instead of 1. In addition, the average price of the sold tickets should be used instead of the price (without 0).

To clarify my goal, I have added the missing rows for the reduced data set.

result <- data.frame(Date <- c("2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07"),
                      Block <- c("43L","43L", "43L", 
                                 "15B", "15B", "15B", 
                                 "43L", "43L","43L",
                                 "15B", "15B", "15B",
                                 "43L", "43L","43L",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B"),
                      Preis <- c("24", "35", "30", 
                                 "35", "45", "40", 
                                 "26", "35","30",
                                 "35", "45", "40",
                                 "25", "35","30",
                                 "34", "43", "42",
                                 "35", "42", "45",
                                 "36", "45", "43",
                                 "36", "43", "40",
                                 "35", "41", "41.67",
                                 "32", "42", "41.67",
                                 "30", "42", "41.67",
                                 "35","43.11","41.67"), 
                      Max  <- c("3", "3", "3", "10", "10","10",
                                "3", "3", "3",
                                "10", "10","10",
                                "3", "3", "3",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10"), 
                      Actual <- c("2", "1", "2",
                                  "10", "9", "6",
                                   "2", "1","2",
                                  "10", "9", "6",
                                   "2", "1","2",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6"),
                      Temperatur <- c("15", "20", "18",
                                      "15", "20", "18", 
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18"),
                      Placesold <- c("1", "1", "1", "1", "1","1", 
                                     "1", "0", "1",
                                     "1", "1", "1",
                                     "0", "0", "0",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "0",
                                     "1", "1", "0",
                                     "1", "1", "0",
                                     "1", "0", "0") )

colnames(result) <- c("Date", "Block", "Price", "Max", "Actual", "Temprature", "Placesold")

The results of the blocks and the data as well as the occurrence of the variable "Placesold" look like this:

table(result$Date, result$Block)
table(result$Placesold)

           15B 43L
  2019-03-21  10   3
  2019-03-30  10   3
  2019-4-07   10   3
> table(result$Placesold)

 0  1 
 9 30 

My first thought was to create a matrix with more rows, but to be honest I don't really know how. I hope you can help me.

Thank you very much.


Solution

  • I use dplyr functions and base::merge. merge can perform cross join between data frames, vectors and other types. Construction of each date and block pair - it includes unsold blocks of a date:

    # ordered, unique vector of dates
    dates <- example$Date %>% unique() %>% sort()
    # ordered, unique vector of blocks
    blocks <- example$Block %>% unique() %>% sort()
    # insert dummy block to demonstrate effects of missing blocks
    blocks <- c("11B", blocks)
    # cross join of dates and blocks: each date paired with each block
    # (it results a data.frame)
    eachDateBlock <- merge(dates, blocks, by = NULL)
    # merge generate x and y as names for the resulted data.frame
    # I rename them as a preparation for left_join
    eachDateBlock <- eachDateBlock %>% rename(Date = x, Block = y)
    
    # rows from 'eachDateBlock' with matchig row in 'example' get values of variables,
    # otherwise they filled by NAs
    extendedData <- eachDateBlock %>%
      left_join(example, by = c("Date" = "Date", "Block" = "Block"))
    
    # NOTE: before avgPrice you need something similar conversion - I ignore
    # other numeric columns here
    #example$Price <- as.double(example$Price)
    #example$Placesold <- as.double(example$Placesold)
    
    
    # Overwrite NAs in rows of supplied unsold blocks
    avgPrice <- mean(example$Price)
    result <- extendedData %>% mutate(
      Price = if_else(is.na(Price), avgPrice, Price),
      Placesold = if_else(is.na(Placesold), 0, Placesold)
    ) %>% arrange(Date)
    
    > table(result$Date, result$Block)
    
                 11B 15B 43L
      2019-03-21   1  10   2
      2019-03-30   1   9   1
      2019-04-07   1   6   2
    
    > table(result$Placesold)  
    
     0  1 
     3 30 
    
    > result
             Date Block    Price  Max Actual Temprature Placesold
    1  2019-03-21   11B 37.53333 <NA>   <NA>       <NA>         0
    .
    .
    .
    12 2019-03-21   43L 24.00000    3      2         15         1
    13 2019-03-21   43L 26.00000    3      2         15         1
    14 2019-03-30   11B 37.53333 <NA>   <NA>       <NA>         0
    15 2019-03-30   15B 45.00000   10      9         20         1
    .
    .
    .
    24 2019-03-30   43L 35.00000    3      1         20         1
    25 2019-04-07   11B 37.53333 <NA>   <NA>       <NA>         0
    .
    .
    .
    31 2019-04-07   15B 40.00000   10      6         18         1
    32 2019-04-07   43L 30.00000    3      2         18         1
    33 2019-04-07   43L 30.00000    3      2         18         1