Each row is wrongly allocated in his respective column. Is there a way to sort correctly and if there is not value, NA is included. Thanks in advance
df <- data.frame(
price = c("1€", "1€", "2022-03-04", "4hr", "Iberia"),
duration = c("2€", "1hr", "2022-01-02", "4hr", "Eurostart"),
airline = c("3€", "1hr", "2022-01-03", "5min", "ITA"),
dates = c("4€", "1hr 5min", "2022-01-03", "Air france", "Lufthansa")
)
desired_output <- data.frame(
price = c("1€", "1€", "2€", "3€", "4€", "NA"),
duration = c("1hr", "1hr", "1hr 5min", "4hr", "4hr", "5min"),
airline = c("Air france", "Iberia", "Eurostart", "ITA", "Lufthansa", "NA"),
dates = c("2022-03-04", "2022-01-02", "2022-01-03", "2022-01-03", "NA", "NA")
)
Personally I would go back to start and try to figure out why the data is read in a messy format. Be it as it is here is one option to clean your data:
df <- data.frame(
price = c("1€", "1€", "2022-03-04", "4hr", "Iberia"),
duration = c("2€", "1hr", "2022-01-02", "4hr", "Eurostart"),
airline = c("3€", "1hr", "2022-01-03", "5min", "ITA"),
dates = c("4€", "1hr 5min", "2022-01-03", "Air france", "Lufthansa")
)
df_mat <- as.matrix(df)
is_pattern <- list(
price = grepl("\\€$", df_mat),
duration = grepl("(hr|min)", df_mat),
date = grepl("^\\d{4}-\\d{2}-\\d{2}$", df_mat)
)
is_pattern$airline <- !is_pattern$price & !is_pattern$duration & !is_pattern$date
vec_list <- lapply(is_pattern, function(x) df_mat[x])
max_len <- max(lengths(vec_list))
vec_list <- lapply(vec_list, function(x) c(x, rep(NA, max_len - length(x))))
as.data.frame(vec_list)
#> price duration date airline
#> 1 1€ 4hr 2022-03-04 Iberia
#> 2 1€ 1hr 2022-01-02 Eurostart
#> 3 2€ 4hr 2022-01-03 ITA
#> 4 3€ 1hr 2022-01-03 Air france
#> 5 4€ 5min <NA> Lufthansa
#> 6 <NA> 1hr 5min <NA> <NA>