Search code examples
rdplyrtidyversedata-cleaningtidy

Split and recombine data: Is there an efficient way to do this


I have a very large dataset that i measured on an instrument with repeated, unequal headers. i'm trying to first subset every two rows i.e header and data then recombine using bind_row() from the tidyverse package. I've looked at the questions like use-first-row-data-as-column-names-in-r and its totally different from what i want to do.

here is a dput of my sample data

structure(list(X1 = c("File #", "76", "File #", "77", "File #", 
"78", "File #", "79", "File #", "80"), X2 = c("DateTime", "3/8/2020 18:08", 
"DateTime", "3/8/20 18:08", "DateTime", "3/8/2020 18:08", "DateTime", 
"3/8/2020 18:08", "DateTime", "3/8/2020 18:08"), X3 = c("Operator", 
"Supervisor", "Operator", "Supervisor", "Operator", "Supervisor", 
"Operator", "Supervisor", "Operator", "Supervisor"), X4 = c("Name", 
NA, "Name", NA, "Name", NA, "Name", NA, "Name", NA), X5 = c("ID", 
"0", "ID", "0", "ID", "0", "ID", "0", "ID", "0"), X6 = c("Sample", 
"Local 2", "Sample", "Local 2", "Sample", "Local 2", "Sample", 
"Local 2", "Sample", "Local 2"), X7 = c("Project", "XRF DATA", 
"Project", "XRF DATA", "Project", "XRF DATA", "Project", "XRF DATA", 
"Project", "XRF DATA"), X8 = c("Application", "GeoExploration", 
"Application", "GeoExploration", "Application", "GeoExploration", 
"Application", "GeoExploration", "Application", "GeoExploration"
), X9 = c("Method", "Oxide3phase", "Method", "Oxide3phase", "Method", 
"Oxide3phase", "Method", "Oxide3phase", "Method", "Oxide3phase"
), X10 = c("ElapsedTime", "0", "ElapsedTime", "0", "ElapsedTime", 
"0", "ElapsedTime", "0", "ElapsedTime", "0"), X11 = c("Alloy 1", 
NA, "Alloy 1", NA, "Alloy 1", NA, "Alloy 1", NA, "Alloy 1", NA
), X12 = c("Match Qual 1", "0", "Match Qual 1", "0", "Match Qual 1", 
"0", "Match Qual 1", "0", "Match Qual 1", "0"), X13 = c("Alloy 2", 
NA, "Alloy 2", NA, "Alloy 2", NA, "Alloy 2", NA, "Alloy 2", NA
), X14 = c("Match Qual 2", "0", "Match Qual 2", "0", "Match Qual 2", 
"0", "Match Qual 2", "0", "Match Qual 2", "0"), X15 = c("Alloy 3", 
NA, "Alloy 3", NA, "Alloy 3", NA, "Alloy 3", NA, "Alloy 3", NA
), X16 = c("Match Qual 3", "0", "Match Qual 3", "0", "Match Qual 3", 
"0", "Match Qual 3", "0", "Match Qual 3", "0"), X17 = c("MgO", 
"0.6579", "MgO", "0.6579", "MgO", "0.6579", "MgO", "0.6579", 
"MgO", "0.6579"), X18 = c("MgO Err", "0.7778", "MgO Err", "0.7778", 
"MgO Err", "0.7778", "MgO Err", "0.7778", "MgO Err", "0.7778"
), X19 = c("Al2O3", "11.1503", "Al2O3", "11.1503", "Al2O3", "11.1503", 
"Al2O3", "11.1503", "Al2O3", "11.1503"), X20 = c("Al2O3 Err", 
"0.5363", "Al2O3 Err", "0.5363", "Al2O3 Err", "0.5363", "Al2O3 Err", 
"0.5363", "Al2O3 Err", "0.5363"), X21 = c("SiO2", "67.523", "SiO2", 
"67.523", "SiO2", "67.523", "SiO2", "67.523", "SiO2", "67.523"
), X22 = c("SiO2 Err", "0.8529", "SiO2 Err", "0.8529", "SiO2 Err", 
"0.8529", "SiO2 Err", "0.8529", "SiO2 Err", "0.8529"), X23 = c("P", 
"0.0035", "P", "0.0035", "P", "0.0035", "P", "0.0035", "P", "0.0035"
), X24 = c("P Err", "0.0177", "P Err", "0.0177", "P Err", "0.0177", 
"P Err", "0.0177", "P Err", "0.0177"), X25 = c("S", "0.11", "S", 
"0.11", "S", "0.11", "S", "0.11", "S", "0.11"), X26 = c("S Err", 
"0.0195", "S Err", "0.0195", "S Err", "0.0195", "S Err", "0.0195", 
"S Err", "0.0195"), X27 = c("Cl", "0", "Cl", "0", "Cl", "0", 
"Cl", "0", "Cl", "0"), X28 = c("Cl Err", "0.0213", "Cl Err", 
"0.0213", "Cl Err", "0.0213", "Cl Err", "0.0213", "Cl Err", "0.0213"
), X29 = c("K2O", "4.6562", "K2O", "4.6562", "K2O", "4.6562", 
"K2O", "4.6562", "K2O", "4.6562"), X30 = c("K2O Err", "0.0516", 
"K2O Err", "0.0516", "K2O Err", "0.0516", "K2O Err", "0.0516", 
"K2O Err", "0.0516"), X31 = c("Ca", "0.6011", "Ca", "0.6011", 
"Ca", "0.6011", "Ca", "0.6011", "Ca", "0.6011"), X32 = c("Ca Err", 
"0.0136", "Ca Err", "0.0136", "Ca Err", "0.0136", "Ca Err", "0.0136", 
"Ca Err", "0.0136"), X33 = c("Ti", "0.1598", "Ti", "0.1598", 
"Ti", "0.1598", "Ti", "0.1598", "Ti", "0.1598"), X34 = c("Ti Err", 
"0.0177", "Ti Err", "0.0177", "Ti Err", "0.0177", "Ti Err", "0.0177", 
"Ti Err", "0.0177"), X35 = c("V", "0.0064", "V", "0.0064", "V", 
"0.0064", "V", "0.0064", "V", "0.0064"), X36 = c("V Err", "0.0088", 
"V Err", "0.0088", "V Err", "0.0088", "V Err", "0.0088", "V Err", 
"0.0088"), X37 = c("Cr", "0.0042", "Cr", "0.0042", "Cr", "0.0042", 
"Cr", "0.0042", "Cr", "0.0042"), X38 = c("Cr Err", "0.0033", 
"Cr Err", "0.0033", "Cr Err", "0.0033", "Cr Err", "0.0033", "Cr Err", 
"0.0033"), X39 = c("Mn", "0.0976", "Mn", "0.0976", "Mn", "0.0976", 
"Mn", "0.0976", "Mn", "0.0976"), X40 = c("Mn Err", "0.0063", 
"Mn Err", "0.0063", "Mn Err", "0.0063", "Mn Err", "0.0063", "Mn Err", 
"0.0063"), X41 = c("Fe", "1.5828", "Fe", "1.5828", "Fe", "1.5828", 
"Fe", "1.5828", "Fe", "1.5828"), X42 = c("Fe Err", "0.0186", 
"Fe Err", "0.0186", "Fe Err", "0.0186", "Fe Err", "0.0186", "Fe Err", 
"0.0186"), X43 = c("Co", "0.0042", "Co", "0.0042", "Co", "0.0042", 
"Co", "0.0042", "Co", "0.0042"), X44 = c("Co Err", "0.0032", 
"Co Err", "0.0032", "Co Err", "0.0032", "Co Err", "0.0032", "Co Err", 
"0.0032"), X45 = c("Ni", "0.0052", "Ni", "0.0052", "Ni", "0.0052", 
"Ni", "0.0052", "Ni", "0.0052"), X46 = c("Ni Err", "0.0012", 
"Ni Err", "0.0012", "Ni Err", "0.0012", "Ni Err", "0.0012", "Ni Err", 
"0.0012"), X47 = c("Cu", "0.0218", "Cu", "0.0218", "Cu", "0.0218", 
"Cu", "0.0218", "Cu", "0.0218"), X48 = c("Cu Err", "0.0015", 
"Cu Err", "0.0015", "Cu Err", "0.0015", "Cu Err", "0.0015", "Cu Err", 
"0.0015"), X49 = c("Zn", "0.0709", "Zn", "0.0709", "Zn", "0.0709", 
"Zn", "0.0709", "Zn", "0.0709"), X50 = c("Zn Err", "0.0024", 
"Zn Err", "0.0024", "Zn Err", "0.0024", "Zn Err", "0.0024", "Zn Err", 
"0.0024"), X51 = c("Ga", "0.0006", "Ga", "0.0006", "Ga", "0.0006", 
"Ga", "0.0006", "Ga", "0.0006"), X52 = c("Ga Err", "0.0009", 
"Ga Err", "0.0009", "Ga Err", "0.0009", "Ga Err", "0.0009", "Ga Err", 
"0.0009"), X53 = c("As", "0.0041", "As", "0.0041", "As", "0.0041", 
"As", "0.0041", "As", "0.0041"), X54 = c("As Err", "0.0019", 
"As Err", "0.0019", "As Err", "0.0019", "As Err", "0.0019", "As Err", 
"0.0019"), X55 = c("Se", "0.0001", "Se", "0.0001", "Se", "0.0001", 
"Se", "0.0001", "Se", "0.0001"), X56 = c("Se Err", "0.0002", 
"Se Err", "0.0002", "Se Err", "0.0002", "Se Err", "0.0002", "Se Err", 
"0.0002"), X57 = c("Rb", "0.0147", "Rb", "0.0147", "Rb", "0.0147", 
"Rb", "0.0147", "Rb", "0.0147"), X58 = c("Rb Err", "0.0008", 
"Rb Err", "0.0008", "Rb Err", "0.0008", "Rb Err", "0.0008", "Rb Err", 
"0.0008"), X59 = c("Sr", "0.0138", "Sr", "0.0138", "Sr", "0.0138", 
"Sr", "0.0138", "Sr", "0.0138"), X60 = c("Sr Err", "0.0007", 
"Sr Err", "0.0007", "Sr Err", "0.0007", "Sr Err", "0.0007", "Sr Err", 
"0.0007"), X61 = c("Y", "0.0032", "Y", "0.0032", "Y", "0.0032", 
"Y", "0.0032", "Y", "0.0032"), X62 = c("Y Err", "0.0005", "Y Err", 
"0.0005", "Y Err", "0.0005", "Y Err", "0.0005", "Y Err", "0.0005"
), X63 = c("Zr", "0.0277", "Zr", "0.0277", "Zr", "0.0277", "Zr", 
"0.0277", "Zr", "0.0277"), X64 = c("Zr Err", "0.0011", "Zr Err", 
"0.0011", "Zr Err", "0.0011", "Zr Err", "0.0011", "Zr Err", "0.0011"
), X65 = c("Nb", "0.002", "Nb", "0.002", "Nb", "0.002", "Nb", 
"0.002", "Nb", "0.002"), X66 = c("Nb Err", "0.0005", "Nb Err", 
"0.0005", "Nb Err", "0.0005", "Nb Err", "0.0005", "Nb Err", "0.0005"
), X67 = c("Mo", "0", "Mo", "0", "Mo", "0", "Mo", "0", "Mo", 
"0"), X68 = c("Mo Err", "0.0009", "Mo Err", "0.0009", "Mo Err", 
"0.0009", "Mo Err", "0.0009", "Mo Err", "0.0009"), X69 = c("Cd", 
"0", "Cd", "0", "Cd", "0", "Cd", "0", "Cd", "0"), X70 = c("Cd Err", 
"0.0009", "Cd Err", "0.0009", "Cd Err", "0.0009", "Cd Err", "0.0009", 
"Cd Err", "0.0009"), X71 = c("Sn", "0", "Sn", "0", "Sn", "0", 
"Sn", "0", "Sn", "0"), X72 = c("Sn Err", "0.0054", "Sn Err", 
"0.0054", "Sn Err", "0.0054", "Sn Err", "0.0054", "Sn Err", "0.0054"
), X73 = c("Sb", "0.0144", "Sb", "0.0144", "Sb", "0.0144", "Sb", 
"0.0144", "Sb", "0.0144"), X74 = c("Sb Err", "0.0044", "Sb Err", 
"0.0044", "Sb Err", "0.0044", "Sb Err", "0.0044", "Sb Err", "0.0044"
), X75 = c("Te", "0.0002", "Te", "0.0002", "Te", "0.0002", "Te", 
"0.0002", "Te", "0.0002"), X76 = c("Te Err", "0.0005", "Te Err", 
"0.0005", "Te Err", "0.0005", "Te Err", "0.0005", "Te Err", "0.0005"
), X77 = c("Ba", "0.1185", "Ba", "0.1185", "Ba", "0.1185", "Ba", 
"0.1185", "Ba", "0.1185"), X78 = c("Ba Err", "0.0142", "Ba Err", 
"0.0142", "Ba Err", "0.0142", "Ba Err", "0.0142", "Ba Err", "0.0142"
), X79 = c("La", "0", "La", "0", "La", "0", "La", "0", "La", 
"0"), X80 = c("La Err", "0.0156", "La Err", "0.0156", "La Err", 
"0.0156", "La Err", "0.0156", "La Err", "0.0156"), X81 = c("Ce", 
"0.002", "Ce", "0.002", "Ce", "0.002", "Ce", "0.002", "Ce", "0.002"
), X82 = c("Ce Err", "0.022", "Ce Err", "0.022", "Ce Err", "0.022", 
"Ce Err", "0.022", "Ce Err", "0.022"), X83 = c("Hf", "0", "Hf", 
"0", "Hf", "0", "Hf", "0", "Hf", "0"), X84 = c("Hf Err", "0.0026", 
"Hf Err", "0.0026", "Hf Err", "0.0026", "Hf Err", "0.0026", "Hf Err", 
"0.0026"), X85 = c("Ta", "0", "Ta", "0", "Ta", "0", "Ta", "0", 
"Ta", "0"), X86 = c("Ta Err", "0.0021", "Ta Err", "0.0021", "Ta Err", 
"0.0021", "Ta Err", "0.0021", "Ta Err", "0.0021"), X87 = c("W", 
"0", "W", "0", "W", "0", "W", "0", "W", "0"), X88 = c("W Err", 
"0.0073", "W Err", "0.0073", "W Err", "0.0073", "W Err", "0.0073", 
"W Err", "0.0073"), X89 = c("Pt", "0", "Pt", "0", "Pt", "0", 
"Pt", "0", "Pt", "0"), X90 = c("Pt Err", "0.0014", "Pt Err", 
"0.0014", "Pt Err", "0.0014", "Pt Err", "0.0014", "Pt Err", "0.0014"
), X91 = c("Hg", "0.0002", "Hg", "0.0002", "Bi", "0.0001", "Hg", 
"0.0002", "Hg", "0.0002"), X92 = c("Hg Err", "0.0008", "Hg Err", 
"0.0008", "Bi Err", "0.0012", "Hg Err", "0.0008", "Hg Err", "0.0008"
), X93 = c("Tl", "0", "Tl", "0", "Th", "0.0005", "Tl", "0", "Tl", 
"0"), X94 = c("Tl Err", "0.0023", "Tl Err", "0.0023", "Th Err", 
"0.0012", "Tl Err", "0.0023", "Tl Err", "0.0023"), X95 = c("Pb", 
"0.0834", "Pb", "0.0834", "U", "0", "Pb", "0.0834", "Pb", "0.0834"
), X96 = c("Pb Err", "0.0034", "Pb Err", "0.0034", "U Err", "0.0035", 
"Pb Err", "0.0034", "Pb Err", "0.0034"), X97 = c("Bi", "0.0001", 
"Bi", "0.0001", "Cal Check", "Passed", "Th", "0.0005", "Bi", 
"0.0001"), X98 = c("Bi Err", "0.0012", "Bi Err", "0.0012", NA, 
NA, "Th Err", "0.0012", "Bi Err", "0"), X99 = c("Th", "0.0005", 
"Cal Check", "Passed", NA, NA, "U", "0", "Th", "0.0005"), X100 = c("Th Err", 
"0.0012", NA, NA, NA, NA, "U Err", "0.0035", "Th Err", "0.0012"
), X101 = c("U", "0", NA, NA, NA, NA, "Cal Check", "Passed", 
"U", "0"), X102 = c("U Err", "0.0035", NA, NA, NA, NA, NA, NA, 
"U Err", "0.0035"), X103 = c("Cal Check", "Passed", NA, NA, NA, 
NA, NA, NA, "Cal Check", "Passed")), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -10L))

You'll notice each measurement has different number of columns which adds to the complexity... Also, i will like to get rid of any column that ends with "Err" e.g. "Br Err". Kindly note that i've intentionally duplicated values just to be able to have sufficient data to explain my problem. i was planning to create a function so i tried sample_data <- read_csv("sample_data.csv", col_names = FALSE) then

df_1<- sample_data%>%slice(1:2)%>%
janitor::row_to_names(1)%>% 
select(-ends_with("Err"))

for the first two rows, the preceding rows i.e gave error

Error: Columns 100, 101, 102, 103 cannot have NA as name

when i tried

df_2<- sample_data%>%
slice(3:4)%>%
janitor::row_to_names(1)%>%
 select(-ends_with("Err"))

probably due to the way the data was read-in.

What i want at the end is to use `bind_rows() to combine everything

packages used

library(janitor)
library(tidyverse)
library(readxl)
library(lubridate)

Solution

  • I think this fits what you're trying to do. Count off your rows by twos, so each pair represents an observation where row 1 is measures and row 2 is values. Then split.

    library(dplyr)
    library(tidyr)
    library(purrr)
    
    by_pair <- dat %>%
      mutate(obvs = ceiling(seq_along(X1) / 2)) %>%
      split(.$obvs)
    
    by_pair[[2]]
    #> # A tibble: 2 x 104
    #>   X1     X2    X3    X4    X5    X6    X7    X8    X9    X10   X11   X12   X13  
    #>   <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    #> 1 File # Date… Oper… Name  ID    Samp… Proj… Appl… Meth… Elap… Allo… Matc… Allo…
    #> 2 77     3/8/… Supe… <NA>  0     Loca… XRF … GeoE… Oxid… 0     <NA>  0     <NA> 
    # ... truncated
    

    I converted the split tibbles back to base data.frame objects so that subsetting a single row would get a vector—makes it easier to then subset values based on corresponding names being NA. Drop rows whose measures end in "Err," then make sure every observation has the full set of measures with complete, filling in NA where that measurement wasn't made. Then reshape back to wide.

    by_pair %>%
      map(select, -obvs) %>%
      map(as.data.frame) %>%
      map_dfr(function(df) {
        df_names <- df[1, ]
        df_vals <- df[2, ][!is.na(df_names)]
        tibble(measure = df_names[!is.na(df_names)], value = df_vals)
      }, .id = "obvs") %>%
      filter(!grepl("\\bErr$", measure)) %>%
      mutate(measure = forcats::as_factor(measure)) %>%
      complete(obvs, measure) %>%
      pivot_wider(names_from = measure)
    #> # A tibble: 5 x 61
    #>   obvs  `File #` DateTime Operator Name  ID    Sample Project Application Method
    #>   <chr> <chr>    <chr>    <chr>    <chr> <chr> <chr>  <chr>   <chr>       <chr> 
    #> 1 1     76       3/8/202… Supervi… <NA>  0     Local… XRF DA… GeoExplora… Oxide…
    #> 2 2     77       3/8/20 … Supervi… <NA>  0     Local… XRF DA… GeoExplora… Oxide…
    #> 3 3     78       3/8/202… Supervi… <NA>  0     Local… XRF DA… GeoExplora… Oxide…
    #> 4 4     79       3/8/202… Supervi… <NA>  0     Local… XRF DA… GeoExplora… Oxide…
    #> 5 5     80       3/8/202… Supervi… <NA>  0     Local… XRF DA… GeoExplora… Oxide…
    #> # … with 51 more variables: ElapsedTime <chr>, `Alloy 1` <chr>, `Match Qual
    #> #   1` <chr>, `Alloy 2` <chr>, `Match Qual 2` <chr>, `Alloy 3` <chr>, `Match
    #> #   Qual 3` <chr>, MgO <chr>, Al2O3 <chr>, SiO2 <chr>, P <chr>, S <chr>,
    #> #   Cl <chr>, K2O <chr>, Ca <chr>, Ti <chr>, V <chr>, Cr <chr>, Mn <chr>,
    #> #   Fe <chr>, Co <chr>, Ni <chr>, Cu <chr>, Zn <chr>, Ga <chr>, As <chr>,
    #> #   Se <chr>, Rb <chr>, Sr <chr>, Y <chr>, Zr <chr>, Nb <chr>, Mo <chr>,
    #> #   Cd <chr>, Sn <chr>, Sb <chr>, Te <chr>, Ba <chr>, La <chr>, Ce <chr>,
    #> #   Hf <chr>, Ta <chr>, W <chr>, Pt <chr>, Hg <chr>, Tl <chr>, Pb <chr>,
    #> #   Bi <chr>, Th <chr>, U <chr>, `Cal Check` <chr>