I have a very large dataset that i measured on an instrument with repeated, unequal headers. i'm trying to first subset every two rows i.e header and data then recombine using bind_row()
from the tidyverse package. I've looked at the questions like use-first-row-data-as-column-names-in-r and its totally different from what i want to do.
here is a dput of my sample data
structure(list(X1 = c("File #", "76", "File #", "77", "File #",
"78", "File #", "79", "File #", "80"), X2 = c("DateTime", "3/8/2020 18:08",
"DateTime", "3/8/20 18:08", "DateTime", "3/8/2020 18:08", "DateTime",
"3/8/2020 18:08", "DateTime", "3/8/2020 18:08"), X3 = c("Operator",
"Supervisor", "Operator", "Supervisor", "Operator", "Supervisor",
"Operator", "Supervisor", "Operator", "Supervisor"), X4 = c("Name",
NA, "Name", NA, "Name", NA, "Name", NA, "Name", NA), X5 = c("ID",
"0", "ID", "0", "ID", "0", "ID", "0", "ID", "0"), X6 = c("Sample",
"Local 2", "Sample", "Local 2", "Sample", "Local 2", "Sample",
"Local 2", "Sample", "Local 2"), X7 = c("Project", "XRF DATA",
"Project", "XRF DATA", "Project", "XRF DATA", "Project", "XRF DATA",
"Project", "XRF DATA"), X8 = c("Application", "GeoExploration",
"Application", "GeoExploration", "Application", "GeoExploration",
"Application", "GeoExploration", "Application", "GeoExploration"
), X9 = c("Method", "Oxide3phase", "Method", "Oxide3phase", "Method",
"Oxide3phase", "Method", "Oxide3phase", "Method", "Oxide3phase"
), X10 = c("ElapsedTime", "0", "ElapsedTime", "0", "ElapsedTime",
"0", "ElapsedTime", "0", "ElapsedTime", "0"), X11 = c("Alloy 1",
NA, "Alloy 1", NA, "Alloy 1", NA, "Alloy 1", NA, "Alloy 1", NA
), X12 = c("Match Qual 1", "0", "Match Qual 1", "0", "Match Qual 1",
"0", "Match Qual 1", "0", "Match Qual 1", "0"), X13 = c("Alloy 2",
NA, "Alloy 2", NA, "Alloy 2", NA, "Alloy 2", NA, "Alloy 2", NA
), X14 = c("Match Qual 2", "0", "Match Qual 2", "0", "Match Qual 2",
"0", "Match Qual 2", "0", "Match Qual 2", "0"), X15 = c("Alloy 3",
NA, "Alloy 3", NA, "Alloy 3", NA, "Alloy 3", NA, "Alloy 3", NA
), X16 = c("Match Qual 3", "0", "Match Qual 3", "0", "Match Qual 3",
"0", "Match Qual 3", "0", "Match Qual 3", "0"), X17 = c("MgO",
"0.6579", "MgO", "0.6579", "MgO", "0.6579", "MgO", "0.6579",
"MgO", "0.6579"), X18 = c("MgO Err", "0.7778", "MgO Err", "0.7778",
"MgO Err", "0.7778", "MgO Err", "0.7778", "MgO Err", "0.7778"
), X19 = c("Al2O3", "11.1503", "Al2O3", "11.1503", "Al2O3", "11.1503",
"Al2O3", "11.1503", "Al2O3", "11.1503"), X20 = c("Al2O3 Err",
"0.5363", "Al2O3 Err", "0.5363", "Al2O3 Err", "0.5363", "Al2O3 Err",
"0.5363", "Al2O3 Err", "0.5363"), X21 = c("SiO2", "67.523", "SiO2",
"67.523", "SiO2", "67.523", "SiO2", "67.523", "SiO2", "67.523"
), X22 = c("SiO2 Err", "0.8529", "SiO2 Err", "0.8529", "SiO2 Err",
"0.8529", "SiO2 Err", "0.8529", "SiO2 Err", "0.8529"), X23 = c("P",
"0.0035", "P", "0.0035", "P", "0.0035", "P", "0.0035", "P", "0.0035"
), X24 = c("P Err", "0.0177", "P Err", "0.0177", "P Err", "0.0177",
"P Err", "0.0177", "P Err", "0.0177"), X25 = c("S", "0.11", "S",
"0.11", "S", "0.11", "S", "0.11", "S", "0.11"), X26 = c("S Err",
"0.0195", "S Err", "0.0195", "S Err", "0.0195", "S Err", "0.0195",
"S Err", "0.0195"), X27 = c("Cl", "0", "Cl", "0", "Cl", "0",
"Cl", "0", "Cl", "0"), X28 = c("Cl Err", "0.0213", "Cl Err",
"0.0213", "Cl Err", "0.0213", "Cl Err", "0.0213", "Cl Err", "0.0213"
), X29 = c("K2O", "4.6562", "K2O", "4.6562", "K2O", "4.6562",
"K2O", "4.6562", "K2O", "4.6562"), X30 = c("K2O Err", "0.0516",
"K2O Err", "0.0516", "K2O Err", "0.0516", "K2O Err", "0.0516",
"K2O Err", "0.0516"), X31 = c("Ca", "0.6011", "Ca", "0.6011",
"Ca", "0.6011", "Ca", "0.6011", "Ca", "0.6011"), X32 = c("Ca Err",
"0.0136", "Ca Err", "0.0136", "Ca Err", "0.0136", "Ca Err", "0.0136",
"Ca Err", "0.0136"), X33 = c("Ti", "0.1598", "Ti", "0.1598",
"Ti", "0.1598", "Ti", "0.1598", "Ti", "0.1598"), X34 = c("Ti Err",
"0.0177", "Ti Err", "0.0177", "Ti Err", "0.0177", "Ti Err", "0.0177",
"Ti Err", "0.0177"), X35 = c("V", "0.0064", "V", "0.0064", "V",
"0.0064", "V", "0.0064", "V", "0.0064"), X36 = c("V Err", "0.0088",
"V Err", "0.0088", "V Err", "0.0088", "V Err", "0.0088", "V Err",
"0.0088"), X37 = c("Cr", "0.0042", "Cr", "0.0042", "Cr", "0.0042",
"Cr", "0.0042", "Cr", "0.0042"), X38 = c("Cr Err", "0.0033",
"Cr Err", "0.0033", "Cr Err", "0.0033", "Cr Err", "0.0033", "Cr Err",
"0.0033"), X39 = c("Mn", "0.0976", "Mn", "0.0976", "Mn", "0.0976",
"Mn", "0.0976", "Mn", "0.0976"), X40 = c("Mn Err", "0.0063",
"Mn Err", "0.0063", "Mn Err", "0.0063", "Mn Err", "0.0063", "Mn Err",
"0.0063"), X41 = c("Fe", "1.5828", "Fe", "1.5828", "Fe", "1.5828",
"Fe", "1.5828", "Fe", "1.5828"), X42 = c("Fe Err", "0.0186",
"Fe Err", "0.0186", "Fe Err", "0.0186", "Fe Err", "0.0186", "Fe Err",
"0.0186"), X43 = c("Co", "0.0042", "Co", "0.0042", "Co", "0.0042",
"Co", "0.0042", "Co", "0.0042"), X44 = c("Co Err", "0.0032",
"Co Err", "0.0032", "Co Err", "0.0032", "Co Err", "0.0032", "Co Err",
"0.0032"), X45 = c("Ni", "0.0052", "Ni", "0.0052", "Ni", "0.0052",
"Ni", "0.0052", "Ni", "0.0052"), X46 = c("Ni Err", "0.0012",
"Ni Err", "0.0012", "Ni Err", "0.0012", "Ni Err", "0.0012", "Ni Err",
"0.0012"), X47 = c("Cu", "0.0218", "Cu", "0.0218", "Cu", "0.0218",
"Cu", "0.0218", "Cu", "0.0218"), X48 = c("Cu Err", "0.0015",
"Cu Err", "0.0015", "Cu Err", "0.0015", "Cu Err", "0.0015", "Cu Err",
"0.0015"), X49 = c("Zn", "0.0709", "Zn", "0.0709", "Zn", "0.0709",
"Zn", "0.0709", "Zn", "0.0709"), X50 = c("Zn Err", "0.0024",
"Zn Err", "0.0024", "Zn Err", "0.0024", "Zn Err", "0.0024", "Zn Err",
"0.0024"), X51 = c("Ga", "0.0006", "Ga", "0.0006", "Ga", "0.0006",
"Ga", "0.0006", "Ga", "0.0006"), X52 = c("Ga Err", "0.0009",
"Ga Err", "0.0009", "Ga Err", "0.0009", "Ga Err", "0.0009", "Ga Err",
"0.0009"), X53 = c("As", "0.0041", "As", "0.0041", "As", "0.0041",
"As", "0.0041", "As", "0.0041"), X54 = c("As Err", "0.0019",
"As Err", "0.0019", "As Err", "0.0019", "As Err", "0.0019", "As Err",
"0.0019"), X55 = c("Se", "0.0001", "Se", "0.0001", "Se", "0.0001",
"Se", "0.0001", "Se", "0.0001"), X56 = c("Se Err", "0.0002",
"Se Err", "0.0002", "Se Err", "0.0002", "Se Err", "0.0002", "Se Err",
"0.0002"), X57 = c("Rb", "0.0147", "Rb", "0.0147", "Rb", "0.0147",
"Rb", "0.0147", "Rb", "0.0147"), X58 = c("Rb Err", "0.0008",
"Rb Err", "0.0008", "Rb Err", "0.0008", "Rb Err", "0.0008", "Rb Err",
"0.0008"), X59 = c("Sr", "0.0138", "Sr", "0.0138", "Sr", "0.0138",
"Sr", "0.0138", "Sr", "0.0138"), X60 = c("Sr Err", "0.0007",
"Sr Err", "0.0007", "Sr Err", "0.0007", "Sr Err", "0.0007", "Sr Err",
"0.0007"), X61 = c("Y", "0.0032", "Y", "0.0032", "Y", "0.0032",
"Y", "0.0032", "Y", "0.0032"), X62 = c("Y Err", "0.0005", "Y Err",
"0.0005", "Y Err", "0.0005", "Y Err", "0.0005", "Y Err", "0.0005"
), X63 = c("Zr", "0.0277", "Zr", "0.0277", "Zr", "0.0277", "Zr",
"0.0277", "Zr", "0.0277"), X64 = c("Zr Err", "0.0011", "Zr Err",
"0.0011", "Zr Err", "0.0011", "Zr Err", "0.0011", "Zr Err", "0.0011"
), X65 = c("Nb", "0.002", "Nb", "0.002", "Nb", "0.002", "Nb",
"0.002", "Nb", "0.002"), X66 = c("Nb Err", "0.0005", "Nb Err",
"0.0005", "Nb Err", "0.0005", "Nb Err", "0.0005", "Nb Err", "0.0005"
), X67 = c("Mo", "0", "Mo", "0", "Mo", "0", "Mo", "0", "Mo",
"0"), X68 = c("Mo Err", "0.0009", "Mo Err", "0.0009", "Mo Err",
"0.0009", "Mo Err", "0.0009", "Mo Err", "0.0009"), X69 = c("Cd",
"0", "Cd", "0", "Cd", "0", "Cd", "0", "Cd", "0"), X70 = c("Cd Err",
"0.0009", "Cd Err", "0.0009", "Cd Err", "0.0009", "Cd Err", "0.0009",
"Cd Err", "0.0009"), X71 = c("Sn", "0", "Sn", "0", "Sn", "0",
"Sn", "0", "Sn", "0"), X72 = c("Sn Err", "0.0054", "Sn Err",
"0.0054", "Sn Err", "0.0054", "Sn Err", "0.0054", "Sn Err", "0.0054"
), X73 = c("Sb", "0.0144", "Sb", "0.0144", "Sb", "0.0144", "Sb",
"0.0144", "Sb", "0.0144"), X74 = c("Sb Err", "0.0044", "Sb Err",
"0.0044", "Sb Err", "0.0044", "Sb Err", "0.0044", "Sb Err", "0.0044"
), X75 = c("Te", "0.0002", "Te", "0.0002", "Te", "0.0002", "Te",
"0.0002", "Te", "0.0002"), X76 = c("Te Err", "0.0005", "Te Err",
"0.0005", "Te Err", "0.0005", "Te Err", "0.0005", "Te Err", "0.0005"
), X77 = c("Ba", "0.1185", "Ba", "0.1185", "Ba", "0.1185", "Ba",
"0.1185", "Ba", "0.1185"), X78 = c("Ba Err", "0.0142", "Ba Err",
"0.0142", "Ba Err", "0.0142", "Ba Err", "0.0142", "Ba Err", "0.0142"
), X79 = c("La", "0", "La", "0", "La", "0", "La", "0", "La",
"0"), X80 = c("La Err", "0.0156", "La Err", "0.0156", "La Err",
"0.0156", "La Err", "0.0156", "La Err", "0.0156"), X81 = c("Ce",
"0.002", "Ce", "0.002", "Ce", "0.002", "Ce", "0.002", "Ce", "0.002"
), X82 = c("Ce Err", "0.022", "Ce Err", "0.022", "Ce Err", "0.022",
"Ce Err", "0.022", "Ce Err", "0.022"), X83 = c("Hf", "0", "Hf",
"0", "Hf", "0", "Hf", "0", "Hf", "0"), X84 = c("Hf Err", "0.0026",
"Hf Err", "0.0026", "Hf Err", "0.0026", "Hf Err", "0.0026", "Hf Err",
"0.0026"), X85 = c("Ta", "0", "Ta", "0", "Ta", "0", "Ta", "0",
"Ta", "0"), X86 = c("Ta Err", "0.0021", "Ta Err", "0.0021", "Ta Err",
"0.0021", "Ta Err", "0.0021", "Ta Err", "0.0021"), X87 = c("W",
"0", "W", "0", "W", "0", "W", "0", "W", "0"), X88 = c("W Err",
"0.0073", "W Err", "0.0073", "W Err", "0.0073", "W Err", "0.0073",
"W Err", "0.0073"), X89 = c("Pt", "0", "Pt", "0", "Pt", "0",
"Pt", "0", "Pt", "0"), X90 = c("Pt Err", "0.0014", "Pt Err",
"0.0014", "Pt Err", "0.0014", "Pt Err", "0.0014", "Pt Err", "0.0014"
), X91 = c("Hg", "0.0002", "Hg", "0.0002", "Bi", "0.0001", "Hg",
"0.0002", "Hg", "0.0002"), X92 = c("Hg Err", "0.0008", "Hg Err",
"0.0008", "Bi Err", "0.0012", "Hg Err", "0.0008", "Hg Err", "0.0008"
), X93 = c("Tl", "0", "Tl", "0", "Th", "0.0005", "Tl", "0", "Tl",
"0"), X94 = c("Tl Err", "0.0023", "Tl Err", "0.0023", "Th Err",
"0.0012", "Tl Err", "0.0023", "Tl Err", "0.0023"), X95 = c("Pb",
"0.0834", "Pb", "0.0834", "U", "0", "Pb", "0.0834", "Pb", "0.0834"
), X96 = c("Pb Err", "0.0034", "Pb Err", "0.0034", "U Err", "0.0035",
"Pb Err", "0.0034", "Pb Err", "0.0034"), X97 = c("Bi", "0.0001",
"Bi", "0.0001", "Cal Check", "Passed", "Th", "0.0005", "Bi",
"0.0001"), X98 = c("Bi Err", "0.0012", "Bi Err", "0.0012", NA,
NA, "Th Err", "0.0012", "Bi Err", "0"), X99 = c("Th", "0.0005",
"Cal Check", "Passed", NA, NA, "U", "0", "Th", "0.0005"), X100 = c("Th Err",
"0.0012", NA, NA, NA, NA, "U Err", "0.0035", "Th Err", "0.0012"
), X101 = c("U", "0", NA, NA, NA, NA, "Cal Check", "Passed",
"U", "0"), X102 = c("U Err", "0.0035", NA, NA, NA, NA, NA, NA,
"U Err", "0.0035"), X103 = c("Cal Check", "Passed", NA, NA, NA,
NA, NA, NA, "Cal Check", "Passed")), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -10L))
You'll notice each measurement has different number of columns which adds to the complexity... Also, i will like to get rid of any column that ends with "Err" e.g. "Br Err". Kindly note that i've intentionally duplicated values just to be able to have sufficient data to explain my problem. i was planning to create a function so i tried
sample_data <- read_csv("sample_data.csv", col_names = FALSE)
then
df_1<- sample_data%>%slice(1:2)%>%
janitor::row_to_names(1)%>%
select(-ends_with("Err"))
for the first two rows, the preceding rows i.e gave error
Error: Columns
100
,101
,102
,103
cannot have NA as name
when i tried
df_2<- sample_data%>%
slice(3:4)%>%
janitor::row_to_names(1)%>%
select(-ends_with("Err"))
probably due to the way the data was read-in.
What i want at the end is to use `bind_rows() to combine everything
packages used
library(janitor)
library(tidyverse)
library(readxl)
library(lubridate)
I think this fits what you're trying to do. Count off your rows by twos, so each pair represents an observation where row 1 is measures and row 2 is values. Then split.
library(dplyr)
library(tidyr)
library(purrr)
by_pair <- dat %>%
mutate(obvs = ceiling(seq_along(X1) / 2)) %>%
split(.$obvs)
by_pair[[2]]
#> # A tibble: 2 x 104
#> X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 File # Date… Oper… Name ID Samp… Proj… Appl… Meth… Elap… Allo… Matc… Allo…
#> 2 77 3/8/… Supe… <NA> 0 Loca… XRF … GeoE… Oxid… 0 <NA> 0 <NA>
# ... truncated
I converted the split tibbles back to base data.frame
objects so that subsetting a single row would get a vector—makes it easier to then subset values based on corresponding names being NA. Drop rows whose measures end in "Err," then make sure every observation has the full set of measures with complete
, filling in NA
where that measurement wasn't made. Then reshape back to wide.
by_pair %>%
map(select, -obvs) %>%
map(as.data.frame) %>%
map_dfr(function(df) {
df_names <- df[1, ]
df_vals <- df[2, ][!is.na(df_names)]
tibble(measure = df_names[!is.na(df_names)], value = df_vals)
}, .id = "obvs") %>%
filter(!grepl("\\bErr$", measure)) %>%
mutate(measure = forcats::as_factor(measure)) %>%
complete(obvs, measure) %>%
pivot_wider(names_from = measure)
#> # A tibble: 5 x 61
#> obvs `File #` DateTime Operator Name ID Sample Project Application Method
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 76 3/8/202… Supervi… <NA> 0 Local… XRF DA… GeoExplora… Oxide…
#> 2 2 77 3/8/20 … Supervi… <NA> 0 Local… XRF DA… GeoExplora… Oxide…
#> 3 3 78 3/8/202… Supervi… <NA> 0 Local… XRF DA… GeoExplora… Oxide…
#> 4 4 79 3/8/202… Supervi… <NA> 0 Local… XRF DA… GeoExplora… Oxide…
#> 5 5 80 3/8/202… Supervi… <NA> 0 Local… XRF DA… GeoExplora… Oxide…
#> # … with 51 more variables: ElapsedTime <chr>, `Alloy 1` <chr>, `Match Qual
#> # 1` <chr>, `Alloy 2` <chr>, `Match Qual 2` <chr>, `Alloy 3` <chr>, `Match
#> # Qual 3` <chr>, MgO <chr>, Al2O3 <chr>, SiO2 <chr>, P <chr>, S <chr>,
#> # Cl <chr>, K2O <chr>, Ca <chr>, Ti <chr>, V <chr>, Cr <chr>, Mn <chr>,
#> # Fe <chr>, Co <chr>, Ni <chr>, Cu <chr>, Zn <chr>, Ga <chr>, As <chr>,
#> # Se <chr>, Rb <chr>, Sr <chr>, Y <chr>, Zr <chr>, Nb <chr>, Mo <chr>,
#> # Cd <chr>, Sn <chr>, Sb <chr>, Te <chr>, Ba <chr>, La <chr>, Ce <chr>,
#> # Hf <chr>, Ta <chr>, W <chr>, Pt <chr>, Hg <chr>, Tl <chr>, Pb <chr>,
#> # Bi <chr>, Th <chr>, U <chr>, `Cal Check` <chr>