Search code examples
rdelete-rowmissing-datasequences

R: Delete incomplete data sequences


I have logged measurements of three Soil CO2 sampling points coming to one measuring machine though three valves. There are three measurements at each valve (from each sampling point). The machine was sun powered so there was many power losses during measurement. When the power came back the measurements started again with Valve 1 and Measurement 1. Sometimes, but not always (e.g. row #27), there is a row with batt. batt. batt. sign (showing empty batteries). I would like to choose only data with perfect sequences from 1. to 2. valve.

I have data that look slike this:

# Measurement   Valve   Value
#1         1        1   0.123
#2         2        1   0.124
#3         3        1   0.125
#4         1        2   0.126
#5         2        2   0.127
#6         3        2   0.128
#7         1        3   0.129
#8         2        3   0.13
#9         3        3   0.131
#10    batt.    batt.   batt.
#11        1        1   0.132
#12        2        1   0.133
#13        3        1   0.134
#14        1        2   0.135
#15        2        2   0.136
#16        3        2   0.137
#17        1        3   0.138
#18        2        3   0.139
#19     bat.     bat.    bat.
#20        1        1   0.141
#21     bat.     bat.    bat.
#22        1        1   0.141
#23        2        1   0.142
#24        3        1   0.143
#25     bat.     bat.    bat.
#26        1        1   0.141
#27        1        1   0.141
#28        2        1   0.142
#29        3        1   0.143
#30        1        2   0.144
#31        2        2   0.145
#32        3        2   0.146
#33        1        3   0.147
#34        2        3   0.148
#35        3        3   0.149

I would like to get data like this:

#   Measurement Valve   Value
#1            1     1   0.123
#2            2     1   0.124
#3            3     1   0.125
#4            1     2   0.126
#5            2     2   0.127
#6            3     2   0.128
#7            1     1   0.132
#8            2     1   0.133
#9            3     1   0.134
#10           1     2   0.135
#11           2     2   0.136
#12           3     2   0.137
#13           1     1   0.141
#14           2     1   0.142
#15           3     1   0.143
#16           1     2   0.144
#17           2     2   0.145
#18           3     2   0.146

Input data:

DF = structure(list(Measurement = c("1", "2", "3", "1", "2", "3", 
"1", "2", "3", "batt.", "1", "2", "3", "1", "2", "3", "1", "2", 
"bat.", "1", "bat.", "1", "2", "3", "bat.", "1", "1", "2", "3", 
"1", "2", "3", "1", "2", "3"), Valve = c("1", "1", "1", "2", 
"2", "2", "3", "3", "3", "batt.", "1", "1", "1", "2", "2", "2", 
"3", "3", "bat.", "1", "bat.", "1", "1", "1", "bat.", "1", "1", 
"1", "1", "2", "2", "2", "3", "3", "3"), Value = c("0.123", "0.124", 
"0.125", "0.126", "0.127", "0.128", "0.129", "0.13", "0.131", 
"batt.", "0.132", "0.133", "0.134", "0.135", "0.136", "0.137", 
"0.138", "0.139", "bat.", "0.141", "bat.", "0.141", "0.142", 
"0.143", "bat.", "0.141", "0.141", "0.142", "0.143", "0.144", 
"0.145", "0.146", "0.147", "0.148", "0.149")), .Names = c("Measurement", 
"Valve", "Value"), row.names = c(NA, -35L), class = "data.frame")

Solution

  • Well, you could create a grouping column:

    library(data.table)
    setDT(DF)
    
    DF[, g := cumsum(grepl("bat",DF$Value))]
    
        Measurement Valve Value g
     1:           1     1 0.123 0
     2:           2     1 0.124 0
     3:           3     1 0.125 0
     4:           1     2 0.126 0
     5:           2     2 0.127 0
     6:           3     2 0.128 0
     7:           1     3 0.129 0
     8:           2     3  0.13 0
     9:           3     3 0.131 0
    10:       batt. batt. batt. 1
    11:           1     1 0.132 1
    12:           2     1 0.133 1
    13:           3     1 0.134 1
    14:           1     2 0.135 1
    15:           2     2 0.136 1
    16:           3     2 0.137 1
    17:           1     3 0.138 1
    18:           2     3 0.139 1
    19:        bat.  bat.  bat. 2
    20:           1     1 0.141 2
    21:        bat.  bat.  bat. 3
    22:           1     1 0.141 3
    23:           2     1 0.142 3
    24:           3     1 0.143 3
    25:        bat.  bat.  bat. 4
    26:           1     1 0.141 4
    27:           1     1 0.141 4
    28:           2     1 0.142 4
    29:           3     1 0.143 4
    30:           1     2 0.144 4
    31:           2     2 0.145 4
    32:           3     2 0.146 4
    33:           1     3 0.147 4
    34:           2     3 0.148 4
    35:           3     3 0.149 4
        Measurement Valve Value g
    

    and then select groups

    DF2 <- DF[, if (all(c("1","2") %in% Valve)) unique(.SD[Valve %in% c("1","2")]), by = g]
    
        g Measurement Valve Value
     1: 0           1     1 0.123
     2: 0           2     1 0.124
     3: 0           3     1 0.125
     4: 0           1     2 0.126
     5: 0           2     2 0.127
     6: 0           3     2 0.128
     7: 1           1     1 0.132
     8: 1           2     1 0.133
     9: 1           3     1 0.134
    10: 1           1     2 0.135
    11: 1           2     2 0.136
    12: 1           3     2 0.137
    13: 4           1     1 0.141
    14: 4           2     1 0.142
    15: 4           3     1 0.143
    16: 4           1     2 0.144
    17: 4           2     2 0.145
    18: 4           3     2 0.146
    

    If rows 26 & 27 could've had different Values, you can select the first with unique(.SD[Valve %in% c("1","2")], by=c("Measurement", "Valve")).