Search code examples
r interpolation

Defining rules programmatically from XML for dataverifyr


I'm trying to use the dataverifyr R package to derive rules programmatically, as I have XML Schemas defining what the data types should be (the data itself arrives as .csv :roll_eyes: ).

My aim is to parse the XML to get the parameters such as min_length and max_length and then use these values to define a set of rules for dataverifyr.

I can extract the different parameters to a series of lists...

+ names(info)
> [1] "name"               "min_length"         "max_length"         "length"             "min_inclusive"      "max_inclusive"     
[7] "enumeration_values" "patterns"           "pattern_values"    
> 
> lapply(info, head)
$name
[1] NA                               NA                               NA                               "OrgIden_EC_Type"               
[5] "LocalPatientIdentifier_EC_Type" "PersonStatedGenderCode_EC_Type"

$min_length
[1] NA NA NA  3  1 NA

$max_length
[1] NA NA NA  5 20 NA

$length
[1] NA NA  3 NA NA  1

$min_inclusive
[1]  -90 -180   NA   NA   NA   NA

$max_inclusive
[1]  90 180  NA  NA  NA  NA

$enumeration_values
$enumeration_values[[1]]
character(0)

$enumeration_values[[2]]
character(0)

$enumeration_values[[3]]
character(0)

$enumeration_values[[4]]
character(0)

$enumeration_values[[5]]
character(0)

$enumeration_values[[6]]
[1] "1" "2" "9" "X"


$patterns
NULL

$pattern_values
$pattern_values[[1]]
character(0)

$pattern_values[[2]]
character(0)

$pattern_values[[3]]
character(0)

$pattern_values[[4]]
character(0)

$pattern_values[[5]]
character(0)

$pattern_values[[6]]
character(0)

(NB Not every field has every parameter)

I then move onto trying to create a rule using the values with...

> rule(info$name[7] >= info$min_length[7] & info$name[7] <= info$max_length[7],
       name=eval(info$name[7]),
       allow_na=TRUE)
+ + <Verification Rule>
  expr: 'info$name[7] >= info$min_length[7] & info$name[7] <= info$max_length[7]'
  name: 'N_6_18_EC_Type'
  allow NA: TRUE
  negated:  FALSE

But I can't seem to get info$name[7], info$min_length[7] etc. to be replaced by the values held in the lists; the expression is stored verbatim instead. This seems to be a common problem, and I found this thread which prompted me to try eval() and get(). I've tried a few things...

eval() all instances

> rule(eval(info$name[7]) >= eval(info$min_length[7]) & eval(info$name[7]) <= eval(info$max_length[7]),
+        name=eval(info$name[7]),
+        allow_na=TRUE)
+ + Error in substr(expr, 1, 1) == "\"" && substr(expr, nchar(expr), nchar(expr)) ==  : 
  'length = 2' in coercion to 'logical(1)'

get() all instances

> rule(get(info$name[7]) >= get(info$min_length[7]) & get(info$name[7]) <= get(info$max_length[7]),
+        name=eval(info$name[7]),
+        allow_na=TRUE)
+ + Error in substr(expr, 1, 1) == "\"" && substr(expr, nchar(expr), nchar(expr)) ==  : 
  'length = 2' in coercion to 'logical(1)'

Check what I'm getting for get() and eval()

> eval(info$name[7])
[1] "N_6_18_EC_Type"
> get(info$name[7])
Error in get(info$name[7]) : object 'N_6_18_EC_Type' not found
Did you mean : Type1Font or latex_dependency ?
> get(info$min_length[7])
Error in get(info$min_length[7]) : invalid first argument
> eval(info$min_length[7])
[1] 6

Maybe it's a combination of get() and eval()

> rule(get(info$name[7]) >= eval(info$min_length[7]) & get(info$name[7]) <= eval(info$max_length[7]),
+        name=eval(info$name[7]),
+        allow_na=TRUE)
+ + Error in substr(expr, 1, 1) == "\"" && substr(expr, nchar(expr), nchar(expr)) ==  : 
  'length = 2' in coercion to 'logical(1)'

Nope, that hasn't worked either. OK, let's simplify things.

> rule(get(info$name[7]) >= eval(info$min_length[7]),
+        name=eval(info$name[7]),
+        allow_na=TRUE)
+ + <Verification Rule>
  expr: 'get(info$name[7]) >= eval(info$min_length[7])'
  name: 'N_6_18_EC_Type'
  allow NA: TRUE
  negated:  FALSE

Still not there, let's try [[ ]] (double square brackets).

> rule(get(info$name[[7]]) >= eval(info$min_length[[7]]),
+        name=eval(info$name[7]),
+        allow_na=TRUE)
+ + <Verification Rule>
  expr: 'get(info$name[[7]]) >= eval(info$min_length[[7]])'
  name: 'N_6_18_EC_Type'
  allow NA: TRUE
  negated:  FALSE

No joy. I'm stumped as to how to get the actual values to appear in the <Verification Rule>, and it feels like I'm missing something basic here in my R knowledge.

This is important for what I'm trying to achieve as ultimately I would like to write the rules to YAML configuration files.

EDIT

As suggested/requested, here is a reprex. I've omitted the reading and extraction of the XML parameters that create info...

library(dataverifyr)

dput(info)
list(name = c(NA, NA, NA, "OrgIden_EC_Type", "LocalPatientIdentifier_EC_Type", 
"PersonStatedGenderCode_EC_Type", "N_6_18_EC_Type", "N_5_EC_Type", 
"N_1_N_5_EC_Type", "Overseas_EC_Type", "MH_Status_EC_Type", "AN_4_6_EC_Type", 
"AN_max_3_EC_Type", "AN_max_2_EC_Type", "AN_5_9_EC_Type", "AN_2_EC_Type", 
"AN_max_5_EC_Type", "AN_max_10_EC_Type", "AN_max_12_EC_Type", 
"AN_max_255_EC_Type", "AN_min_1_max_20_EC_Type", "AN_max_20_EC_Type", 
"AN_max_32_EC_Type", "AN_min_1_max_32_EC_Type", "AN_3_5_EC_Type", 
"AttendCat_EC_Type", "ProfessionalRegistrationIssuerCode_EC_Type", 
"CareProfessionalTier_EC_Type", "Yes_No_EC_Type", "ConsultationMechanismUrgentAndEmergencyCare_Type", 
"FitNoteIssuer_Type"), min_length = c(NA, NA, NA, 3, 1, NA, 6, 
NA, 1, NA, NA, 4, NA, NA, 5, NA, NA, NA, NA, NA, 1, NA, NA, 1, 
3, NA, NA, NA, NA, NA, NA), max_length = c(NA, NA, NA, 5, 20, 
NA, 18, 5, 5, NA, NA, 6, 3, 2, 9, NA, 5, 10, 12, 255, 20, 20, 
32, 32, 5, NA, NA, NA, NA, NA, NA), length = c(NA, NA, 3, NA, 
NA, 1, NA, NA, NA, 1, 2, NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 1, 2, 2, 1, 2, 2), min_inclusive = c(-90, -180, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), max_inclusive = c(90, 
180, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
    enumeration_values = list(character(0), character(0), character(0), 
        character(0), character(0), c("1", "2", "9", "X"), character(0), 
        character(0), character(0), c("A", "B", "C", "D", "E", 
        "F", "P", "9"), character(0), character(0), character(0), 
        character(0), character(0), c("01", "02", "03", "05", 
        "06", "07"), character(0), character(0), character(0), 
        character(0), character(0), character(0), character(0), 
        character(0), character(0), c("1", "2", "3", "4", "X"
        ), character(0), c("01", "02", "03", "04", "05"), c("Y", 
        "N"), c("01", "02", "11", "13"), character(0)), patterns = NULL, 
    pattern_values = list(character(0), character(0), character(0), 
        character(0), character(0), character(0), "[0-9]*", "[0-9]*", 
        "[0-9]*", character(0), character(0), character(0), character(0), 
        character(0), character(0), character(0), character(0), 
        character(0), character(0), character(0), character(0), 
        character(0), character(0), character(0), character(0), 
        character(0), character(0), character(0), character(0), 
        character(0), character(0)))

# Test rule creation
> rule(get(info$name[[7]]) >= eval(info$min_length[[7]]),
+        name=eval(info$name[7]),
+        allow_na=TRUE)
+ + <Verification Rule>
  expr: 'get(info$name[[7]]) >= eval(info$min_length[[7]])'
  name: 'N_6_18_EC_Type'
  allow NA: TRUE
  negated:  FALSE

# Create a ruleset and write to YAML
rules <- ruleset(
    rule(get(info$name[[7]]) >= eval(info$min_length[[7]]),
         name=eval(info$name[7]),
         allow_na=TRUE))
write_rules(rules, "~/tmp/ruleset.yaml")

The resulting YAML looks like...

- name: N_6_18_EC_Type
  expr: get(info$name[[7]]) >= eval(info$min_length[[7]])
  allow_na: yes
  negate: no
  index: 1

What I would like the <Verification Rule> to contain is the values of info$name[7] and info$min_length[7]...

> info$name[7]
[1] "N_6_18_EC_Type"
> info$min_length[7]
[1] 6

...as this would then work with the dataset I intend to verify. The resulting ruleset.yaml (which would then live with the dataset and could be reused) would read...

- name: N_6_18_EC_Type
  expr: N_6_18_EC_Type >= 6
  allow_na: yes
  negate: no
  index: 1

Solution

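  • It's a shame that the library requires the use of expressions. Here's one way to build the expression, using bquote() to substitute the values before rule() captures it (note that this example uses <=; swap in >= to get the min_length rule asked for in the question)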

    colname <- info$name[7]
    col <- as.symbol(colname)
    min_length <- info$min_length[7]
    new_rule <- eval(bquote(rule(.(col)<=.(min_length), name=.(colname), allow_na=TRUE)))
    new_rule
    # <Verification Rule>
    #   expr: 'N_6_18_EC_Type <= 6'
    #   name: 'N_6_18_EC_Type'
    #   allow NA: TRUE
    #   negated:  FALSE