I'm trying to use the dataverifyr R package to derive rules programmatically, as I have XML Schemas defining what the data (which is coming as .csv
:roll_eyes: ) types should be.
My aim is to parse the XML to get the parameters such as min_length
and max_length
and then use these values to define a set of rules for dataverifyr
.
I can extract the different parameters to a series of lists...
> names(info)
[1] "name" "min_length" "max_length" "length" "min_inclusive" "max_inclusive"
[7] "enumeration_values" "patterns" "pattern_values"
>
> lapply(info, head)
$name
[1] NA NA NA "OrgIden_EC_Type"
[5] "LocalPatientIdentifier_EC_Type" "PersonStatedGenderCode_EC_Type"
$min_length
[1] NA NA NA 3 1 NA
$max_length
[1] NA NA NA 5 20 NA
$length
[1] NA NA 3 NA NA 1
$min_inclusive
[1] -90 -180 NA NA NA NA
$max_inclusive
[1] 90 180 NA NA NA NA
$enumeration_values
$enumeration_values[[1]]
character(0)
$enumeration_values[[2]]
character(0)
$enumeration_values[[3]]
character(0)
$enumeration_values[[4]]
character(0)
$enumeration_values[[5]]
character(0)
$enumeration_values[[6]]
[1] "1" "2" "9" "X"
$patterns
NULL
$pattern_values
$pattern_values[[1]]
character(0)
$pattern_values[[2]]
character(0)
$pattern_values[[3]]
character(0)
$pattern_values[[4]]
character(0)
$pattern_values[[5]]
character(0)
$pattern_values[[6]]
character(0)
(NB Not every field has every parameter)
I then move onto trying to create a rule using the values with...
> rule(info$name[7] >= info$min_length[7] & info$name[7] <= info$max_length[7],
name=eval(info$name[7]),
allow_na=TRUE)
+ + <Verification Rule>
expr: 'info$name[7] >= info$min_length[7] & info$name[7] <= info$max_length[7]'
name: 'N_6_18_EC_Type'
allow NA: TRUE
negated: FALSE
But I can't seem to get the info$name[7]
, info$min_length[7]
etc. to access the values in the lists. This is a common problem it seems and I found this thread which prompted me to try eval()
and get()
. I've tried a few things...
eval()
all instances
> rule(eval(info$name[7]) >= eval(info$min_length[7]) & eval(info$name[7]) <= eval(info$max_length[7]),
+ name=eval(info$name[7]),
+ allow_na=TRUE)
+ + Error in substr(expr, 1, 1) == "\"" && substr(expr, nchar(expr), nchar(expr)) == :
'length = 2' in coercion to 'logical(1)'
get()
all instances
> rule(get(info$name[7]) >= get(info$min_length[7]) & get(info$name[7]) <= get(info$max_length[7]),
+ name=eval(info$name[7]),
+ allow_na=TRUE)
+ + Error in substr(expr, 1, 1) == "\"" && substr(expr, nchar(expr), nchar(expr)) == :
'length = 2' in coercion to 'logical(1)'
get()
and eval()
> eval(info$name[7])
[1] "N_6_18_EC_Type"
> get(info$name[7])
Error in get(info$name[7]) : object 'N_6_18_EC_Type' not found
Did you mean : Type1Font or latex_dependency ?
> get(info$min_length[7])
Error in get(info$min_length[7]) : invalid first argument
> eval(info$min_length[7])
[1] 6
get()
and eval()
> rule(get(info$name[7]) >= eval(info$min_length[7]) & get(info$name[7]) <= eval(info$max_length[7]),
+ name=eval(info$name[7]),
+ allow_na=TRUE)
+ + Error in substr(expr, 1, 1) == "\"" && substr(expr, nchar(expr), nchar(expr)) == :
'length = 2' in coercion to 'logical(1)'
Nope, that hasn't worked either. OK, let's simplify things.
> rule(get(info$name[7]) >= eval(info$min_length[7]),
+ name=eval(info$name[7]),
+ allow_na=TRUE)
+ + <Verification Rule>
expr: 'get(info$name[7]) >= eval(info$min_length[7])'
name: 'N_6_18_EC_Type'
allow NA: TRUE
negated: FALSE
Still not there; let's try [[ ]]
(double square brackets).
> rule(get(info$name[[7]]) >= eval(info$min_length[[7]]),
+ name=eval(info$name[7]),
+ allow_na=TRUE)
+ + <Verification Rule>
expr: 'get(info$name[[7]]) >= eval(info$min_length[[7]])'
name: 'N_6_18_EC_Type'
allow NA: TRUE
negated: FALSE
No joy. I'm stumped as to how to get the values shown by the <Verification Rule>
and it feels like I'm missing something basic here in my R knowledge.
This is important for what I'm trying to achieve as ultimately I would like to write the rules to YAML configuration files.
EDIT
As suggested/requested, here is a reprex. I've omitted the reading and extraction of XML parameters to create the info
...
library(dataverifyr)
dput(info)
list(name = c(NA, NA, NA, "OrgIden_EC_Type", "LocalPatientIdentifier_EC_Type",
"PersonStatedGenderCode_EC_Type", "N_6_18_EC_Type", "N_5_EC_Type",
"N_1_N_5_EC_Type", "Overseas_EC_Type", "MH_Status_EC_Type", "AN_4_6_EC_Type",
"AN_max_3_EC_Type", "AN_max_2_EC_Type", "AN_5_9_EC_Type", "AN_2_EC_Type",
"AN_max_5_EC_Type", "AN_max_10_EC_Type", "AN_max_12_EC_Type",
"AN_max_255_EC_Type", "AN_min_1_max_20_EC_Type", "AN_max_20_EC_Type",
"AN_max_32_EC_Type", "AN_min_1_max_32_EC_Type", "AN_3_5_EC_Type",
"AttendCat_EC_Type", "ProfessionalRegistrationIssuerCode_EC_Type",
"CareProfessionalTier_EC_Type", "Yes_No_EC_Type", "ConsultationMechanismUrgentAndEmergencyCare_Type",
"FitNoteIssuer_Type"), min_length = c(NA, NA, NA, 3, 1, NA, 6,
NA, 1, NA, NA, 4, NA, NA, 5, NA, NA, NA, NA, NA, 1, NA, NA, 1,
3, NA, NA, NA, NA, NA, NA), max_length = c(NA, NA, NA, 5, 20,
NA, 18, 5, 5, NA, NA, 6, 3, 2, 9, NA, 5, 10, 12, 255, 20, 20,
32, 32, 5, NA, NA, NA, NA, NA, NA), length = c(NA, NA, 3, NA,
NA, 1, NA, NA, NA, 1, 2, NA, NA, NA, NA, 2, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 1, 2, 2, 1, 2, 2), min_inclusive = c(-90, -180,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), max_inclusive = c(90,
180, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
enumeration_values = list(character(0), character(0), character(0),
character(0), character(0), c("1", "2", "9", "X"), character(0),
character(0), character(0), c("A", "B", "C", "D", "E",
"F", "P", "9"), character(0), character(0), character(0),
character(0), character(0), c("01", "02", "03", "05",
"06", "07"), character(0), character(0), character(0),
character(0), character(0), character(0), character(0),
character(0), character(0), c("1", "2", "3", "4", "X"
), character(0), c("01", "02", "03", "04", "05"), c("Y",
"N"), c("01", "02", "11", "13"), character(0)), patterns = NULL,
pattern_values = list(character(0), character(0), character(0),
character(0), character(0), character(0), "[0-9]*", "[0-9]*",
"[0-9]*", character(0), character(0), character(0), character(0),
character(0), character(0), character(0), character(0),
character(0), character(0), character(0), character(0),
character(0), character(0), character(0), character(0),
character(0), character(0), character(0), character(0),
character(0), character(0)))
# Test rule creation
> rule(get(info$name[[7]]) >= eval(info$min_length[[7]]),
+ name=eval(info$name[7]),
+ allow_na=TRUE)
+ + <Verification Rule>
expr: 'get(info$name[[7]]) >= eval(info$min_length[[7]])'
name: 'N_6_18_EC_Type'
allow NA: TRUE
negated: FALSE
# Create a ruleset and write to YAML
# NOTE: the expression given to rule() is stored as written (see the printed
# `expr:` line above), so the YAML file ends up containing the literal text
# `get(info$name[[7]]) >= eval(info$min_length[[7]])` rather than the values
# held in `info`.
rules <- ruleset(
rule(get(info$name[[7]]) >= eval(info$min_length[[7]]),
name=eval(info$name[7]),
allow_na=TRUE))
write_rules(rules, "~/tmp/ruleset.yaml")
The resulting YAML looks like...
- name: N_6_18_EC_Type
expr: get(info$name[[7]]) >= eval(info$min_length[[7]])
allow_na: yes
negate: no
index: 1
What I would like to get as a <Verification Rule>
is the values of info$name[7]
and info$min_length[7]
...
> info$name[7]
[1] "N_6_18_EC_Type"
> info$min_length[7]
[1] 6
...as this would then work with the dataset I intend to verify. The resulting ruleset.yaml
(which would then live with the dataset and could be reused) would read...
- name: N_6_18_EC_Type
expr: N_6_18_EC_Type >= 6
allow_na: yes
negate: no
index: 1
It's a shame that the library requires the use of expressions. Here's one way to build the expression:
# Build the rule() call with the column name and threshold already substituted,
# so the stored expression reads `N_6_18_EC_Type >= 6` instead of the
# unevaluated `info$...` lookups.
colname <- info$name[7]
# The column has to appear in the expression as a bare name (a symbol), not as
# a quoted string, so convert the character value before splicing it in.
col <- as.symbol(colname)
min_length <- info$min_length[7]
# bquote() replaces every .(x) with the value of x and returns the completed
# call; eval() then runs it. The comparison is `>=` because the value is being
# checked against a minimum, matching the expression asked for in the question.
new_rule <- eval(bquote(
  rule(.(col) >= .(min_length), name = .(colname), allow_na = TRUE)
))
new_rule
# <Verification Rule>
# expr: 'N_6_18_EC_Type >= 6'
# name: 'N_6_18_EC_Type'
# allow NA: TRUE
# negated: FALSE