I have data as follows:
library(data.table)
set.seed(1)
DT <- data.table(panelID = sample(50,50), # Creates a panel ID
Country = c(rep("Albania",30),rep("Belarus",50), rep("Chilipepper",20)),
some_NA = sample(0:5, 6),
some_NA_factor = sample(0:5, 6),
Group = c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20)),
Time = rep(seq(as.Date("2010-01-03"), length=20, by="1 month") - 1,5),
wt = 15*round(runif(100)/10,2),
Income = round(rnorm(10,-5,5),2),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = sample(100,100),
Educ = round(rnorm(10,0.75,0.3),2))
DT [, uniqueID := .I] # Creates a unique ID # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT$some_NA_factor <- factor(DT$some_NA_factor)
DT$Group <- as.character(DT$Group)
The last line converts the colum Group
to a string. Now I would like to write a function that converts only strings to numeric, that are suitable to be numbers, such as Group
, but leaves Country
alone because it has no numerical meaning.
How can I go about this?
My first thought was to use type.convert
, but that either converts character
and Date
to factor
, or with as.is=TRUE
it converts factor
to character
.
str(DT[, lapply(.SD, type.convert)])
# Classes 'data.table' and 'data.frame': 100 obs. of 13 variables:
# $ panelID : int 4 39 1 34 23 43 14 18 33 21 ...
# $ Country : Factor w/ 3 levels "Albania","Belarus",..: 1 1 1 1 1 1 1 1 1 1 ...
# $ some_NA : int 0 2 4 1 5 3 0 2 4 1 ...
# $ some_NA_factor: int 3 2 0 5 1 4 3 2 0 5 ...
# $ Group : int 1 1 1 1 1 1 1 1 1 1 ...
# $ Time : Factor w/ 20 levels "2010-01-02","2010-02-02",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ wt : num 0.15 0.3 0.15 0.9 1.35 1.2 1.2 0.75 0.6 1.2 ...
# $ Income : num -4.4 -6.41 2.28 -3.85 -0.02 ...
# $ Happiness : int 3 10 6 9 5 7 4 1 2 8 ...
# $ Sex : num 0.61 1.18 0.55 0.69 0.63 0.65 0.67 0.9 0.7 0.6 ...
# $ Age : int 15 2 65 67 73 17 84 5 41 91 ...
# $ Educ : num 0.54 1.04 1.29 0.43 0.76 0.63 0.6 0.44 0.48 1.13 ...
# $ uniqueID : int 1 2 3 4 5 6 7 8 9 10 ...
# - attr(*, ".internal.selfref")=<externalptr>
str(DT[, lapply(.SD, type.convert, as.is = TRUE)])
# Classes 'data.table' and 'data.frame': 100 obs. of 13 variables:
# $ panelID : int 4 39 1 34 23 43 14 18 33 21 ...
# $ Country : chr "Albania" "Albania" "Albania" "Albania" ...
# $ some_NA : int 0 2 4 1 5 3 0 2 4 1 ...
# $ some_NA_factor: int 3 2 0 5 1 4 3 2 0 5 ...
# $ Group : int 1 1 1 1 1 1 1 1 1 1 ...
# $ Time : chr "2010-01-02" "2010-02-02" "2010-03-02" "2010-04-02" ...
# $ wt : num 0.15 0.3 0.15 0.9 1.35 1.2 1.2 0.75 0.6 1.2 ...
# $ Income : num -4.4 -6.41 2.28 -3.85 -0.02 ...
# $ Happiness : int 3 10 6 9 5 7 4 1 2 8 ...
# $ Sex : num 0.61 1.18 0.55 0.69 0.63 0.65 0.67 0.9 0.7 0.6 ...
# $ Age : int 15 2 65 67 73 17 84 5 41 91 ...
# $ Educ : num 0.54 1.04 1.29 0.43 0.76 0.63 0.6 0.44 0.48 1.13 ...
# $ uniqueID : int 1 2 3 4 5 6 7 8 9 10 ...
# - attr(*, ".internal.selfref")=<externalptr>
So I think we need our own function with similar intentions.
mytype <- function(z) if (is.character(z) && all(grepl("^-?[\\d.]+(?:e-?\\d+)?$", z, perl = TRUE))) as.numeric(z) else z
str(DT[, lapply(.SD, mytype)])
# Classes 'data.table' and 'data.frame': 100 obs. of 13 variables:
# $ panelID : int 4 39 1 34 23 43 14 18 33 21 ...
# $ Country : chr "Albania" "Albania" "Albania" "Albania" ...
# $ some_NA : int 0 2 4 1 5 3 0 2 4 1 ...
# $ some_NA_factor: Factor w/ 6 levels "0","1","2","3",..: 4 3 1 6 2 5 4 3 1 6 ...
# $ Group : num 1 1 1 1 1 1 1 1 1 1 ...
# $ Time : Date, format: "2010-01-02" "2010-02-02" "2010-03-02" ...
# $ wt : num 0.15 0.3 0.15 0.9 1.35 1.2 1.2 0.75 0.6 1.2 ...
# $ Income : num -4.4 -6.41 2.28 -3.85 -0.02 ...
# $ Happiness : int 3 10 6 9 5 7 4 1 2 8 ...
# $ Sex : num 0.61 1.18 0.55 0.69 0.63 0.65 0.67 0.9 0.7 0.6 ...
# $ Age : int 15 2 65 67 73 17 84 5 41 91 ...
# $ Educ : num 0.54 1.04 1.29 0.43 0.76 0.63 0.6 0.44 0.48 1.13 ...
# $ uniqueID : int 1 2 3 4 5 6 7 8 9 10 ...
# - attr(*, ".internal.selfref")=<externalptr>
With larger data, you may prefer to break the grepl
condition out so that you define which columns to work on:
mytypetest <- function(z) is.character(z) && all(grepl("^-?[\\d.]+(?:e-?\\d+)?$", z, perl = TRUE))
cols <- which(sapply(DT, mytypetest))
cols
# Group
# 5
DT[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
str(DT)
# Classes 'data.table' and 'data.frame': 100 obs. of 13 variables:
# $ panelID : int 4 39 1 34 23 43 14 18 33 21 ...
# $ Country : chr "Albania" "Albania" "Albania" "Albania" ...
# $ some_NA : int 0 2 4 1 5 3 0 2 4 1 ...
# $ some_NA_factor: Factor w/ 6 levels "0","1","2","3",..: 4 3 1 6 2 5 4 3 1 6 ...
# $ Group : num 1 1 1 1 1 1 1 1 1 1 ...
# $ Time : Date, format: "2010-01-02" "2010-02-02" "2010-03-02" ...
# $ wt : num 0.15 0.3 0.15 0.9 1.35 1.2 1.2 0.75 0.6 1.2 ...
# $ Income : num -4.4 -6.41 2.28 -3.85 -0.02 ...
# $ Happiness : int 3 10 6 9 5 7 4 1 2 8 ...
# $ Sex : num 0.61 1.18 0.55 0.69 0.63 0.65 0.67 0.9 0.7 0.6 ...
# $ Age : int 15 2 65 67 73 17 84 5 41 91 ...
# $ Educ : num 0.54 1.04 1.29 0.43 0.76 0.63 0.6 0.44 0.48 1.13 ...
# $ uniqueID : int 1 2 3 4 5 6 7 8 9 10 ...
# - attr(*, ".internal.selfref")=<externalptr>
This last one will be technically faster with any sized data, but it might be noticeable for larger (columns and/or rows) data.