I'm new to R, looking to implement what's seen in this post: Create dummy variables from string with multiple values for every column in my df.That is, I would like to encode each string element to a dummy variable and then store each back to a list as it's encoded value. I've done this in Python but am not sure of the best way to go about this in R. Please find Expected Output Below. CURRENTLY
| Databases | Cryptocurrency | Visual
|:------------------------------------------------:|:------------------------------------------------------:|:----------------------------------------------------------:|
|c("PostgreSQL","MySQL","SQL") |c("Ethereum","Cryptocurrency","Bitcoin","Blockchain") |c("BrandDesign","GraphicDesign","LogoDesign","PackageDesign")
| MSSQLManagement |character(0) |c("BrandDesign", "GraphicDesign", "LogoDesign")
|c("MongoDB","Redis","MySQL") |c("Bitcoin", "Blockchain") |character(0)
|c("RedisManagement") | Cryptocurrency |c("BrandDesign", "GraphicDesign")
|c("MySQL","MemcachedManagement","MongoDB","Redis")|c("Cryptocurrency", "Ethereum", "Blockchain") | GraphicDesign
EXPECTED OUTPUT:
| Databases | Cryptocurrency | Visual |
|:---------:|:----------------:|:------------:|
|c(1,2,3) |c(1,2,3,4) |c(1,2,3,4) |
| 4 | 0 |c(1, 2, 3) |
|c(5,6,2) |c(3, 4) | 0 |
|c(6) | 2 |c(1, 2) |
|c(2,7,5,6) |c(2, 1, 4) | 2 |
The end goal is to train the data on a KNN Model.
EDIT: The following output is output of dput(df[1:25, c(11, 17)]):
structure(list(Technical = list(c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), character(0), character(0),
c("Documentation", "TechnicalDesign", "RequirementsGathering"
), c("Architecture", "TechnicalDesign", "Documentation",
"RequirementsGathering"), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
), character(0), c("Architecture", "Documentation", "RequirementsGathering"
), c("Architecture", "TechnicalDesign", "Documentation",
"RequirementsGathering"), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
), c("Architecture", "TechnicalDesign", "Documentation",
"RequirementsGathering"), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
), c("Documentation", "TechnicalDesign", "RequirementsGathering"
), character(0), character(0), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
)), Tools = list(c("Photoshop", "Sketch", "InVision", "Illustrator",
"Zeplin"), c("GoogleAnalytics", "GoogleAdsense", "MailChimp",
"GoogleAdwords"), character(0), c("FacebookAds", "GoogleAnalytics",
"MailChimp"), c("FacebookAds", "GoogleAnalytics", "MailChimp"
), c("Kubernetes", "Vagrant"), c("Gulp", "Vagrant"), character(0),
c("GoogleAnalytics", "GoogleAdwords"), c("Hubspot", "Kissmetrics",
"InstagramAds", "FacebookAds", "LinkedInAds", "Optimizely",
"GoogleAnalytics"), "GoogleAnalytics", c("FacebookAds", "GoogleAnalytics",
"GoogleAdsense", "InstagramAds", "GoogleAdwords"), c("GoogleAnalytics",
"TwitterAds", "MailChimp"), c("GoogleAnalytics", "MailChimp",
"Mixpanel"), c("GoogleAnalytics", "MailChimp", "GoogleAdwords"
), c("Photoshop", "Sketch", "InVision", "Illustrator"), character(0),
character(0), character(0), c("FacebookAds", "GoogleAnalytics",
"Optimizely", "GoogleAdwords", "Mixpanel")), ProjectManagement = list(
c("AgileMethodology", "ClientManagement", "Scrum"), c("AgileMethodology",
"ClientManagement", "Scrum"), "ClientManagement", c("AgileMethodology",
"ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement",
"Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology",
"ClientManagement"), character(0), c("AgileMethodology",
"ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement",
"Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology",
"ClientManagement", "Scrum"), character(0), c("ClientManagement",
"Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology",
"ClientManagement", "Scrum"), c("AgileMethodology", "Scrum"
), "AgileMethodology", c("AgileMethodology", "Scrum"), c("AgileMethodology",
"Scrum")), Visual = list(c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign",
"LogoDesign"), character(0), character(0), c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), character(0), character(0), character(0),
character(0), c("BrandDesign", "GraphicDesign", "LogoDesign"
), character(0), character(0), c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), character(0), character(0),
character(0), character(0), character(0), c("BrandDesign",
"GraphicDesign", "LogoDesign")), ConfigurationManagement = list(
character(0), "Chef", character(0), character(0), character(0),
c("Ansible", "Terraform", "Puppet"), character(0), character(0),
character(0), c("Puppet", "Chef"), c("Terraform", "Chef"),
character(0), character(0), character(0), character(0), character(0),
character(0), character(0), "Chef", character(0)), Containers = list(
character(0), "Docker", character(0), character(0), character(0),
"Docker", "Docker", "Docker", "Docker", "Docker", "Docker",
"Docker", character(0), character(0), "Docker", "Docker",
character(0), character(0), "Docker", "Docker"), Cryptocurrency = list(
character(0), character(0), character(0), character(0), character(0),
c("Ethereum", "Cryptocurrency", "Bitcoin", "Blockchain"),
character(0), character(0), character(0), c("Ethereum", "Cryptocurrency",
"Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency",
"Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency",
"Bitcoin", "Blockchain"), character(0), character(0), character(0),
character(0), character(0), character(0), character(0), character(0))), row.names = c(NA,
20L), class = "data.frame")
Write a function that recodes the values.
recode_values <- function(x) {
x1 <- strsplit(x, ',\\s*')
relist(match(unlist(x1), unique(unlist(x1))), x1)
}
Clean the string and apply it to multiple columns.
library(dplyr)
df %>% mutate(across(c(AutomatedTesting, Cryptocurrency),
~recode_values(gsub("\\[|\\]|'", "", .))))
# AutomatedTesting Cryptocurrency
#1
#2 1, 2, 3, 4
#3 1
#4 2
#5
#6 2, 1 1, 2, 3, 4
#7 1, 2, 5
#8 2, 1
#9 2, 3, 1
#10 1, 2, 6, 3, 5 1, 2, 3, 4
#11 2, 6, 1 5, 1, 2, 3, 4
#12 2, 1 5, 1, 2, 3, 4
#13
#14
#15 1, 2, 6, 3, 4
#16 1, 2, 6, 5
#17 2, 3, 1
#18
#19 2
#20 4, 5, 6, 3, 1, 2
#21 2, 4
#22 4, 5, 6, 3, 1, 2
#23 2, 1
#24 2, 6, 1
#25 2, 6, 3, 1