Search code examples
rencodingdummy-variable

encode each string element in list of strings to list of numbers within R dataframe columns


I'm new to R, looking to implement what's seen in this post: Create dummy variables from string with multiple values for every column in my df.That is, I would like to encode each string element to a dummy variable and then store each back to a list as it's encoded value. I've done this in Python but am not sure of the best way to go about this in R. Please find Expected Output Below. CURRENTLY

|          Databases                               |            Cryptocurrency                              |          Visual

|:------------------------------------------------:|:------------------------------------------------------:|:----------------------------------------------------------:|

|c("PostgreSQL","MySQL","SQL")                     |c("Ethereum","Cryptocurrency","Bitcoin","Blockchain")   |c("BrandDesign","GraphicDesign","LogoDesign","PackageDesign")
| MSSQLManagement                                  |character(0)                                            |c("BrandDesign", "GraphicDesign", "LogoDesign")
|c("MongoDB","Redis","MySQL")                      |c("Bitcoin", "Blockchain")                              |character(0)
|c("RedisManagement")                              | Cryptocurrency                                         |c("BrandDesign", "GraphicDesign")
|c("MySQL","MemcachedManagement","MongoDB","Redis")|c("Cryptocurrency", "Ethereum", "Blockchain")           | GraphicDesign

EXPECTED OUTPUT:

| Databases |  Cryptocurrency  |   Visual     |

|:---------:|:----------------:|:------------:|

|c(1,2,3)   |c(1,2,3,4)        |c(1,2,3,4)    |
| 4         | 0                |c(1, 2, 3)    |
|c(5,6,2)   |c(3, 4)           | 0            |
|c(6)       | 2                |c(1, 2)       |
|c(2,7,5,6) |c(2, 1, 4)        | 2            |

The end goal is to train the data on a KNN Model.

EDIT: The following output is output of dput(df[1:25, c(11, 17)]):

structure(list(Technical = list(c("Architecture", "TechnicalDesign", 
"Documentation", "RequirementsGathering"), character(0), character(0), 
    c("Documentation", "TechnicalDesign", "RequirementsGathering"
    ), c("Architecture", "TechnicalDesign", "Documentation", 
    "RequirementsGathering"), c("Architecture", "TechnicalDesign", 
    "Documentation", "RequirementsGathering"), c("Architecture", 
    "TechnicalDesign", "Documentation", "RequirementsGathering"
    ), character(0), c("Architecture", "Documentation", "RequirementsGathering"
    ), c("Architecture", "TechnicalDesign", "Documentation", 
    "RequirementsGathering"), c("Architecture", "TechnicalDesign", 
    "Documentation", "RequirementsGathering"), c("Architecture", 
    "TechnicalDesign", "Documentation", "RequirementsGathering"
    ), c("Architecture", "TechnicalDesign", "Documentation", 
    "RequirementsGathering"), c("Architecture", "TechnicalDesign", 
    "Documentation", "RequirementsGathering"), c("Architecture", 
    "TechnicalDesign", "Documentation", "RequirementsGathering"
    ), c("Documentation", "TechnicalDesign", "RequirementsGathering"
    ), character(0), character(0), c("Architecture", "TechnicalDesign", 
    "Documentation", "RequirementsGathering"), c("Architecture", 
    "TechnicalDesign", "Documentation", "RequirementsGathering"
    )), Tools = list(c("Photoshop", "Sketch", "InVision", "Illustrator", 
"Zeplin"), c("GoogleAnalytics", "GoogleAdsense", "MailChimp", 
"GoogleAdwords"), character(0), c("FacebookAds", "GoogleAnalytics", 
"MailChimp"), c("FacebookAds", "GoogleAnalytics", "MailChimp"
), c("Kubernetes", "Vagrant"), c("Gulp", "Vagrant"), character(0), 
    c("GoogleAnalytics", "GoogleAdwords"), c("Hubspot", "Kissmetrics", 
    "InstagramAds", "FacebookAds", "LinkedInAds", "Optimizely", 
    "GoogleAnalytics"), "GoogleAnalytics", c("FacebookAds", "GoogleAnalytics", 
    "GoogleAdsense", "InstagramAds", "GoogleAdwords"), c("GoogleAnalytics", 
    "TwitterAds", "MailChimp"), c("GoogleAnalytics", "MailChimp", 
    "Mixpanel"), c("GoogleAnalytics", "MailChimp", "GoogleAdwords"
    ), c("Photoshop", "Sketch", "InVision", "Illustrator"), character(0), 
    character(0), character(0), c("FacebookAds", "GoogleAnalytics", 
    "Optimizely", "GoogleAdwords", "Mixpanel")), ProjectManagement = list(
    c("AgileMethodology", "ClientManagement", "Scrum"), c("AgileMethodology", 
    "ClientManagement", "Scrum"), "ClientManagement", c("AgileMethodology", 
    "ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement", 
    "Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology", 
    "ClientManagement"), character(0), c("AgileMethodology", 
    "ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement", 
    "Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology", 
    "ClientManagement", "Scrum"), character(0), c("ClientManagement", 
    "Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology", 
    "ClientManagement", "Scrum"), c("AgileMethodology", "Scrum"
    ), "AgileMethodology", c("AgileMethodology", "Scrum"), c("AgileMethodology", 
    "Scrum")), Visual = list(c("BrandDesign", "GraphicDesign", 
"LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign", 
"LogoDesign"), character(0), character(0), c("BrandDesign", "GraphicDesign", 
"LogoDesign", "PackageDesign"), character(0), character(0), character(0), 
    character(0), c("BrandDesign", "GraphicDesign", "LogoDesign"
    ), character(0), character(0), c("BrandDesign", "GraphicDesign", 
    "LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign", 
    "LogoDesign", "PackageDesign"), character(0), character(0), 
    character(0), character(0), character(0), c("BrandDesign", 
    "GraphicDesign", "LogoDesign")), ConfigurationManagement = list(
    character(0), "Chef", character(0), character(0), character(0), 
    c("Ansible", "Terraform", "Puppet"), character(0), character(0), 
    character(0), c("Puppet", "Chef"), c("Terraform", "Chef"), 
    character(0), character(0), character(0), character(0), character(0), 
    character(0), character(0), "Chef", character(0)), Containers = list(
    character(0), "Docker", character(0), character(0), character(0), 
    "Docker", "Docker", "Docker", "Docker", "Docker", "Docker", 
    "Docker", character(0), character(0), "Docker", "Docker", 
    character(0), character(0), "Docker", "Docker"), Cryptocurrency = list(
    character(0), character(0), character(0), character(0), character(0), 
    c("Ethereum", "Cryptocurrency", "Bitcoin", "Blockchain"), 
    character(0), character(0), character(0), c("Ethereum", "Cryptocurrency", 
    "Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency", 
    "Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency", 
    "Bitcoin", "Blockchain"), character(0), character(0), character(0), 
    character(0), character(0), character(0), character(0), character(0))), row.names = c(NA, 
20L), class = "data.frame")

Solution

  • Write a function that recodes the values.

    recode_values <- function(x) {
      x1 <- strsplit(x, ',\\s*')
      relist(match(unlist(x1), unique(unlist(x1))), x1)
    }
    

    Clean the string and apply it to multiple columns.

    library(dplyr)
    
    df %>% mutate(across(c(AutomatedTesting, Cryptocurrency), 
                 ~recode_values(gsub("\\[|\\]|'", "", .))))
    
    #   AutomatedTesting Cryptocurrency
    #1                                 
    #2        1, 2, 3, 4               
    #3                 1               
    #4                 2               
    #5                                 
    #6              2, 1     1, 2, 3, 4
    #7           1, 2, 5               
    #8              2, 1               
    #9           2, 3, 1               
    #10    1, 2, 6, 3, 5     1, 2, 3, 4
    #11          2, 6, 1  5, 1, 2, 3, 4
    #12             2, 1  5, 1, 2, 3, 4
    #13                                
    #14                                
    #15    1, 2, 6, 3, 4               
    #16       1, 2, 6, 5               
    #17          2, 3, 1               
    #18                                
    #19                2               
    #20 4, 5, 6, 3, 1, 2               
    #21             2, 4               
    #22 4, 5, 6, 3, 1, 2               
    #23             2, 1               
    #24          2, 6, 1               
    #25       2, 6, 3, 1