Search code examples
rlistdataframesplitsplit-apply-combine

Create logical list with strsplit on combined words to subset data frame


I am trying to subset my data frame according to a condition on a specific column. For this purpose I need to create a TRUE or FALSE value for each row of this column. But some rows of this column contain several combined words, and my code cannot detect them.

p <- sapply(strsplit(test$hashtags, split=","), function(x)any(x%in%"evet"))

When you check the sample data you can easily see that rows 5, 7 and 8 contain the specific word, but they are shown as FALSE.

I have tried adding the "unlist" command to my code, but it hasn't worked for me.

p <- sapply(unlist(strsplit(test$hashtags, split=",")), function(x)any(x%in%"evet"))

I need to create a single TRUE or FALSE value per row, according to whether the row contains the specific word, even when the row combines more than one word. Thanks in advance.

Sample Data:

test <- structure(list(created_at = structure(c(1489636860, 1489636860, 
1489636860, 1489636860, 1489636860, 1489636860, 1489636860, 1489636860, 
1489636860, 1489636860), class = c("POSIXct", "POSIXt"), tzone = "GMT"), 
    user.screen_name = c("bilge_bilir", "memetozturk93", "Byomeraslan", 
    "tmremolar", "orhanyilmaz_77", "tamdere", "EriVatan", "BaySancaktar", 
    "zeynepmekik", "EriVatan"), entities.hashtags = list(structure(list(
        indices = list(c(84L, 90L)), text = "Hayır"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(65L, 70L)), text = "evet"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(98L, 103L)), text = "Evet"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(98L, 104L)), text = "Hayır"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(28L, 33L), c(45L, 50L), c(89L, 94L)), 
        text = c("EVET", "EVET", "EVET")), .Names = c("indices", 
    "text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
        indices = list(c(38L, 43L)), text = "EVET"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(20L, 29L), c(36L, 46L), c(89L, 94L)), 
        text = c("Dirilişe", "Yükselişe", "Evet")), .Names = c("indices", 
    "text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
        indices = list(c(10L, 15L), c(16L, 20L), c(21L, 26L), 
            c(27L, 31L)), text = c("Evet", "Eri", "Beli", "Yes"
        )), .Names = c("indices", "text"), class = "data.frame", row.names = c(NA, 
    4L)), structure(list(indices = list(c(125L, 130L)), text = "Evet"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(102L, 107L)), text = "EVET"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L)), retweeted_status.created_at = c("Thu Mar 16 03:49:15 +0000 2017", 
    "Wed Mar 15 23:57:44 +0000 2017", "Wed Mar 15 21:07:54 +0000 2017", 
    "Wed Mar 15 20:54:43 +0000 2017", "Wed Mar 15 14:41:15 +0000 2017", 
    "Wed Mar 15 23:07:43 +0000 2017", "Wed Mar 15 15:41:06 +0000 2017", 
    NA, "Wed Mar 15 11:13:15 +0000 2017", "Wed Mar 15 16:37:13 +0000 2017"
    ), entities.user_mentions = list(structure(list(indices = list(
        c(3L, 16L), c(18L, 30L), c(44L, 55L), c(56L, 71L), c(72L, 
        83L)), screen_name = c("seremgiz8289", "bilge_bilir", 
    "OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), id = c(301944248, 
    2189106581, 2756465282, 2668851081, 2734161237), id_str = c("301944248", 
    "2189106581", "2756465282", "2668851081", "2734161237"), 
        name = c("ATA KIZI HAYIR DİYOR", "Bilge Eryuz", "OduncuTimi ®", 
        "Yalçın Velioğlu", "OPTlMlst_Z")), .Names = c("indices", 
    "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = c(NA, 
    5L)), structure(list(indices = list(c(3L, 16L)), screen_name = "kendimce_ben", 
        id = 2322523731, id_str = "2322523731", name = "İzzet#EVET/\U0001f1f9\U0001f1f7"), .Names = c("indices", 
    "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 12L)), screen_name = "omrolcay", 
            id = 360420809L, id_str = "360420809", name = "Ömer Olcay"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 18L)), screen_name = "mehmet_asassoy", 
            id = 3151503430, id_str = "3151503430", name = "Mehmet Asassoy"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 17L), c(120L, 132L
        )), screen_name = c("sevincbeykent", "yigitbulutt"), 
            id = c(538364458L, 256065299L), id_str = c("538364458", 
            "256065299"), name = c("Sevinç", "YİĞİT BULUT"
            )), .Names = c("indices", "screen_name", "id", "id_str", 
        "name"), class = "data.frame", row.names = 1:2), structure(list(
            indices = list(c(3L, 13L)), screen_name = "AKsamet54", 
            id = 313205928L, id_str = "313205928", name = "Samet ÇELİK"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 18L)), screen_name = "HayataTebessum", 
            id = 2911157237, id_str = "2911157237", name = "Meryem"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(0L, 9L)), screen_name = "4qet1dil", 
            id = 536676261L, id_str = "536676261", name = "KerenGo"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 18L)), screen_name = "akkadinantalya", 
            id = 1898504755L, id_str = "1898504755", name = "AK Kadın Antalya"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 15L)), screen_name = "menes__2010", 
            id = 186968367L, id_str = "186968367", name = "#EVET☪ ياسين ☝"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L)), 
    hashtags = c("hayir", "evet", "evet", "hayir", "c(\"evet\", \"evet\", \"evet\")", 
    "evet", "c(\"dirilise\", \"yukselise\", \"evet\")", "c(\"evet\", \"eri\", \"beli\", \"yes\")", 
    "evet", "evet"), mentions = list(c("seremgiz8289", "bilge_bilir", 
    "OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), "kendimce_ben", 
        "omrolcay", "mehmet_asassoy", c("sevincbeykent", "yigitbulutt"
        ), "AKsamet54", "HayataTebessum", "4qet1dil", "akkadinantalya", 
        "menes__2010")), .Names = c("created_at", "user.screen_name", 
"entities.hashtags", "retweeted_status.created_at", "entities.user_mentions", 
"hashtags", "mentions"), row.names = c(NA, 10L), class = "data.frame")

Solution

  • That is mostly because of the way the hashtags column was generated. It was stored as a list of character vectors, and when coerced to character it gave this structure.

    See for example,

    list(c("A", "B", "C"))
    #[[1]]
    #[1] "A" "B" "C" 
    
    as.character(list(c("A", "B", "C"))) 
    #[1] "c(\"A\", \"B\", \"C\")"
    

    Checking an individual element in your data frame gives the same structure.

    test$hashtags[5]
    #[1] "c(\"evet\", \"evet\", \"evet\")"
    

    So if there is no way you could go back and change the way the hashtags column was generated, you can use grepl instead, and it would save you from the strsplit and sapply calls as well.

    # grepl() searches for the pattern anywhere inside each string, so "evet" is
    # found even in the deparsed "c(\"evet\", ...)" entries without splitting.
    # Note: this is a substring match, so any longer tag that contains "evet"
    # would also be reported as TRUE.
    grepl("evet", test$hashtags)
    #[1] FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE