Search code examples
r machine-learning anomaly-detection

How can I identify the anomalous records from the Isolation Forest results?


I am trying to use the Isolation Forest algorithm in the Solitude package to identify anomalous rows in my data.

I'm using the examples in the documentation to learn about the algorithm; the example below uses the Pima Indians Diabetes dataset.

At the end of the example it provides a dataframe of ids, average_depth and anomaly_score sorted from highest score to lowest.

How can I tie back the results of the model to the original dataset to see the rows with the highest anomaly score?

Here's the example from the package documentation

library("solitude")
library("tidyverse")
library("mlbench")

# Load the Pima Indians Diabetes data as a tibble and print it.
data(PimaIndiansDiabetes)
PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
PimaIndiansDiabetes

# Drop the outcome column, then split the remaining columns 50/50.
splitter <- rsample::initial_split(
  select(PimaIndiansDiabetes, -diabetes),
  prop = 0.5
)
pima_train <- rsample::training(splitter)
pima_test <- rsample::testing(splitter)

# Fit an isolation forest on the training half.
iso <- isolationForest$new()
iso$fit(pima_train)

# Score the training rows, highest anomaly score first.
scores_train <- arrange(iso$predict(pima_train), desc(anomaly_score))

scores_train

# Embed the scaled training data with UMAP, number the rows, and attach the
# scores by matching the row number to the `id` column of the scores.
umap_train <- pima_train %>%
  scale() %>%
  uwot::umap() %>%
  setNames(c("V1", "V2")) %>%
  as_tibble() %>%
  rowid_to_column()
umap_train <- left_join(umap_train, scores_train, by = c("rowid" = "id"))

umap_train

# Plot the embedding with point size mapped to the anomaly score.
ggplot(umap_train, aes(V1, V2)) +
  geom_point(aes(size = anomaly_score))

# Score the held-out rows, highest anomaly score first.
scores_test <- arrange(iso$predict(pima_test), desc(anomaly_score))

scores_test

Solution

  • Well, this was a bit hard.

    Let me know if this code helps you:

    
    library("solitude")
    library("tidyverse")
    library("mlbench")
    
    # Pima Indians Diabetes data, converted to a tibble and printed.
    data(PimaIndiansDiabetes)
    PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
    PimaIndiansDiabetes

    # Split the predictors (everything except `diabetes`) into two halves.
    splitter <- rsample::initial_split(
      select(PimaIndiansDiabetes, -diabetes),
      prop = 0.5
    )
    pima_train <- rsample::training(splitter)
    pima_test <- rsample::testing(splitter)

    # Isolation forest fitted on the training half only.
    iso <- isolationForest$new()
    iso$fit(pima_train)

    # Training scores, sorted from most to least anomalous.
    scores_train <- arrange(iso$predict(pima_train), desc(anomaly_score))

    scores_train

    # UMAP embedding of the scaled training data with a row number column,
    # joined to the training scores on row number = score `id`.
    umap_train <- pima_train %>%
      scale() %>%
      uwot::umap() %>%
      setNames(c("V1", "V2")) %>%
      as_tibble() %>%
      rowid_to_column()
    umap_train <- left_join(umap_train, scores_train, by = c("rowid" = "id"))

    umap_train

    # Scatter plot of the embedding; larger points have higher anomaly scores.
    ggplot(umap_train, aes(V1, V2)) +
      geom_point(aes(size = anomaly_score))

    # Test scores, sorted from most to least anomalous.
    scores_test <- arrange(iso$predict(pima_test), desc(anomaly_score))

    scores_test
    
    # Map every score back to its row in the original data.
    #
    # The `id` in each score table is the row position within the data frame
    # that was scored (pima_train or pima_test), not within
    # PimaIndiansDiabetes. Both score tables have also been sorted by
    # anomaly_score, so the original row number has to be looked up through
    # `id`; assigning row numbers in order would attach scores to the wrong
    # rows.
    #
    # The train UMAP embedding is deliberately not joined to scores_test:
    # their row ids refer to different subsets of the data.
    n_rows <- nrow(PimaIndiansDiabetes)
    PimaIndiansDiabetes$id <- seq_len(n_rows)

    # Original row numbers of the training rows (in the order they appear in
    # pima_train) and of the remaining test rows (ascending, as in pima_test).
    train_rows <- splitter$in_id
    test_rows <- setdiff(PimaIndiansDiabetes$id, train_rows)

    # Translate within-subset positions into original row numbers.
    scores_train$id <- train_rows[scores_train$id]
    scores_test$id <- test_rows[scores_test$id]

    # Test rows with their scores.
    p1 <- PimaIndiansDiabetes %>% inner_join(scores_test, by = "id")

    summary(p1)

    # Training rows with their scores.
    p2 <- PimaIndiansDiabetes %>% inner_join(scores_train, by = "id")

    summary(p2)

    # All rows of the original data with average_depth and anomaly_score.
    p3 <- bind_rows(p1, p2)

    as_tibble(p3)

    summary(p3)

    # The original records ordered from most to least anomalous.
    p3 %>% arrange(desc(anomaly_score))
    
    

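    You should get results like these (the exact values will differ between runs, because the train/test split and the forest are random and no seed is set):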

    > p3 <- rbind(p1,p2)
    > 
    > as_tibble(p3)
    # A tibble: 768 × 12
       pregnant glucose pressure triceps insulin  mass pedigree   age diabetes    id average_depth anomaly_score
          <dbl>   <dbl>    <dbl>   <dbl>   <dbl> <dbl>    <dbl> <dbl> <fct>    <int>         <dbl>         <dbl>
     1        6     148       72      35       0  33.6    0.627    50 pos          1          4.72         0.727
     2        8     183       64       0       0  23.3    0.672    32 pos          3          5.21         0.703
     3        1      89       66      23      94  28.1    0.167    21 neg          4          6.25         0.655
     4        3      78       50      32      88  31      0.248    26 pos          7          6.3          0.653
     5        2     197       70      45     543  30.5    0.158    53 pos          9          6.46         0.646
     6        8     125       96       0       0   0      0.232    54 pos         10          6.6          0.640
     7        7     100        0       0       0  30      0.484    32 pos         16          6.75         0.633
     8        0     118       84      47     230  45.8    0.551    31 pos         17          6.77         0.633
     9        1     103       30      38      83  43.3    0.183    33 neg         19          6.78         0.632
    10        9     119       80      35       0  29      0.263    29 pos         24          6.85         0.629
    # … with 758 more rows
    > 
    > summary(p3)
        pregnant         glucose         pressure         triceps         insulin           mass          pedigree     
     Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00   Min.   :  0.0   Min.   : 0.00   Min.   :0.0780  
     1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00   1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437  
     Median : 3.000   Median :117.0   Median : 72.00   Median :23.00   Median : 30.5   Median :32.00   Median :0.3725  
     Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54   Mean   : 79.8   Mean   :31.99   Mean   :0.4719  
     3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00   3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262  
     Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00   Max.   :846.0   Max.   :67.10   Max.   :2.4200  
          age        diabetes        id        average_depth   anomaly_score   
     Min.   :21.00   neg:500   Min.   :  1.0   Min.   :4.720   Min.   :0.5820  
     1st Qu.:24.00   pos:268   1st Qu.:192.8   1st Qu.:7.680   1st Qu.:0.5832  
     Median :29.00             Median :384.5   Median :7.910   Median :0.5856  
     Mean   :33.24             Mean   :384.5   Mean   :7.749   Mean   :0.5922  
     3rd Qu.:41.00             3rd Qu.:576.2   3rd Qu.:7.970   3rd Qu.:0.5947  
     Max.   :81.00             Max.   :768.0   Max.   :8.000   Max.   :0.7266