I am trying to use the Isolation Forest algorithm in the Solitude package to identify anomalous rows in my data.
I'm using the examples in the documentation to learn about the algorithm; the example below uses the Pima Indians Diabetes dataset.
At the end of the example it provides a dataframe of ids, average_depth and anomaly_score sorted from highest score to lowest.
How can I tie back the results of the model to the original dataset to see the rows with the highest anomaly score?
Here's the example from the package documentation:
library("solitude")
library("tidyverse")
library("mlbench")
# Load the example data and convert it to a tibble.
data(PimaIndiansDiabetes)
PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
PimaIndiansDiabetes

# Drop the outcome column, then split the rows into two halves (prop = 0.5).
splitter <- rsample::initial_split(
  select(PimaIndiansDiabetes, -diabetes),
  prop = 0.5
)
pima_train <- rsample::training(splitter)
pima_test <- rsample::testing(splitter)

# Fit the isolation forest on the training half.
iso <- isolationForest$new()
iso$fit(pima_train)

# Score the training rows, highest anomaly score first.
scores_train <- arrange(iso$predict(pima_train), desc(anomaly_score))
scores_train

# 2-D UMAP embedding of the scaled training data; the scores are attached
# by matching the embedding's row position (`rowid`) to the score `id`.
umap_train <- pima_train %>%
  scale() %>%
  uwot::umap() %>%
  setNames(c("V1", "V2")) %>%
  as_tibble() %>%
  rowid_to_column() %>%
  left_join(scores_train, by = c("rowid" = "id"))
umap_train

# Scatter plot of the embedding with point size driven by anomaly score.
ggplot(umap_train, aes(V1, V2)) +
  geom_point(aes(size = anomaly_score))

# Score the held-out rows, highest anomaly score first.
scores_test <- arrange(iso$predict(pima_test), desc(anomaly_score))
scores_test
Well, this was a bit hard.
Let me know if this code helps you:
library("solitude")
library("tidyverse")
library("mlbench")
# Load the example data and convert it to a tibble.
data(PimaIndiansDiabetes)
PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
PimaIndiansDiabetes

# Drop the outcome column, then split the rows into two halves (prop = 0.5).
# The split is random; call set.seed() first if you need reproducible results.
splitter <- PimaIndiansDiabetes %>%
  select(-diabetes) %>%
  rsample::initial_split(prop = 0.5)
pima_train <- rsample::training(splitter)
pima_test <- rsample::testing(splitter)

# Fit the isolation forest on the training half.
iso <- isolationForest$new()
iso$fit(pima_train)

# Score the training rows, highest anomaly score first.
# NOTE: `id` in the prediction output is the row position within the data
# that was passed to predict(), not the row number in PimaIndiansDiabetes.
scores_train <- pima_train %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))
scores_train

# 2-D UMAP embedding of the scaled training data.
umap_coords <- pima_train %>%
  scale() %>%
  uwot::umap()
# Name the columns explicitly: setNames() on a matrix sets element names,
# not column names, so V1/V2 would only appear via tibble's name repair.
colnames(umap_coords) <- c("V1", "V2")
umap_train <- umap_coords %>%
  as_tibble() %>%
  rowid_to_column() %>%
  left_join(scores_train, by = c("rowid" = "id"))
umap_train

umap_train %>%
  ggplot(aes(V1, V2)) +
  geom_point(aes(size = anomaly_score))

# Score the held-out rows, highest anomaly score first.
scores_test <- pima_test %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))
scores_test

# ---- Tie the scores back to the original rows ----
# Give every original row an explicit id (its row number).
PimaIndiansDiabetes$id <- seq_len(nrow(PimaIndiansDiabetes))

# Original row numbers behind each split, in the order the rows appear in
# pima_train / pima_test (training rows follow splitter$in_id; testing rows
# are assumed to be the remaining rows in ascending order, as in rsample).
train_ids <- splitter$in_id
test_ids <- setdiff(PimaIndiansDiabetes$id, train_ids)

# The score tables were sorted by anomaly score, so their rows are no longer
# in split order. Translate each within-split `id` to the original row number
# by indexing; assigning the id vectors positionally would pair scores with
# the wrong rows.
scores_train$id <- train_ids[scores_train$id]
scores_test$id <- test_ids[scores_test$id]

# Attach the scores to the full original rows (including `diabetes`).
p1 <- PimaIndiansDiabetes %>% inner_join(scores_test, by = "id")
summary(p1)
p2 <- PimaIndiansDiabetes %>% inner_join(scores_train, by = "id")
summary(p2)

# All rows with their scores, most anomalous first.
p3 <- bind_rows(p1, p2) %>%
  arrange(desc(anomaly_score))
p3
summary(p3)
You should get results like these (the exact values will differ between runs because the train/test split is random):
> p3 <- rbind(p1,p2)
>
> as_tibble(p3)
# A tibble: 768 × 12
pregnant glucose pressure triceps insulin mass pedigree age diabetes id average_depth anomaly_score
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <int> <dbl> <dbl>
1 6 148 72 35 0 33.6 0.627 50 pos 1 4.72 0.727
2 8 183 64 0 0 23.3 0.672 32 pos 3 5.21 0.703
3 1 89 66 23 94 28.1 0.167 21 neg 4 6.25 0.655
4 3 78 50 32 88 31 0.248 26 pos 7 6.3 0.653
5 2 197 70 45 543 30.5 0.158 53 pos 9 6.46 0.646
6 8 125 96 0 0 0 0.232 54 pos 10 6.6 0.640
7 7 100 0 0 0 30 0.484 32 pos 16 6.75 0.633
8 0 118 84 47 230 45.8 0.551 31 pos 17 6.77 0.633
9 1 103 30 38 83 43.3 0.183 33 neg 19 6.78 0.632
10 9 119 80 35 0 29 0.263 29 pos 24 6.85 0.629
# … with 758 more rows
>
> summary(p3)
pregnant glucose pressure triceps insulin mass pedigree
Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. :0.0780
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437
Median : 3.000 Median :117.0 Median : 72.00 Median :23.00 Median : 30.5 Median :32.00 Median :0.3725
Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54 Mean : 79.8 Mean :31.99 Mean :0.4719
3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00 Max. :846.0 Max. :67.10 Max. :2.4200
age diabetes id average_depth anomaly_score
Min. :21.00 neg:500 Min. : 1.0 Min. :4.720 Min. :0.5820
1st Qu.:24.00 pos:268 1st Qu.:192.8 1st Qu.:7.680 1st Qu.:0.5832
Median :29.00 Median :384.5 Median :7.910 Median :0.5856
Mean :33.24 Mean :384.5 Mean :7.749 Mean :0.5922
3rd Qu.:41.00 3rd Qu.:576.2 3rd Qu.:7.970 3rd Qu.:0.5947
Max. :81.00 Max. :768.0 Max. :8.000 Max. :0.7266