Tags: python, machine-learning, classification, weka, cross-validation

How can I do cross-validation for an AttributeSelectedClassifier model?


I built a model like this:

base = Classifier(classname="weka.classifiers.trees.ADTree", 
                  options=["-B", "10", "-E", "-3", "-S", "1"])

CostS_cls = SingleClassifierEnhancer(classname="weka.classifiers.meta.CostSensitiveClassifier", 
                                options =["-cost-matrix", "[0.0 1.0; 1.0 0.0]", "-S", "1"])
CostS_cls.classifier = base
smote = Filter(classname="weka.filters.supervised.instance.SMOTE", 
               options=["-C", "0", "-K", "3", "-P", "250.0", "-S", "1"])
fc = FilteredClassifier(options=["-S","1"])
fc.filter = smote
fc.classifier = CostS_cls
bagging_cls = SingleClassifierEnhancer(classname="weka.classifiers.meta.Bagging",
                         options=["-P", "100", "-S", "1", "-num-slots", "1", "-I", "100"])
bagging_cls.classifier = fc
multisearch_cls = MultiSearch(options = ["-S", "1"])
multisearch_cls.evaluation = "FM"
multisearch_cls.search = ["-sample-size", "100", "-initial-folds", "2", "-subsequent-folds", "10",
                          "-initial-test-set", ".", "-subsequent-test-set", ".", "-num-slots", "1"]                        
mparam = MathParameter()
mparam.prop = "numOfBoostingIterations"
mparam.minimum = 5.0
mparam.maximum = 50.0
mparam.step = 1.0
mparam.base = 10.0
mparam.expression = "I"
multisearch_cls.parameters = [mparam]
multisearch_cls.classifier = bagging_cls
AttS_cls = AttributeSelectedClassifier()
AttS_cls.search = from_commandline('weka.attributeSelection.GreedyStepwise -B -T -1.7976931348623157E308 -N -1 -num-slots 1', classname=get_classname(ASSearch))
AttS_cls.evaluation = from_commandline('weka.attributeSelection.CfsSubsetEval -P 1 -E 1', classname=get_classname(ASEvaluation))
AttS_cls.classifier = multisearch_cls
train, test = data_modelos_1_2.train_test_split(70.0, Random(1))
AttS_cls.build_classifier(train)

and I'm trying to validate it with cross-validation, but when I do this:

train, test = data_modelos_1_2.train_test_split(70.0, Random(1))
AttS_cls.build_classifier(train)
evl = Evaluation(test)
evl.crossvalidate_model(AttS_cls, test, 10, Random(1))

I'm getting this error:

---------------------------------------------------------------------------
JavaException                             Traceback (most recent call last)
/tmp/ipykernel_50548/1197040560.py in <module>
     47 print(AttS_cls.to_commandline())
     48 evl = Evaluation(test)
---> 49 evl.crossvalidate_model(AttS_cls, test, 10, Random(1))
     50 print(AttS_cls)
     51 print("----------------------------------------------------------------------------")

/usr/local/lib/python3.8/dist-packages/weka/classifiers.py in crossvalidate_model(self, classifier, data, num_folds, rnd, output)
   1289         else:
   1290             generator = [output.jobject]
-> 1291         javabridge.call(
   1292             self.jobject, "crossValidateModel",
   1293             "(Lweka/classifiers/Classifier;Lweka/core/Instances;ILjava/util/Random;[Ljava/lang/Object;)V",

~/.local/lib/python3.8/site-packages/javabridge/jutil.py in call(o, method_name, sig, *args)
    890     ret_sig = sig[sig.find(')')+1:]
    891     nice_args = get_nice_args(args, args_sig)
--> 892     result = fn(*nice_args)
    893     x = env.exception_occurred()
    894     if x is not None:

~/.local/lib/python3.8/site-packages/javabridge/jutil.py in fn(*args)
    857             x = env.exception_occurred()
    858             if x is not None:
--> 859                 raise JavaException(x)
    860             return result
    861     else:

JavaException: Thread-based execution of evaluation tasks failed!

So I don't know what I'm doing wrong, because I know that in Weka you can cross-validate these types of models, but I'm trying it with pyweka (python-weka-wrapper) and running into this problem.


Solution

  • I have turned your code snippet into one with the necessary imports and fixed the MultiSearch setup for Bagging (mparam.prop = "numIterations" instead of mparam.prop = "numOfBoostingIterations", since numIterations is the property that Bagging exposes for its number of iterations), allowing it to be executed.
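
    (Not part of the original fix, but if you want to double-check which option/property a Weka classifier exposes, one way, assuming the JVM is already running, is to print the classifier's help text with to_help(); for Bagging this lists the -I option, which corresponds to the numIterations property:)

    # hypothetical check, assuming jvm.start(packages=True) has already been called
    from weka.classifiers import Classifier

    bagging = Classifier(classname="weka.classifiers.meta.Bagging")
    print(bagging.to_help())  # prints Bagging's description and options, including -I (number of iterations)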

    Since I do not have access to your data, I just used the UCI dataset vote.arff.

    Your code was a bit odd, as it did a 70/30 train/test split, trained the classifier and then performed cross-validation on the test data. For cross-validation you do not train the classifier, as this happens within the internal cross-validation loop (each trained classifier inside that loop gets discarded, as cross-validation is only used for gathering statistics).
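
    To make the difference concrete, here is a minimal sketch of the two idioms, using the placeholder names cls and data for the classifier setup and the loaded dataset (the full listing below uses exactly the same calls):

    from weka.classifiers import Evaluation
    from weka.core.classes import Random

    # cls: any Classifier instance, data: loaded Instances with the class attribute set

    # cross-validation: do not call build_classifier yourself;
    # crossvalidate_model trains and discards a copy of the classifier for each fold internally
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print(evl.summary())

    # explicit train/test split: here you do build the classifier yourself
    train, test = data.train_test_split(70.0, Random(1))
    cls.build_classifier(train)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print(evl.summary())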

    The code below has therefore three parts:

    1. your original evaluation code, but commented out
    2. performing proper cross-validation
    3. performing train/test evaluation

    I do not use Jupyter notebooks and tested the code successfully in a regular virtual environment on my Linux Mint machine:

    • Python: 3.8.10
    • Output of pip freeze:
      numpy==1.22.3
      packaging==21.3
      pyparsing==3.0.7
      python-javabridge==4.0.3
      python-weka-wrapper3==0.2.7
      

    The modified code itself:

    import weka.core.jvm as jvm
    from weka.core.converters import load_any_file
    from weka.classifiers import Classifier, SingleClassifierEnhancer, FilteredClassifier, MultiSearch, AttributeSelectedClassifier, Evaluation
    from weka.core.classes import MathParameter, from_commandline, Random, get_classname
    from weka.filters import Filter
    from weka.attribute_selection import ASEvaluation, ASSearch
    
    jvm.start(packages=True)
    
    # the dataset/path needs adjusting
    data_modelos_1_2 = load_any_file("/some/where/vote.arff")
    data_modelos_1_2.class_is_last()
    
    base = Classifier(classname="weka.classifiers.trees.ADTree",
                      options=["-B", "10", "-E", "-3", "-S", "1"])
    
    CostS_cls = SingleClassifierEnhancer(classname="weka.classifiers.meta.CostSensitiveClassifier",
                                         options=["-cost-matrix", "[0.0 1.0; 1.0 0.0]", "-S", "1"])
    CostS_cls.classifier = base
    smote = Filter(classname="weka.filters.supervised.instance.SMOTE",
                   options=["-C", "0", "-K", "3", "-P", "250.0", "-S", "1"])
    fc = FilteredClassifier(options=["-S", "1"])
    fc.filter = smote
    fc.classifier = CostS_cls
    bagging_cls = SingleClassifierEnhancer(classname="weka.classifiers.meta.Bagging",
                                           options=["-P", "100", "-S", "1", "-num-slots", "1", "-I", "100"])
    bagging_cls.classifier = fc
    multisearch_cls = MultiSearch(options=["-S", "1"])
    multisearch_cls.evaluation = "FM"
    multisearch_cls.search = ["-sample-size", "100", "-initial-folds", "2", "-subsequent-folds", "10",
                              "-initial-test-set", ".", "-subsequent-test-set", ".", "-num-slots", "1"]
    mparam = MathParameter()
    mparam.prop = "numIterations"
    mparam.minimum = 5.0
    mparam.maximum = 50.0
    mparam.step = 1.0
    mparam.base = 10.0
    mparam.expression = "I"
    multisearch_cls.parameters = [mparam]
    multisearch_cls.classifier = bagging_cls
    
    AttS_cls = AttributeSelectedClassifier()
    AttS_cls.search = from_commandline('weka.attributeSelection.GreedyStepwise -B -T -1.7976931348623157E308 -N -1 -num-slots 1', classname=get_classname(ASSearch))
    AttS_cls.evaluation = from_commandline('weka.attributeSelection.CfsSubsetEval -P 1 -E 1', classname=get_classname(ASEvaluation))
    AttS_cls.classifier = multisearch_cls
    
    # original
    # train, test = data_modelos_1_2.train_test_split(70.0, Random(1))
    # AttS_cls.build_classifier(train)
    # evl = Evaluation(test)
    # evl.crossvalidate_model(AttS_cls, test, 10, Random(1))
    # print(evl.summary())
    
    # cross-validation
    print("\ncross-validation\n")
    evl = Evaluation(data_modelos_1_2)
    evl.crossvalidate_model(AttS_cls, data_modelos_1_2, 10, Random(1))
    print(evl.summary())
    
    # train/test split
    print("\ntrain/test split\n")
    train, test = data_modelos_1_2.train_test_split(70.0, Random(1))
    AttS_cls.build_classifier(train)
    evl = Evaluation(test)
    evl.test_model(AttS_cls, test)
    print(evl.summary())
    
    jvm.stop()
    

    This generated the following output:

    cross-validation
    
    
    Correctly Classified Instances         416               95.6322 %
    Incorrectly Classified Instances        19                4.3678 %
    Kappa statistic                          0.9094
    Mean absolute error                      0.0737
    Root mean squared error                  0.1778
    Relative absolute error                 15.5353 %
    Root relative squared error             36.5084 %
    Total Number of Instances              435     
    
    
    train/test split
    
    
    Correctly Classified Instances         126               96.1832 %
    Incorrectly Classified Instances         5                3.8168 %
    Kappa statistic                          0.9216
    Mean absolute error                      0.0735
    Root mean squared error                  0.1649
    Relative absolute error                 15.3354 %
    Root relative squared error             33.6949 %
    Total Number of Instances              131