What is the preferred way to compute a confusion matrix with OpenCV and C++?
Starting from:
int TP = 0,FP = 0,FN = 0,TN = 0;
cv::Mat truth(60,60, CV_8UC1);
cv::Mat detections(60,60, CV_8UC1);
this->loadResults(truth, detections); // loadResults(cv::Mat& t, cv::Mat& d);
I tried a few different options like:
Direct Calls:
for(int r = 0; r < detections.rows; ++r)
    for(int c = 0; c < detections.cols; ++c)
    {
        int d, t;
        d = detections.at<unsigned char>(r,c);
        t = truth.at<unsigned char>(r,c);
        if(d&&t) ++TP;
        if(d&&!t) ++FP;
        if(!d&&t) ++FN;
        if(!d&&!t) ++TN;
    }
RAM heavy matrix logic:
{
    cv::Mat truePos = detections.mul(truth);
    TP = cv::countNonZero(truePos);
}
{
    cv::Mat falsePos = detections.mul(~truth);
    FP = cv::countNonZero(falsePos);
}
{
    cv::Mat falseNeg = truth.mul(~detections);
    FN = cv::countNonZero(falseNeg);
}
{
    cv::Mat trueNeg = (~truth).mul(~detections);
    TN = cv::countNonZero(trueNeg);
}
forEach:
auto lambda = [&](unsigned char d, const int pos[]){
    cv::Point2i pt(pos[1], pos[0]);
    unsigned char t = truth.at<unsigned char>(pt);
    if(d&&t) ++TP;
    if(d&&!t) ++FP;
    if(!d&&t) ++FN;
    if(!d&&!t) ++TN;
};
detections.forEach<unsigned char>(lambda);
But is there a standard way of doing it? I might have missed a simple function in OpenCV docs.
P.S. I used VS2010 x64.
In short, none of the three.
Before we begin, let's define a simple struct to hold our results:
struct result_t
{
    int TP;
    int FP;
    int FN;
    int TN;
};
This will let us wrap each implementation in a function with the following signature, in order to simplify testing and performance evaluation. (Note that I use cv::Mat1b to make it explicit that we want only mats of type CV_8UC1):
result_t conf_mat_x(cv::Mat1b truth, cv::Mat1b detections);
I will measure performance with randomly generated data of size 4096 x 4096.
I'm using MSVS2013 64-bit with OpenCV 3.1 here. Sorry, I don't have MSVS2010 set up with OpenCV ready for testing this, and the timing code uses C++11, so you may need to modify that in order to compile.
An updated version of your code (variant 1) looks like this:
result_t conf_mat_1a(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);
    result_t result = { 0 };

    for (int r(0); r < detections.rows; ++r) {
        for (int c(0); c < detections.cols; ++c) {
            int d(detections.at<uchar>(r, c));
            int t(truth.at<uchar>(r, c));
            if (d&&t) { ++result.TP; }
            if (d&&!t) { ++result.FP; }
            if (!d&&t) { ++result.FN; }
            if (!d&&!t) { ++result.TN; }
        }
    }

    return result;
}
Performance and results:
#0: min=120.017 mean=123.258 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
The main problem here is that this (especially with VS2010) is unlikely to get auto-vectorized, so it will be rather slow. Taking advantage of SIMD could potentially allow up to an order of magnitude speedup. Additionally, the repeated calls to cv::Mat::at add some overhead as well.
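If you want to stay with an explicit loop, a row-pointer variant along these lines (just a sketch, not part of the measurements below; the name conf_mat_1b is mine) avoids the per-element at lookups, although it still relies on the compiler for any vectorization:
result_t conf_mat_1b(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);
    result_t result = { 0 };

    for (int r(0); r < detections.rows; ++r) {
        // One pointer fetch per row instead of one at() call per element.
        const uchar* d_row(detections.ptr<uchar>(r));
        const uchar* t_row(truth.ptr<uchar>(r));
        for (int c(0); c < detections.cols; ++c) {
            bool d(d_row[c] != 0);
            bool t(t_row[c] != 0);
            if (d && t) { ++result.TP; }
            if (d && !t) { ++result.FP; }
            if (!d && t) { ++result.FN; }
            if (!d && !t) { ++result.TN; }
        }
    }

    return result;
}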
There really isn't much more to gain from this approach; we should be able to do better. Let's look at your matrix-logic approach (variant 2) next.
Code:
result_t conf_mat_2a(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);
    result_t result = { 0 };

    {
        cv::Mat truePos = detections.mul(truth);
        result.TP = cv::countNonZero(truePos);
    }
    {
        cv::Mat falsePos = detections.mul(~truth);
        result.FP = cv::countNonZero(falsePos);
    }
    {
        cv::Mat falseNeg = truth.mul(~detections);
        result.FN = cv::countNonZero(falseNeg);
    }
    {
        cv::Mat trueNeg = (~truth).mul(~detections);
        result.TN = cv::countNonZero(trueNeg);
    }

    return result;
}
Performance and results:
#1: min=63.993 mean=68.674 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
This is already about twice as fast, even though there's a lot of unnecessary work being done.
Multiplication (with saturation) seems like overkill -- a bitwise_and will do the job just as well and can potentially shave off a little bit of time.
A huge overhead is imposed by the redundant matrix allocations. Instead of allocating a new matrix for each of truePos, falsePos, falseNeg and trueNeg, we can reuse the same cv::Mat for all 4 cases. Since the shape and data type are always the same, this means only 1 allocation will happen instead of 4.
Code:
result_t conf_mat_2b(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);
    result_t result = { 0 };

    cv::Mat temp;

    cv::bitwise_and(detections, truth, temp);
    result.TP = cv::countNonZero(temp);

    cv::bitwise_and(detections, ~truth, temp);
    result.FP = cv::countNonZero(temp);

    cv::bitwise_and(~detections, truth, temp);
    result.FN = cv::countNonZero(temp);

    cv::bitwise_and(~detections, ~truth, temp);
    result.TN = cv::countNonZero(temp);

    return result;
}
Performance and results:
#2: min=50.995 mean=52.440 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
Time needed has been reduced by ~20% compared to conf_mat_2a.
Next, notice that you're calculating ~truth and ~detections twice. Hence, we can eliminate those operations, along with 2 additional allocations, by reusing the inverted mats as well.
NB: The memory usage won't change -- we needed 3 temporary arrays before, and that's still the case.
Code:
result_t conf_mat_2c(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);
    result_t result = { 0 };

    cv::Mat inv_truth(~truth);
    cv::Mat inv_detections(~detections);
    cv::Mat temp;

    cv::bitwise_and(detections, truth, temp);
    result.TP = cv::countNonZero(temp);

    cv::bitwise_and(detections, inv_truth, temp);
    result.FP = cv::countNonZero(temp);

    cv::bitwise_and(inv_detections, truth, temp);
    result.FN = cv::countNonZero(temp);

    cv::bitwise_and(inv_detections, inv_truth, temp);
    result.TN = cv::countNonZero(temp);

    return result;
}
Performance and results:
#3: min=37.997 mean=38.569 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
Time needed has been reduced by ~40% compared to conf_mat_2a.
There's still potential for improvement. Let's make some observations.
- element_count == rows * cols, where rows and cols represent the height and width of the cv::Mat (we can use cv::Mat::total()).
- TP + FP + FN + TN == element_count, since each element belongs to exactly 1 of the 4 sets.
- positive_count is the number of non-zero elements in detections.
- negative_count is the number of zero elements in detections.
- positive_count + negative_count == element_count, since each element belongs to exactly 1 of the 2 sets.
- TP + FP == positive_count
- TN + FN == negative_count
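As a quick sanity check with the numbers from the benchmark output above: positive_count = TP + FP = 4192029 + 4195489 = 8387518, negative_count = FN + TN = 4194580 + 4195118 = 8389698, and 8387518 + 8389698 = 16777216 = 4096 * 4096 = element_count.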
Using this information we can calculate TN with simple arithmetic, thus eliminating one bitwise_and and one countNonZero. We can similarly calculate FP, eliminating another bitwise_and, and use the second countNonZero to calculate positive_count instead.
Since we eliminated both uses of inv_truth, we can drop it as well.
NB: The memory usage has been reduced -- we now only have 2 temporary arrays.
Code:
result_t conf_mat_2d(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);
    result_t result = { 0 };

    cv::Mat1b inv_detections(~detections);
    int positive_count(cv::countNonZero(detections));
    int negative_count(static_cast<int>(truth.total()) - positive_count);
    cv::Mat1b temp;

    cv::bitwise_and(truth, detections, temp);
    result.TP = cv::countNonZero(temp);
    result.FP = positive_count - result.TP;

    cv::bitwise_and(truth, inv_detections, temp);
    result.FN = cv::countNonZero(temp);
    result.TN = negative_count - result.FN;

    return result;
}
Performance and results:
#4: min=22.494 mean=22.831 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
Time needed has been reduced by ~65% compared to conf_mat_2a.
Finally, since we only need inv_detections once, we can reuse temp to store it, getting rid of one more allocation and further reducing the memory footprint.
NB: The memory usage has been reduced -- we now only have 1 temporary array.
Code:
result_t conf_mat_2e(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);
    result_t result = { 0 };

    int positive_count(cv::countNonZero(detections));
    int negative_count(static_cast<int>(truth.total()) - positive_count);
    cv::Mat1b temp;

    cv::bitwise_and(truth, detections, temp);
    result.TP = cv::countNonZero(temp);
    result.FP = positive_count - result.TP;

    cv::bitwise_not(detections, temp);
    cv::bitwise_and(truth, temp, temp);
    result.FN = cv::countNonZero(temp);
    result.TN = negative_count - result.FN;

    return result;
}
Performance and results:
#5: min=16.999 mean=17.391 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
Time needed has been reduced by ~72% compared to conf_mat_2a.
The forEach approach (variant 3) again suffers from the same issue as variant 1: it's unlikely to get vectorized, so it will be relatively slow.
The major issue with your implementation, though, is that forEach runs the function in parallel on multiple slices of the input without any synchronization, so as written it returns incorrect results.
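For completeness, one way to make the forEach version at least produce correct results (a sketch only, not part of the measurements below; it needs <atomic>, uses Mat_::forEach from OpenCV 3.x, and the name conf_mat_3a is mine) is to count into std::atomic variables. Contention on four shared counters will likely eat most of the parallel gains, though:
result_t conf_mat_3a(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);

    // Atomic counters provide the synchronization the original lambda lacks.
    std::atomic<int> TP(0), FP(0), FN(0), TN(0);

    detections.forEach([&](uchar& d, const int pos[]) {
        uchar t(truth(pos[0], pos[1]));
        if (d && t) { ++TP; }
        if (d && !t) { ++FP; }
        if (!d && t) { ++FN; }
        if (!d && !t) { ++TN; }
    });

    result_t result = { TP.load(), FP.load(), FN.load(), TN.load() };
    return result;
}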
However, the idea of parallelizing could be applied with some effort to the best of Variant 2.
Let's improve on conf_mat_2e by taking advantage of cv::parallel_for_. The simplest way to distribute the load between worker threads is to do it on a row-by-row basis.
We can avoid the need for synchronization by sharing an intermediate cv::Mat3i which holds TP, FP, and FN for each row (recall that TN can be calculated from the other 3 at the end). Since each row is only processed by a single worker thread, we don't need to synchronize. Once all the rows have been processed, a simple cv::sum gives us the total TP, FP, and FN; TN is then calculated from those.
NB: We can again reduce the memory requirements -- we need one buffer spanning a single row for each worker thread, plus 3 * rows integers to store the intermediate results.
Code:
class ParallelConfMat : public cv::ParallelLoopBody
{
public:
    enum
    {
        TP = 0
        , FP = 1
        , FN = 2
    };

    ParallelConfMat(cv::Mat1b& truth, cv::Mat1b& detections, cv::Mat3i& result)
        : truth_(truth)
        , detections_(detections)
        , result_(result)
    {
    }

    ParallelConfMat& operator=(ParallelConfMat const&)
    {
        return *this;
    }

    virtual void operator()(cv::Range const& range) const
    {
        cv::Mat1b temp;
        for (int r(range.start); r < range.end; r++) {
            cv::Mat1b detections(detections_.row(r));
            cv::Mat1b truth(truth_.row(r));
            cv::Vec3i& result(result_.at<cv::Vec3i>(r));

            int positive_count(cv::countNonZero(detections));
            int negative_count(static_cast<int>(truth.total()) - positive_count);

            cv::bitwise_and(truth, detections, temp);
            result[TP] = cv::countNonZero(temp);
            result[FP] = positive_count - result[TP];

            cv::bitwise_not(detections, temp);
            cv::bitwise_and(truth, temp, temp);
            result[FN] = cv::countNonZero(temp);
        }
    }

private:
    cv::Mat1b& truth_;
    cv::Mat1b& detections_;
    cv::Mat3i& result_; // TP, FP, FN per row
};
result_t conf_mat_4(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);
    result_t result = { 0 };

    cv::Mat3i partial_results(truth.rows, 1);
    cv::parallel_for_(cv::Range(0, truth.rows)
        , ParallelConfMat(truth, detections, partial_results));

    cv::Scalar reduced_results = cv::sum(partial_results);

    result.TP = static_cast<int>(reduced_results[ParallelConfMat::TP]);
    result.FP = static_cast<int>(reduced_results[ParallelConfMat::FP]);
    result.FN = static_cast<int>(reduced_results[ParallelConfMat::FN]);
    result.TN = static_cast<int>(truth.total()) - result.TP - result.FP - result.FN;

    return result;
}
Performance and results:
#6: min=1.496 mean=1.966 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
This is running on a 6-core CPU with HT enabled (i.e. 12 threads).
The runtime has been reduced by ~97.5% compared to conf_mat_2a.
For very small inputs this may be sub-optimal; the ideal implementation is likely a combination of some of those approaches, delegating based on the input size.
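For example, a thin dispatcher along these lines (a sketch; the threshold is a made-up value you would tune by benchmarking on your own hardware) could pick the cheap single-threaded version for small mats and the parallel one otherwise:
result_t conf_mat(cv::Mat1b truth, cv::Mat1b detections)
{
    // Hypothetical cut-off; below it the parallel_for_ overhead tends to dominate.
    const size_t SMALL_INPUT_LIMIT(256 * 256);

    if (truth.total() <= SMALL_INPUT_LIMIT) {
        return conf_mat_2e(truth, detections); // single-threaded path
    }
    return conf_mat_4(truth, detections); // parallel path for large inputs
}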
Test code:
#include <opencv2/opencv.hpp>

#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <vector>

using std::chrono::high_resolution_clock;
using std::chrono::duration_cast;
using std::chrono::microseconds;

struct result_t
{
    int TP;
    int FP;
    int FN;
    int TN;
};
/******** PASTE all the conf_mat_xx functions here *********/
int main()
{
    int ROWS(4 * 1024), COLS(4 * 1024), ITERS(32);

    cv::Mat1b truth(ROWS, COLS);
    cv::randu(truth, 0, 2);
    truth *= 255;

    cv::Mat1b detections(ROWS, COLS);
    cv::randu(detections, 0, 2);
    detections *= 255;

    typedef result_t(*conf_mat_fn)(cv::Mat1b, cv::Mat1b);

    struct test_info
    {
        conf_mat_fn fn;
        std::vector<double> d;
        result_t r;
    };

    std::vector<test_info> info;
    info.push_back({ conf_mat_1a });
    info.push_back({ conf_mat_2a });
    info.push_back({ conf_mat_2b });
    info.push_back({ conf_mat_2c });
    info.push_back({ conf_mat_2d });
    info.push_back({ conf_mat_2e });
    info.push_back({ conf_mat_4 });

    // Warm-up
    for (int n(0); n < info.size(); ++n) {
        info[n].fn(truth, detections);
    }

    for (int i(0); i < ITERS; ++i) {
        for (int n(0); n < info.size(); ++n) {
            high_resolution_clock::time_point t1 = high_resolution_clock::now();
            info[n].r = info[n].fn(truth, detections);
            high_resolution_clock::time_point t2 = high_resolution_clock::now();
            info[n].d.push_back(static_cast<double>(duration_cast<microseconds>(t2 - t1).count()) / 1000.0);
        }
    }

    for (int n(0); n < info.size(); ++n) {
        std::cout << "#" << n << ":"
            << std::fixed << std::setprecision(3)
            << "\tmin=" << *std::min_element(info[n].d.begin(), info[n].d.end())
            << "\tmean=" << cv::mean(info[n].d)[0]
            << "\tTP=" << info[n].r.TP
            << "\tFP=" << info[n].r.FP
            << "\tTN=" << info[n].r.TN
            << "\tFN=" << info[n].r.FN
            << "\tTotal=" << (info[n].r.TP + info[n].r.FP + info[n].r.TN + info[n].r.FN)
            << "\n";
    }
}
Performance and results from MSVS2015, Win64, OpenCV 3.4.3:
#0: min=119.797 mean=121.769 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
#1: min=64.130 mean=65.086 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
#2: min=51.152 mean=51.758 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
#3: min=37.781 mean=38.357 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
#4: min=22.329 mean=22.637 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
#5: min=17.029 mean=17.297 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216
#6: min=1.827 mean=2.017 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216