I have a libsvm vector of the form:
{i_1:v_1; i_2:v_2;...; i_n:v_n}
Where i_j:v_j represent respectively the index and the value. If the value is null then it wont be given any index.
My objective is to compute the euclidean distance between two libsvm vectors. For that I have to convert them to
vector<float>
of the same size. In the following example i'll be showing the function that I used in order to convert the libsvm vector intovector<float>
.
The first column has an index = 2648 and a value = 0.408734 meaning that all the values before it are zeros.
LIBSVM VECTOR = 2648:0.408734;4157:0.609588;6087:0.593104;26747:0.331008
#include <vector>
#include <string>
#include <chrono>
#include <boost/algorithm/string.hpp>
using namespace std;
using namespace chrono;
//convert libsvm vector to float vector in order to compute the similarity
vector<float> splitVector(const vector<string> &);
int main()
{
vector<string> libsvm {"2648:0.408734","4157:0.609588","6087:0.593104","26747:0.331008" };
high_resolution_clock::time_point t1 = high_resolution_clock::now();
vector<float> newVec = splitVector(libsvm);
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::microseconds>( t2 - t1 ).count();
cout <<"construction time: " << duration << endl;
return 0;
}
vector<float> splitVector(const vector<string> & v)
{
int numberofterms = 266373;
vector<float> values;
vector<int> previous_idx;
for(int i = 0; i < v.size(); i++)
{
vector<string> tmpv;
boost::split(tmpv, v[i] , boost::is_any_of(":"));
//idx:value
int idx = atoi(tmpv[0].c_str());
float val = atof(tmpv[1].c_str());
//summation of previous indices
int sum = accumulate(previous_idx.begin(), previous_idx.end(), 0);
int n = idx - (sum + i + 1);
//fill vector with 0s
for(int k = 0; k < n; k++)
values.push_back(0.0);
//add value
values.push_back(val);
previous_idx.push_back(n);
}//end for
int paddingsize = numberofterms - values.size();
for(int i = 0; i < paddingsize;i++)
{
values.push_back(0.0);
}
return values;
}//end function
The timing of the conversion is around 0,00866 seconds and when I have around 1000 vectors it becomes slow. Is there a faster way to convert the libsvm vector into vector<float>
?
values.resize(266373,0.0);
void splitVector(const vector<string> & v, vector<float> & values)
{
vector<string> tmpv;
for(int i = 0; i < v.size(); i++)
{
boost::split(tmpv, v[i] , boost::is_any_of(":"));
//idx:value
int idx = atoi(tmpv[0].c_str());
float val = atof(tmpv[1].c_str());
tmpv.clear();
values[idx] = val;
}//end for
}//end function
You could reduce time cost on memory allocation by reusing vector
s.
To be more specific,
tmpv
by declaring it before the for
loop and call tmpv.clear()
in the beginning of each loopvalues
by values.reserve()
; and pad it by values.resize(266373, 0.0)
instead of repeated push_back()
.previous_idx
if possible. This may has negative impact on the code structure and thus maintainability.