I was trying to benchmark binary search vs linear search. I'm using C++
.
Here is how I implemented my benchmark:
#include <chrono>
#include <iostream>
#include <vector>
#include <algorithm>
#include <numeric>
using namespace std;
class Solution {
public:
int search(vector<int>& nums, int target) {
int low = 0;
int high = nums.size() - 1;
while (low <= high) {
int mid = (low + high) / 2;
if (target > nums[mid]) {
low = mid+1;
} else if (target < nums[mid]) {
high = mid-1;
} else {
return mid;
}
}
return -1;
}
int naiveSearch(vector<int>& nums, int target) {
for(int i = 0; i < nums.size(); i++) {
if(nums[i] == target) {
return i;
}
}
return -1;
}
};
int main() {
Solution sol;
for(int n = 1000; n <= 10000000; n += 10000) {
// Random target, always gonna be in the array
int target = rand()%n+1;
vector<int> nums(n);
iota(nums.begin(), nums.end(), 1);
auto start = std::chrono::high_resolution_clock::now();
sol.search(nums, target);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
cout << n << "," << duration.count() << ",Binary" << endl;
start = std::chrono::high_resolution_clock::now();
sol.naiveSearch(nums, target);
stop = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
cout << n << "," << duration.count() << ",Naive" << endl;
}
return 0;
}
I then copied everything in a data.csv
and plotted it in python
with this:
import matplotlib.pyplot as plt
import pandas as pd
data = pd.read_csv('data.csv', names=['Size', 'Time', 'Method'])
binary_data = data[data['Method'] == 'Binary']
naive_data = data[data['Method'] == 'Naive']
plt.plot(binary_data['Size'], binary_data['Time'], label='Binary')
plt.plot(naive_data['Size'], naive_data['Time'], label='Naive')
plt.xlabel('Input size')
plt.ylabel('Time (microseconds)')
plt.legend()
plt.show()
I expected a linear increase in time for the Naive
method but I had this graph:
I ran without optimizer g++ -O0 -o output_file.exe hello_world.cpp
in case my optimizer was doing something shady but it didn't change anything.
This is a very interesting question. The answer is that when you generate the target
, you only generate small numbers.
One of the common reasons for getting only small numbers with rand() is that it generates random numbers within a limited range, between 0
and RAND_MAX
(inclusive). The value of this weird RAND_MAX
is an implementation-defined constant, and it must be at least 32767
. This means that the maximum value that can be generated by rand() is RAND_MAX
, and the range of values is from 0
to RAND_MAX
. For some systems, RAND_MAX
might be relatively small (on mine it's 32767
).
Therefore, all of your searches finish really quickly (after at most 32767
executions).
A proper benchmark with a proper random generation would yield this:
Here is how you can know what your RAND_MAX
is:
#include <cstdlib>
#include <iostream>
int main() {
std::cout << RAND_MAX << '\n';
}
And here is proof that on some configurations, RAND_MAX
is big: