I coded this for an assignment which has passed its deadline.
This implementation works completely fine with various smaller test cases and displays the sizes of the 5 largest Strongly Connected Components in the graph as it should.
But seems to execute forever when i run it on the assignment data set of about 875714 vertices. (Doesn't even come out of the first DFS pass after 60mins)
I've used the non recursive stack implementation of the DFS routine as i heard that the large number of vertices was causing recursion stack overflow problems.
It would be really helpful if anyone could point out, what in this code is making it behave this way with the large dataset.
The input file consists of list of edges in the graph. one edge/line.
(eg):
1 2
2 3
3 1
3 4
5 4
Download link for the Large graph test case zip file
//Macro definitions and Global variables
#define N 875714
#define all(a) (a).begin(), (a).end()
#define tr(c,i) for(typeof((c).begin()) i = (c).begin(); i != (c).end(); i++)
vi v(N), ft, size;
//Non recursive DFS algorithm
void DFS(vvi g, int s, int flag)
{
stack<int> stk;
stk.push(s);
v[s] = 1;
int jumpOut, count;
vi::iterator i;
if(flag == 2)
count = 1;
while(!stk.empty())
{
i = g[stk.top()].begin();
jumpOut = 0;
for(; i != g[stk.top()].end(); i++)
{
if(v[*i] != 1)
{
stk.push(*i);
v[*i] = 1;
if(flag == 2) //Count the SCC size
count++;
jumpOut = 1; //Jump to the while loop's beginning
break;
}
}
if(flag == 1 && jumpOut == 0) //Record the finishing time order of vertices
ft.push_back(stk.top());
if(jumpOut == 0)
stk.pop();
}
if(flag == 2)
size.push_back(count); //Store the SCC size
}
// The 2 pass Kosaraju algorithm
void kosaraju(vvi g, vvi gr)
{
cout<<"\nInside pass 1\n";
for(int i = N - 1; i >= 0; i--)
if(v[i] != 1)
DFS(gr, i, 1);
cout<<"\nPass 1 completed\n";
fill(all(v), 0);
cout<<"\nInside pass 2\n";
for(int i = N - 1; i >= 0; i--)
if(v[ ft[i] ] != 1)
DFS(g, ft[i], 2);
cout<<"\nPass 2 completed\n";
}
.
int main()
{
vvi g(N), gr(N);
ifstream file("/home/tauseef/Desktop/DAA/SCC.txt");
int first, second;
string line;
while(getline(file,line,'\n')) //Reading from file
{
stringstream ss(line);
ss >> first;
ss >> second;
if(first == second) //Eliminating self loops
continue;
g[first-1].push_back(second-1); //Creating G & Grev
gr[second-1].push_back(first-1);
}
cout<<"\nfile read successfully\n";
kosaraju(g, gr);
cout<<"\nFinishing order is: ";
tr(ft, j)
cout<<*j+1<<" ";
cout<<"\n";
sort(size.rbegin(), size.rend()); //Sorting the SCC sizes in descending order
cout<<"\nThe largest 5 SCCs are: ";
tr(size, j)
cout<<*j<<" ";
cout<<"\n";
file.close();
}
There are several improvements that you can apply:
1- cin
is not as fast scanf
for large inputs: Because your input file is huge you better use scanf
to read your data.
2- It is not a good idea to pass large data to functions by value: You have two huge graphs in your code that you pass them to functions by value. It takes a lot of time because every time you are making a copy of the data.
3- There is no need to use iterator
for traversing a vector
: Because you are using a vector
and you have random access to it via []
operator there is no need to use iterator
to access data.
4- Your DFS is not efficient: This is the most important one. Every time the program go to the beginning of the while
and check the adjacency list of the element on top of the stack
you start from the beginning and check elements. This make the algorithm very inefficient because you are checking some things over and over again. You can simply store how many of the children you have checked and when you go back to this element you start from the next element instead of starting from start.
#include<iostream>
#include<vector>
#include<stack>
#include<algorithm>
#include<fstream>
#include<string>
#include<sstream>
using namespace std;
typedef vector<int> vi;
typedef vector<vi> vvi;
#define N 875714
#define sz(a) int((a).size())
#define all(a) (a).begin(), (a).end()
#define tr(c,i) for(typeof((c).begin()) i = (c).begin(); i != (c).end(); i++)
vi v(N), ft, size;
vi childsVisited(N);
void DFS(vvi &g, int s, int flag)
{
stack<int> stk;
stk.push(s);
v[s] = 1;
int jumpOut, count;
if(flag == 2)
count = 1;
int counter = 0;
while(!stk.empty())
{
jumpOut = 0;
int cur = stk.top();
for ( ;childsVisited[cur] < g[cur].size(); ++childsVisited[cur] )
//for ( int i=0; i< g[cur].size(); ++i )
//for(; i != g[stk.top()].end(); i++)
{
int i = childsVisited[cur];
int next = g[cur][i];
if(v[next] != 1)
{
stk.push(next);
v[next] = 1;
if(flag == 2) //Count the SCC size
count++;
jumpOut = 1; //Jump to the while loop's beginning
break;
}
}
if(flag == 1 && jumpOut == 0) //Record the finishing time order of vertices
ft.push_back(stk.top());
if(jumpOut == 0)
stk.pop();
}
if(flag == 2)
size.push_back(count); //Store the SCC size
}
void kosaraju(vvi &g, vvi &gr)
{
cout<<"\nInside pass 1\n";
for(int i = N - 1; i >= 0; i--)
if(v[i] != 1)
DFS(gr, i, 1);
cout<<"\nPass 1 completed\n";
fill(all(v), 0);
fill(all(childsVisited), 0);
cout<<"\nInside pass 2\n";
for(int i = N - 1; i >= 0; i--)
if(v[ ft[i] ] != 1)
DFS(g, ft[i], 2);
cout<<"\nPass 2 completed\n";
}
int main()
{
freopen("input.txt","r",stdin);
vvi g(N), gr(N);
//ifstream file("/home/tauseef/Desktop/DAA/SCC.txt");
int first, second;
//string line;
unsigned long int cnt = 0;
//while(getline(file,line,'\n')) //Reading from file
//{
//stringstream ss(line);
//ss >> first;
//ss >> second;
//if(first == second) //Eliminating self loops
//continue;
for ( int i = 0; i < 5105043; ++i ){
int first, second;
scanf("%d %d",&first,&second);
g[first-1].push_back(second-1); //Creating G & Grev
gr[second-1].push_back(first-1);
}
//cnt++;
//}
cout<<"\nfile read successfully\n";
kosaraju(g, gr);
cout<<"\nFinishing order is: ";
sort(size.rbegin(), size.rend()); //Sorting the SCC sizes in descending order
cout<<"\nThe largest 5 SCCs are: ";
}