I have a research about clustering NSL KDD data using DBSCAN in python. When I try to run the program with limit 10.000 data, it says MemoryError while when running all data (NSL KDD has 125.973 rows, 41 columns) it says something like Maximum Dimension reached. It is just because the matter of computer specification (I am using 8GB RAM) or the matter of the code? How to solve this? Last, how to update cluster result of each rows to be saved in mySQL? I am a python newbie i am sorry if you thought i ask a stupid question
def set2List(NumpyArray):
list = []
for item in NumpyArray:
list.append(item.tolist())
return list
def GenerateData():
mydb = pymysql.connect(
host="localhost",user="root", password="", database="ta")
mycursor = mydb.cursor()
mycursor.execute("SELECT * FROM data_trans LIMIT 10000")
myresult = mycursor.fetchall()
final_result= numpy.array(myresult)
return final_result
def DBSCAN(Dataset, Epsilon,MinumumPoints,DistanceMethod = 'euclidean'):
m,n=Dataset.shape
Visited=numpy.zeros(m,'int')
Type=numpy.zeros(m)
ClustersList=[]
Cluster=[]
PointClusterNumber=numpy.zeros(m)
PointClusterNumberIndex=1
PointNeighbors=[]
DistanceMatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(Dataset, DistanceMethod))
for i in xrange(m):
if Visited[i]==0:
Visited[i]=1
PointNeighbors=numpy.where(DistanceMatrix[i]<Epsilon)[0]
if len(PointNeighbors)<MinumumPoints:
Type[i]=-1
else:
for k in xrange(len(Cluster)):
Cluster.pop()
Cluster.append(i)
PointClusterNumber[i]=PointClusterNumberIndex
PointNeighbors=set2List(PointNeighbors)
ExpandClsuter(Dataset[i], PointNeighbors,Cluster,MinumumPoints,Epsilon,Visited,DistanceMatrix,PointClusterNumber,PointClusterNumberIndex )
Cluster.append(PointNeighbors[:])
ClustersList.append(Cluster[:])
PointClusterNumberIndex=PointClusterNumberIndex+1
return PointClusterNumber
def ExpandClsuter(PointToExapnd, PointNeighbors, Cluster, MinumumPoints, Epsilon, Visited, DistanceMatrix, PointClusterNumber, PointClusterNumberIndex ):
Neighbors=[]
for i in PointNeighbors:
if Visited[i]==0:
Visited[i]=1
Neighbors=numpy.where(DistanceMatrix[i]<Epsilon)[0]
if len(Neighbors)>=MinumumPoints:
for j in Neighbors:
try:
PointNeighbors.index(j)
except ValueError:
PointNeighbors.append(j)
if PointClusterNumber[i]==0:
Cluster.append(i)
PointClusterNumber[i]=PointClusterNumberIndex
return
Data=GenerateData()
fig = plt.figure()
ax1=fig.add_subplot(2,1,1) #row, column, figure number
ax1.scatter(Data[:,0],Data[:,1], alpha = 0.5 )
Epsilon=300
MinumumPoints=50
result =DBSCAN(Data,Epsilon,MinumumPoints)
print result
plt.show()
Error message:
Traceback (most recent call last):
File "<ipython-input-8-20458e6efb7c>", line 1, in <module>
runfile('C:/Users/Ji Min/Downloads/oprek.py', wdir='C:/Users/Ji Min/Downloads')
File "C:\Users\Ji Min\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Users\Ji Min\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/Ji Min/Downloads/oprek.py", line 95, in <module>
result =DBSCAN(Data,Epsilon,MinumumPoints)
File "C:/Users/Ji Min/Downloads/oprek.py", line 44, in DBSCAN
DistanceMatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(Dataset, DistanceMethod))
File "C:\Users\Ji Min\Anaconda2\lib\site-packages\scipy\spatial\distance.py", line 1652, in pdist
dm = np.empty((m * (m - 1)) // 2, dtype=np.double)
MemoryError
The key is to not compute a distance matrix.
Distance matrixes need too much memory.
But that data set is useless anyway. The distances you compute are meaningless, so don't expect the clustering to be much better than that...