Search code examples
pythondbscan

MemoryError, is it just because ram or nested for


I have a research about clustering NSL KDD data using DBSCAN in python. When I try to run the program with limit 10.000 data, it says MemoryError while when running all data (NSL KDD has 125.973 rows, 41 columns) it says something like Maximum Dimension reached. It is just because the matter of computer specification (I am using 8GB RAM) or the matter of the code? How to solve this? Last, how to update cluster result of each rows to be saved in mySQL? I am a python newbie i am sorry if you thought i ask a stupid question

def set2List(NumpyArray):
    list = []
    for item in NumpyArray:
        list.append(item.tolist())
    return list 

def GenerateData():
    mydb = pymysql.connect(
    host="localhost",user="root", password="", database="ta")
    mycursor = mydb.cursor()
    mycursor.execute("SELECT * FROM data_trans LIMIT 10000")
    myresult = mycursor.fetchall() 
    final_result= numpy.array(myresult)
    return final_result

def DBSCAN(Dataset, Epsilon,MinumumPoints,DistanceMethod = 'euclidean'):

    m,n=Dataset.shape
    Visited=numpy.zeros(m,'int')
    Type=numpy.zeros(m)

    ClustersList=[]
    Cluster=[]
    PointClusterNumber=numpy.zeros(m)
    PointClusterNumberIndex=1
    PointNeighbors=[]
    DistanceMatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(Dataset, DistanceMethod))
    for i in xrange(m):
       if Visited[i]==0:
          Visited[i]=1
          PointNeighbors=numpy.where(DistanceMatrix[i]<Epsilon)[0]
          if len(PointNeighbors)<MinumumPoints:
            Type[i]=-1
          else:
            for k in xrange(len(Cluster)):
                Cluster.pop()
            Cluster.append(i)
            PointClusterNumber[i]=PointClusterNumberIndex               

            PointNeighbors=set2List(PointNeighbors)    
            ExpandClsuter(Dataset[i], PointNeighbors,Cluster,MinumumPoints,Epsilon,Visited,DistanceMatrix,PointClusterNumber,PointClusterNumberIndex  )
            Cluster.append(PointNeighbors[:])
            ClustersList.append(Cluster[:])
            PointClusterNumberIndex=PointClusterNumberIndex+1

     return PointClusterNumber 

 def ExpandClsuter(PointToExapnd, PointNeighbors, Cluster, MinumumPoints, Epsilon, Visited, DistanceMatrix, PointClusterNumber, PointClusterNumberIndex  ):
     Neighbors=[] 
     for i in PointNeighbors:
       if Visited[i]==0:
         Visited[i]=1
         Neighbors=numpy.where(DistanceMatrix[i]<Epsilon)[0]
         if len(Neighbors)>=MinumumPoints: 
            for j in Neighbors:
                try:
                    PointNeighbors.index(j)
                except ValueError:
                    PointNeighbors.append(j)

         if PointClusterNumber[i]==0:
            Cluster.append(i)
            PointClusterNumber[i]=PointClusterNumberIndex
return

Data=GenerateData()

fig = plt.figure()
ax1=fig.add_subplot(2,1,1) #row, column, figure number

ax1.scatter(Data[:,0],Data[:,1], alpha =  0.5 ) 

Epsilon=300
MinumumPoints=50
result =DBSCAN(Data,Epsilon,MinumumPoints) 
print result     
plt.show()

Error message:

Traceback (most recent call last):

File "<ipython-input-8-20458e6efb7c>", line 1, in <module>
runfile('C:/Users/Ji Min/Downloads/oprek.py', wdir='C:/Users/Ji Min/Downloads')

File "C:\Users\Ji Min\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)

File "C:\Users\Ji Min\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)

File "C:/Users/Ji Min/Downloads/oprek.py", line 95, in <module>
result =DBSCAN(Data,Epsilon,MinumumPoints)

File "C:/Users/Ji Min/Downloads/oprek.py", line 44, in DBSCAN
DistanceMatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(Dataset, DistanceMethod))

File "C:\Users\Ji Min\Anaconda2\lib\site-packages\scipy\spatial\distance.py", line 1652, in pdist
dm = np.empty((m * (m - 1)) // 2, dtype=np.double)

MemoryError

Solution

  • The key is to not compute a distance matrix.

    Distance matrixes need too much memory.

    But that data set is useless anyway. The distances you compute are meaningless, so don't expect the clustering to be much better than that...