Search code examples
python numpy scipy sparse-matrix

Create dense matrix from sparse matrix efficiently (numpy/scipy but NO sklearn)


I have a sparse.txt that looks like this:

# first column is label 0 or 1
# rest of the data is sparse data
# maximum column index in the data is 4, so the future dense matrix will
# have 1+4 = 5 elements in a row
# file: sparse.txt
1 1:1 2:1 3:1
0 1:1 4:1
1 2:1 3:1 4:1

The required dense.txt is this:

# required file: dense.txt
1 1 1 1 0
0 1 0 0 1
1 0 1 1 1

Without using scipy coo_matrix, I did it in a simple way like this:

def create_dense(fsparse, fdense, fvocab):
    """Expand a sparse ``label idx:val ...`` text file into a dense 0/1 file.

    Each input line starts with an integer label followed by ``idx:val``
    pairs.  The output line holds the label in column 0 and a 1 in every
    column named by an ``idx``; all other columns are 0.  The ``val`` part of
    each pair is ignored.  Indices outside ``0..lvocab`` are silently dropped.

    Args:
        fsparse: Path of the sparse input file.
        fdense: Path of the dense output file (overwritten).
        fvocab: Path of the vocabulary file; its line count fixes the number
            of feature columns.

    Raises:
        IndexError: If an input line contains no tokens.
        ValueError: If the label or an index is not an integer.
    """
    # Count vocabulary lines inside a context manager so the handle is closed.
    with open(fvocab) as fv:
        lvocab = sum(1 for _ in fv)

    width = lvocab + 1  # one label column plus one column per vocab entry

    with open(fsparse) as fi, open(fdense, 'w') as fo:
        for i, line in enumerate(fi):
            # "1 2:1 3:1" -> ['1', '2', '1', '3', '1']
            words = line.strip('\n').replace(':', ' ').split()

            label = int(words[0])
            # Odd positions hold the column indices, even positions the values.
            indices = [int(w) for w in words[1::2]]

            row = [0] * width
            row[0] = label
            # Direct assignment instead of testing every column for
            # membership in ``indices``.
            for j in indices:
                if 0 <= j < width:
                    row[j] = 1

            fo.write(" ".join(map(str, row)) + "\n")

            print('Writing dense matrix line: ', i+1)

Question: How can we get the label and data directly from the sparse data, without first creating a dense matrix, preferably using NumPy/SciPy?

Question: How can we read the sparse data using numpy.fromregex ?

My attempt is:

def read_file(fsparse):
    """Try to parse the sparse file with ``numpy.fromregex`` and dump it.

    The array returned by ``np.fromregex`` is printed into ``'dense.txt'``.
    The author reports that this attempt did not work.
    """
    # NOTE(review): the pattern only admits single-digit indices ([0-9]) and
    # single-character labels ([0-1]) — confirm whether that is intended.
    regex = r'([0-1]\s)([0-9]):(1\s)*([0-9]:1)' + r'\s*\n'
    data = np.fromregex(fsparse,regex,dtype=str)

    # NOTE(review): the file opened here is never explicitly closed.
    print(data,file=open('dense.txt','w'))

It did not work!

Related links:

Parsing colon separated sparse data with pandas and numpy


Solution

  • Tweaking your code to create the dense array directly, rather than via a file:

    # Path of the sparse input file that is passed to the parsing functions.
    fsparse = 'stack47266965.txt'
    
    def create_dense(fsparse, fdense, lvocab):
        """Parse a sparse ``label idx:val ...`` file into a list of dense rows.

        Each returned row has ``lvocab + 1`` integers: the label in column 0
        and a 1 in every column named by an ``idx``; all other columns are 0.
        The ``val`` part of each pair is ignored, and indices outside
        ``0..lvocab`` are silently dropped.

        Args:
            fsparse: Path of the sparse input file.
            fdense: Unused; kept so the signature matches the file-writing
                variant of this function.
            lvocab: Number of feature columns (vocabulary size).

        Returns:
            A list of rows, each a list of ints.

        Raises:
            IndexError: If an input line contains no tokens.
            ValueError: If the label or an index is not an integer.
        """
        width = lvocab + 1  # one label column plus one column per vocab entry
        alist = []
        with open(fsparse) as fi:
            for line in fi:
                # "1 2:1 3:1" -> ['1', '2', '1', '3', '1']
                words = line.strip('\n').replace(':', ' ').split()

                label = int(words[0])
                # Odd positions hold the column indices.
                indices = [int(w) for w in words[1::2]]

                row = [0] * width
                row[0] = label
                # Direct assignment instead of testing every column for
                # membership in ``indices``.
                for j in indices:
                    if 0 <= j < width:
                        row[j] = 1
                alist.append(row)
        return alist
    
    import numpy as np
    from scipy import sparse

    # create_dense never reads its second argument and no ``fdense`` name is
    # defined in this script, so pass None rather than an undefined variable.
    alist = create_dense(fsparse, None, 4)
    print(alist)
    arr = np.array(alist)
    # Build the COO matrix from the dense array, then show its coordinate
    # listing and its dense form.
    M = sparse.coo_matrix(arr)
    print(M)
    print(M.toarray())  # explicit equivalent of the ``M.A`` shorthand
    

    produces

    0926:~/mypy$ python3 stack47266965.py 
    [[1, 1, 1, 1, 0], [0, 1, 0, 0, 1], [1, 0, 1, 1, 1]]
      (0, 0)    1
      (0, 1)    1
      (0, 2)    1
      (0, 3)    1
      (1, 1)    1
      (1, 4)    1
      (2, 0)    1
      (2, 2)    1
      (2, 3)    1
      (2, 4)    1
    [[1 1 1 1 0]
     [0 1 0 0 1]
     [1 0 1 1 1]]
    

    If you want to skip the dense arr, you need to generate the equivalent of the M.row, M.col, and M.data attributes (order doesn't matter):

    [0 0 0 0 1 1 2 2 2 2] 
    [0 1 2 3 1 4 0 2 3 4] 
    [1 1 1 1 1 1 1 1 1 1]
    

    I don't use regex much so I won't try to fix that. I assume you want to convert

     '1 1:1 2:1 3:1'
    

    into

     ['1' '1' '1' '2' '1' '3' '1']
    

    But that just gets you to the words/label stage.


    A direct to sparse:

    def create_sparse(fsparse, lvocab):
        """Read a ``label idx:val ...`` file into COO-style triplet lists.

        For every input line, one entry is emitted for the label at column 0
        (even when the label is 0), followed by one entry with value 1 for
        each ``idx`` on that line.  The ``val`` part of each pair is ignored.

        Args:
            fsparse: Path of the sparse input file.
            lvocab: Vocabulary size; accepted but not used here.

        Returns:
            A ``(row, col, data)`` tuple of three equally long lists.
        """
        rows, cols, vals = [], [], []
        with open(fsparse) as handle:
            for lineno, text in enumerate(handle):
                # "1 2:1 3:1" -> ['1', '2', '1', '3', '1']
                tokens = text.strip('\n').replace(':', ' ').split()

                # Column 0 carries the label.
                label = int(tokens[0])
                rows.append(lineno)
                cols.append(0)
                vals.append(label)

                # Odd token positions are the column indices; each gets a 1.
                hits = [int(tok) for tok in tokens[1::2]]
                rows.extend([lineno] * len(hits))
                cols.extend(hits)
                vals.extend([1] * len(hits))
        return rows, cols, vals
    
    lvocab = 4
    r, c, d = create_sparse(fsparse, lvocab)
    print(r, c, d)
    # Pass the shape explicitly: without it coo_matrix infers the width from
    # the largest column index seen, so trailing all-zero vocabulary columns
    # would be dropped.  Every input line contributes a row entry, so
    # max(r) + 1 is the number of lines read.
    M = sparse.coo_matrix((d, (r, c)), shape=(max(r) + 1, lvocab + 1))
    print(M)
    print(M.toarray())  # explicit equivalent of the ``M.A`` shorthand
    

    producing

    [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2] [0, 1, 2, 3, 0, 1, 4, 0, 2, 3, 4] [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
    ....
    

    The only thing that's different is the one data item with value 0. sparse will take care of that.