Search code examples
python scipy sparse-matrix

Saving a scipy.sparse matrix directly as a regular txt file


I have a scipy.sparse matrix (csr_matrix()). But I need to save it to a file not in the .npz format but as a regular .txt or .csv file. My problem is that I don't have enough memory to convert the sparse matrix into a regular np.array() and then save it to a file. Is there a way to have the data as a sparse matrix in memory but save it directly as a regular matrix in the form:

0 0 0
0 1 0
1 0 1

to the disk? Or is there a way to "unzip" a .npz file without loading it into memory inside Python? (like for example gunzip or unzip in Bash).


Solution

  • Answer to new question:

    import sys

    import numpy as np
    from scipy import sparse, io

    A = sparse.eye(5, format='csr') * np.pi

    # NumPy's defaults abbreviate any array with more than 1000 elements to
    # '...' and wrap long rows over several lines; either would silently corrupt
    # the file for a large matrix, so both limits are lifted to sys.maxsize.
    np.set_printoptions(precision=16, linewidth=sys.maxsize,
                        threshold=sys.maxsize)

    # 'w' rather than 'a': re-running the script must replace the file, not
    # append a second copy of the matrix to it.
    with open('matrix.txt', 'w') as f:
        # Only one row is converted to a dense array at a time, so the full
        # dense matrix is never held in memory.
        for row in A:
            f.write(str(row.toarray()[0]))
            f.write('\n')
    
    # [3.141592653589793 0.                0.                0.                0.               ]
    # [0.                3.141592653589793 0.                0.                0.               ]
    # [0.                0.                3.141592653589793 0.                0.               ]
    # [0.                0.                0.                3.141592653589793 0.               ]
    # [0.                0.                0.                0.                3.141592653589793]
    

    And with begin/end brackets:

    import sys

    import numpy as np
    from scipy import sparse, io

    A = sparse.eye(5, format='csr') * np.pi

    # NumPy's defaults abbreviate any array with more than 1000 elements to
    # '...' and wrap long rows over several lines; either would silently corrupt
    # the file for a large matrix, so both limits are lifted to sys.maxsize.
    np.set_printoptions(precision=16, linewidth=sys.maxsize,
                        threshold=sys.maxsize)

    last_row = A.shape[0] - 1

    # 'w' rather than 'a': re-running the script must replace the file, not
    # append a second copy of the matrix to it.
    with open('matrix.txt', 'w') as f:
        # Only one row is converted to a dense array at a time, so the full
        # dense matrix is never held in memory.
        for i, row in enumerate(A):
            # The first row opens the outer bracket; later rows are indented by
            # one space so the columns line up underneath it.
            f.write('[' if i == 0 else ' ')
            f.write(str(row.toarray()[0]))
            # The last row closes the outer bracket instead of ending the line.
            f.write(']' if i == last_row else '\n')
    
    # [[3.141592653589793 0.                0.                0.                0.               ]
    #  [0.                3.141592653589793 0.                0.                0.               ]
    #  [0.                0.                3.141592653589793 0.                0.               ]
    #  [0.                0.                0.                3.141592653589793 0.               ]
    #  [0.                0.                0.                0.                3.141592653589793]]
    

    You may have to fiddle with set_printoptions depending on your data.


    Answer to original question, which did not require that the matrix be written as dense.

    Harwell-Boeing format is plain text:

    import numpy as np
    from scipy import sparse, io

    # 3x3 CSR matrix with pi on the diagonal, used as the round-trip example.
    identity = sparse.eye(3, format='csr')
    A = identity * np.pi

    io.hb_write('matrix.txt', A)  # saves as matrix.txt

    # Contents of matrix.txt after the write:
    #
    # Default title                                                           0
    #              3             1             1             1
    # RUA                        3             3             3             0
    # (40I2)          (40I2)          (3E25.16)
    #  1 2 3 4
    #  1 2 3
    #   3.1415926535897931E+00  3.1415926535897931E+00  3.1415926535897931E+00

    # Read the file back and confirm nothing changed. `A2 != A` is itself a
    # sparse matrix holding only the positions that differ, so an nnz of zero
    # means the two are equal without building a dense array.
    A2 = io.hb_read('matrix.txt')
    mismatches = A2 != A
    assert mismatches.nnz == 0
    

    So is Matrix Market:

    # Write A in Matrix Market format.
    io.mmwrite('matrix', A)  # saves as matrix.mtx

    # Contents of matrix.mtx after the write:
    #
    # %%MatrixMarket matrix coordinate real symmetric
    # %
    # 3 3 3
    # 1 1 3.141592653589793e+00
    # 2 2 3.141592653589793e+00
    # 3 3 3.141592653589793e+00

    # Read it back and check that no entry differs from the original.
    A2 = io.mmread('matrix')
    assert (A2 != A).nnz == 0
    

    If you want an even simpler format, although it involves more code:

    import numpy as np
    from scipy import sparse

    A = sparse.eye(10, format='csr') * np.pi

    # Dump each of the three CSR arrays to its own plain-text file, one value
    # per line. The values use '%.18e' (np.savetxt's default format); the two
    # index arrays are written as plain integers.
    for filename, values, fmt in (
        ('data.txt', A.data, '%.18e'),
        ('indices.txt', A.indices, '%i'),
        ('indptr.txt', A.indptr, '%i'),
    ):
        np.savetxt(filename, values, fmt=fmt)
    

    To load:

    # ndmin=1 keeps every array one-dimensional even when its file holds a
    # single value (a matrix with exactly one stored element); by default
    # loadtxt squeezes that case to a 0-d array, not the 1-D array that
    # csr_matrix expects.
    data = np.loadtxt('data.txt', ndmin=1)
    indices = np.loadtxt('indices.txt', dtype=np.int32, ndmin=1)
    indptr = np.loadtxt('indptr.txt', dtype=np.int32, ndmin=1)

    # No shape is passed, so csr_matrix infers the dimensions from the index
    # arrays; pass shape=(rows, cols) explicitly if the original matrix can end
    # in all-zero columns.
    A2 = sparse.csr_matrix((data, indices, indptr))
    # `A2 != A` is sparse, so its nnz counts the entries that differ.
    assert not (A2 != A).nnz
    

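    But the important idea is that all you need to save are the data, indices, and indptr attributes of the csr_matrix (plus its shape, if the last columns of the matrix can be entirely zero, because the number of columns is otherwise inferred from the largest stored column index).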