python, correlation, memory-efficient

Get correlation p-values with DeepGraph


I am using deepgraph in Python to compute correlation coefficients for large matrices. The output is a multi-index DataFrame:

s    t
0    1    -0.006066
     2     0.094063
     3    -0.025529
     4     0.074080
     5     0.035490
     6     0.005221
     7     0.032064

I want to add a column with the corresponding p-values. The original code and input example come from https://deepgraph.readthedocs.io/en/latest/tutorials/pairwise_correlations.html. The code surrounded by the hash-mark lines is my approach to getting the p-values; I want to merge the separate edge lists later on.

#!/bin/python

import os
from multiprocessing import Pool
import numpy as np
import pandas as pd
import deepgraph as dg
from numpy.random import RandomState
from scipy.stats import pearsonr, spearmanr

prng = RandomState(0)
n_features = int(5e3)
n_samples = int(1e2)
X = prng.randint(100, size=(n_features, n_samples)).astype(np.float64)

# rank-transform the variables so the Pearson computation below yields Spearman's correlation coefficients
X = X.argsort(axis=1).argsort(axis=1)
# whiten variables for fast parallel computation later on
X = (X - X.mean(axis=1, keepdims=True)) / X.std(axis=1, keepdims=True)
# save in binary format
np.save('samples', X)

# parameters (change these to control RAM usage)
step_size = 1e5
n_processes = 100

# load samples as memory-map
X = np.load('samples.npy', mmap_mode='r')

# create node table that stores references to the mem-mapped samples
v = pd.DataFrame({'index': range(X.shape[0])})


# connector function to compute pairwise Pearson correlations (on the whitened ranks, i.e. Spearman correlations)
def corr(index_s, index_t):
    features_s = X[index_s]
    features_t = X[index_t]
    corr = np.einsum('ij,ij->i', features_s, features_t) / n_samples
    return corr


#################################
def p_Val(index_s, index_t):
    features_s = X[index_s]
    features_t = X[index_t]
    p = spearmanr(features_s, features_t)[1]
    return p
#################################

# index array for parallelization
pos_array = np.array(np.linspace(0, n_features*(n_features-1)//2, n_processes), dtype=int)

# parallel computation
def create_ei(i):
    from_pos = pos_array[i]
    to_pos = pos_array[i+1]
    # initiate DeepGraph
    g = dg.DeepGraph(v)
    # create edges
    g.create_edges(connectors=corr, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
    # store edge table
    g.e.to_pickle('tmp/correlations/{}_corr.pickle'.format(str(i).zfill(3)))
    #################################
    gp = dg.DeepGraph(v)
    # create edges
    gp.create_edges(connectors=p_Val, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
    # store edge table
    gp.e.to_pickle('tmp/correlations/{}_pval.pickle'.format(str(i).zfill(3)))
    #################################

# computation
if __name__ == '__main__':
    os.makedirs("tmp/correlations", exist_ok=True)
    indices = np.arange(0, n_processes - 1)
    p = Pool()
    for _ in p.imap_unordered(create_ei, indices):
        pass


# read back and inspect the stored edge tables
files = os.listdir('tmp/correlations/')
files.sort()
for f in files:
    et = pd.read_pickle('tmp/correlations/{}'.format(f))
    print(et)

I get the following error:

Traceback (most recent call last):
  File "/lib/python3.9/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "pairwise_corr.py", line 64, in create_ei
    gp.create_edges(connectors=p_Val, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
  File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 616, in create_edges
    self.e = _matrix_iterator(
  File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 4875, in _matrix_iterator
    ei = _select_and_return(vi, sources_k, targets_k, ft_feature,
  File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 5339, in _select_and_return
    ei = pd.DataFrame({col: data[col] for col in coldtypedic})
  File "/lib/python3.9/site-packages/pandas/core/frame.py", line 614, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
  File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 464, in dict_to_mgr
    return arrays_to_mgr(
  File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 124, in arrays_to_mgr
    arrays = _homogenize(arrays, index, dtype)
  File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 589, in _homogenize
    val = sanitize_array(
  File "/lib/python3.9/site-packages/pandas/core/construction.py", line 576, in sanitize_array
    subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
  File "/lib/python3.9/site-packages/pandas/core/construction.py", line 627, in _sanitize_ndim
    raise ValueError("Data must be 1-dimensional")
ValueError: Data must be 1-dimensional

Any suggestions? Thanks!


Solution

  • I was able to solve it by computing one p-value per pair of rows, so that the connector returns a 1-D array with one value per edge. The problem with my original p_Val is that scipy.stats.spearmanr, given two 2-D blocks, correlates all of their columns against each other and returns a matrix of p-values, which is what triggers the "Data must be 1-dimensional" error. The working connector:

    def p_Val(index_s, index_t):
        features_s = X[index_s]
        features_t = X[index_t]
        # one pearsonr call per pair of rows -> one p-value per edge
        p = [pearsonr(features_s[i, :], features_t[i, :])[1] for i in range(len(features_s))]
        p_val = np.asarray(p)
        return p_val
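
    For the "merge the separate edge lists later on" part, a minimal sketch (assuming both create_edges runs use the same from_pos/to_pos ranges, so each *_corr.pickle has a matching *_pval.pickle with the same (s, t) multi-index):

    import os
    import pandas as pd

    # pair up the per-chunk edge tables written by create_ei
    corr_files = sorted(f for f in os.listdir('tmp/correlations/') if f.endswith('_corr.pickle'))
    pval_files = sorted(f for f in os.listdir('tmp/correlations/') if f.endswith('_pval.pickle'))

    chunks = []
    for cf, pf in zip(corr_files, pval_files):
        ec = pd.read_pickle(os.path.join('tmp/correlations', cf))
        ep = pd.read_pickle(os.path.join('tmp/correlations', pf))
        # both tables share the same (s, t) multi-index, so a join lines them up
        chunks.append(ec.join(ep))

    e = pd.concat(chunks).sort_index()
    print(e)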
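
    As an aside (untested): if DeepGraph lets a connector return more than one value, with the edge-table columns named after the returned variables, the second create_edges pass and the merge could be skipped by computing both quantities in one connector, along these lines:

    def corr_pval(index_s, index_t):
        features_s = X[index_s]
        features_t = X[index_t]
        # correlation of the whitened ranks, as in the corr connector above
        corr = np.einsum('ij,ij->i', features_s, features_t) / n_samples
        # matching p-values, one per pair of rows
        p_val = np.asarray([pearsonr(features_s[i, :], features_t[i, :])[1]
                            for i in range(len(features_s))])
        return corr, p_val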