Search code examples
python pandas scatter-plot

How do I create scatter plots for specific pairs of columns sequentially from a dataframe?


I have the following dataframe, merged_dft, and I want to draw a scatter plot of two of its columns, e.g. snv vs snv-drg:

samples snv het-hom ti-tv   snv-drg het-hom-drg ti-tv-drg   insertion-drg   deletion-drg    insertion   deletion    ins-del-ratio-drg   ins-del-ratio   Sample_name Sex Superpopulation_code
0   NA20126 4592368 2.14    1.97    4770140 2.26    1.96    523917  536443  472931  494200  0.98    0.96    NA20126 male    AFR
1   NA20127 4699751 2.04    1.97    4918959 2.18    1.97    562430  572733  485645  505302  0.98    0.96    NA20127 female  AFR
2   NA20128 4636463 2.09    1.97    4854107 2.22    1.97    552634  566283  478801  500632  0.98    0.96    NA20128 female  AFR
3   NA20129 4638940 2.11    1.97    4863336 2.23    1.97    552984  565534  478078  499867  0.98    0.96    NA20129 female  AFR
4   NA20274 4339811 2.10    1.96    4554995 2.23    1.96    524046  530728  456420  471116  0.99    0.97    NA20274 female  AFR
.... 
....

--

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import scipy.stats as stats

# Scatter plot of the 'snv' column against the 'snv-drg' column of merged_dft,
# coloured by 'Superpopulation_code', with a y = x reference line and the
# Pearson correlation coefficient annotated on the axes.

# rcParams are read when a figure is created, so they must be set before the
# first plotting call for the size/dpi to apply to this figure.
plt.rcParams.update({'figure.figsize': (10, 8), 'figure.dpi': 100})

x = merged_dft['snv']
y = merged_dft['snv-drg']

x_min = x.min()
x_max = x.max()

y_min = y.min()
y_max = y.max()

# Use one common range for both axes so the diagonal really is y = x.
lineStart = min(x_min, y_min)
lineEnd = max(x_max, y_max)

# Create a scatter plot
sns.scatterplot(data=merged_dft, x='snv', y='snv-drg', hue='Superpopulation_code')

plt.xlabel('NPM')
plt.ylabel('Drgen')
plt.title('Count_SNVs')

# Dashed red identity line: points on it have equal values in both columns.
plt.plot([lineStart, lineEnd], [lineStart, lineEnd], color='r', linestyle='dashed')
plt.xlim(lineStart, lineEnd)
plt.ylim(lineStart, lineEnd)

# pearsonr returns the correlation coefficient and its p-value; only r is shown.
r, p = stats.pearsonr(x, y)
plt.annotate(f'r = {r:.2f}', xy=(0.1, 0.95), xycoords='axes fraction')

# plt.legend(bbox_to_anchor=(1.025,1), loc='upper left', borderaxespad=0.)

enter image description here

I want to draw a scatter plot and compute the Pearson correlation for each pair of columns in npm_col vs drg_col, one pair after another. I couldn't get it to work with the code below.

Example

snv vs snv-drg
het-hom vs het-hom-drg
ti-tv vs ti-tv-drg

Code:

# set 1 columns
# (indexing with a list of names returns a DataFrame, not a list of names)
npm_col = merged_dft[['snv', 'het-hom', 'ti-tv']]
npm_col

# set 2 columns
# (also a DataFrame, with the columns in the matching order)
drg_col = merged_dft[['snv-drg', 'het-hom-drg', 'ti-tv-drg']]
drg_col

# NOTE(review): this is the attempt that does not produce the wanted plots.
# len() of a DataFrame is its number of rows, so both ranges run over rows
# rather than over the three column names, and nesting the loops visits every
# (i, j) combination instead of the matching pairs only.
for i in range(len(npm_col)):
    for j in range(len(drg_col)):
        plt.figure()
        # NOTE(review): i and j are never used below; each call receives the
        # whole npm_col / drg_col DataFrames instead of one column name each.
        plt.scatter(merged_dft[npm_col], merged_dft[drg_col])
        plt.xlabel(npm_col)
        plt.ylabel(drg_col)
        plt.title(f'Scatter plot between {npm_col} and {drg_col}')
        plt.rcParams.update({'figure.figsize':(10,8), 'figure.dpi':100})
        # lineStart / lineEnd are not recomputed in this loop; they keep
        # whatever values were assigned before it ran.
        plt.plot([lineStart, lineEnd], [lineStart, lineEnd], color = 'r', linestyle = 'dashed')
        plt.xlim(lineStart, lineEnd)
        plt.ylim(lineStart, lineEnd)
        # r, p = stats.pearsonr(x, y)
        r, p = stats.pearsonr(merged_dft[npm_col], merged_dft[drg_col])
        plt.annotate('r = {:.2f}'.format(r), xy=(0.1, 0.95), xycoords='axes fraction')
        # plt.legend(bbox_to_anchor=(1.025,1), loc='upper left', borderaxespad=0.)
        plt.show()

Thanks for any help!

Answer:

# Draw one scatter plot per column pair: the k-th column of npm_col against
# the k-th column of drg_col, each with a y = x reference line and the
# Pearson correlation coefficient annotated on the axes.
xcols = npm_col.columns.tolist()
ycols = drg_col.columns.tolist()

# rcParams are read when a figure is created, so set them once, up front,
# rather than after the first plot has already been drawn.
plt.rcParams.update({'figure.figsize': (10, 8), 'figure.dpi': 100})

# Columns are paired purely by position, so both frames must list them in
# the same order.
for x_name, y_name in zip(xcols, ycols):
    x_values = npm_col[x_name]
    y_values = drg_col[y_name]

    plt.scatter(x_values, y_values)

    x_min = x_values.min()
    x_max = x_values.max()
    y_min = y_values.min()
    y_max = y_values.max()

    # Recompute the shared axis range for every pair; the columns are on very
    # different scales, so one global range would squash most plots.
    lineStart = min(x_min, y_min)
    lineEnd = max(x_max, y_max)

    plt.xlabel('NPM')
    plt.ylabel('DRG')
    plt.title(f'Scatter plot between NPM_{x_name} and DRG_{x_name}')

    # Dashed red identity line over the common range.
    plt.plot([lineStart, lineEnd], [lineStart, lineEnd], color='r', linestyle='dashed')
    plt.xlim(lineStart, lineEnd)
    plt.ylim(lineStart, lineEnd)

    # pearsonr returns the correlation coefficient and its p-value; only r is shown.
    r, p = stats.pearsonr(x_values, y_values)
    plt.annotate(f'r = {r:.2f}', xy=(0.1, 0.95), xycoords='axes fraction')

    # Show the finished figure before the next pair starts a new one.
    plt.show()

Solution

  • If you just need the x and y columns, and they are in the same relative position in the two new dataframes, you should be able to use the following. You can then adjust the rest of the items in the loop to get the right variables for labels, etc.

    # Column names of both frames; pairs are matched by position.
    xcols = list(npm_col.columns)
    ycols = list(drg_col.columns)

    for i, x_name in enumerate(xcols):
        # Pick the i-th column from each frame.
        x_values = npm_col[x_name]
        y_values = drg_col[ycols[i]]
        plt.scatter(x_values, y_values)
        # Correlation coefficient and p-value for this pair.
        r, p = stats.pearsonr(x_values, y_values)