Search code examples
r · linear-regression · pearson-correlation

Relationship or correlation between two dataframes with several columns


I have two dataframes and I would like to show graphically (with a scatter plot) the correlation between the rows of these two dataframes (genes vs. proteins), to see whether corresponding rows are related. I can see two possible strategies: 1. A linear regression between the two dataframes (I have no idea how to do this). 2. A Pearson correlation between them, using the mean (and standard deviation) of the columns.

Can someone help me design these graphs?

Here is an example of my data:

# Example mRNA expression table: first column is the gene label, the
# remaining columns are per-sample values.
genes <- read.table(
  text = "gene  sample1 sample2 sample3 sample4
gene1   1863.4  1972.94 1603.96 1185.6
gene2   213.88  247.14  189.02  208.793
gene3   8.06    9.25    9.59    7.33
gene4   22.36   3.76    10.64   19.17",
  header = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)

# Example protein abundance table with the same sample columns.
# NOTE(review): the last row label reads "pretein4" in the supplied data; it
# looks like a typo for "protein4" but is kept exactly as given.
protein <- read.table(
  text = "protein sample1 sample2 sample3 sample4
protein1    314.2871797 426.8856595 405.7971059 334.1369651
protein2    4747.866647 3070.916824 2780.352062 2990.085431
protein3    1621.566329 1290.470104 1554.27426  1601.357345
pretein4    8832.210499 7796.675008 8461.733171 9500.429355",
  header = TRUE
)

Thank you


Solution

  • I appreciate the answers, which I upvoted; they helped me solve the problem as follows:

    # Using the example data (the `genes` and `protein` data frames).

    # The two tables are paired by row position below, so they must have the
    # same number of rows; otherwise data.frame() can silently recycle the
    # shorter column instead of failing.
    stopifnot(nrow(genes) == nrow(protein))

    # Per-row average over every column except the first (the label column).
    # drop = FALSE keeps a data frame even when only one sample column exists,
    # which rowMeans() requires.
    mRNA_expression <- data.frame(
      genes = genes[, 1],
      Means = rowMeans(genes[, -1, drop = FALSE])
    )
    Protein_abundance <- data.frame(
      protein = protein[, 1],
      Means = rowMeans(protein[, -1, drop = FALSE])
    )

    # Pair the two mean vectors row by row for the correlation plots.
    mean_corr <- data.frame(
      mRNA_expression = mRNA_expression$Means,
      Protein_abundance = Protein_abundance$Means
    )

    # Keep only rows where both means are finite and strictly positive.
    # This drops NA rows (as before) and also rows that log10() would turn
    # into -Inf or NaN, which would break cor() and lm() later on.
    keep <- is.finite(mean_corr$mRNA_expression) &
      is.finite(mean_corr$Protein_abundance) &
      mean_corr$mRNA_expression > 0 &
      mean_corr$Protein_abundance > 0
    mean_corr <- mean_corr[keep, ]

    # Work on the log10 scale from here on.
    mean_corr <- log10(mean_corr)
    
    library(ggplot2)
    
    # Shared scaffold for every plot below: log10 protein abundance on x,
    # log10 mRNA expression on y, with extra margin between each axis title
    # and its axis.
    base_plot <- ggplot(
      mean_corr,
      aes(x = Protein_abundance, y = mRNA_expression)
    ) +
      labs(x = "Protein abundance (log10)", y = "mRNA expression (log10)") +
      theme(
        axis.title.y = element_text(margin = margin(0, 10, 0, 0)),
        axis.title.x = element_text(margin = margin(10, 0, 0, 0))
      )

    # 1. Plain scatter plot to inspect the distribution (shape 1 = hollow circles).
    base_plot +
      geom_point(shape = 1)

    # 2. Scatter plot with a linear regression line; geom_smooth() draws the
    #    confidence band by default.
    base_plot +
      geom_point(shape = 1) +
      geom_smooth(method = lm)

    # 3. Same regression line, without the shaded confidence band.
    base_plot +
      geom_point(shape = 1) +
      geom_smooth(method = lm, se = FALSE)

    # 4. Scatter plot with geom_smooth()'s default smoother and its confidence
    #    band (a loess curve for small data sets such as this one).
    base_plot +
      geom_point(shape = 1) +
      geom_smooth()
    
    # Statistics

    # Pearson correlation matrix of the two log10 columns ("pearson" is
    # cor()'s default method, written out here for clarity).
    cor(mean_corr, method = "pearson")

    # Linear regression with protein abundance as the response and mRNA
    # expression as the predictor.
    # NOTE(review): the plots above put mRNA_expression on y and
    # Protein_abundance on x, so geom_smooth(method = lm) fits the opposite
    # direction to this model; confirm which orientation is intended.
    lm(formula = Protein_abundance ~ mRNA_expression, data = mean_corr)