Search code examples
rdataframedplyrtidyverseformula

apply/create formula to manipulate two dataframes


I have two dataframes in R

df1
chr   start     end strand bam1 bam2 bam3 bam4 bam5 bam6 bam7 bam8
1 chr1 3531569 3531966      -    2    2    1    4    8   36   21    1
2 chr1 3670538 3672624      -  251   50  170  165  294  259  665   86
3 chr1 4491645 4493854      -  220   46  179  167  275  332  414   77
4 chr1 4496542 4497750      -  115   41  100   67  114   69   42   63
5 chr1 4571267 4572265      -   64   32   77   44   76  130  179   27
6 chr1 4688213 4688719      -   39   10   20   20   14   23   25   17
7 chr1 4688800 4688919      -   20   30   10   20   14   55   17   20
8 chr1 4688800 4688919      -    2    4    6    8   10   12   14   16
9 chr1 4688800 4688919      -    1    2    3    4    5    6    7    8

and

df2
bam_file r1 r2
1     bam1  2  1
2     bam2  9  3
3     bam3  1  4
4     bam4  1  5
5     bam5  1  1
6     bam6  8  6
7     bam7  3  7
8     bam8  3  2

I want to apply following formula (let say X), So that column of df2 X row of df1

((df2[1,2]-df1[1,5])ˆ2 + (df2[2,2]-df1[1,6])ˆ2 + (df2[3,2]-df1[1,7])ˆ2 + (df2[4,2]-df1[1,8])ˆ2 + (df2[5,2]-df1[1,9])ˆ2 + (df2[6,2]-df1[1,10])ˆ2 + (df2[7,2]-df1[1,11])ˆ2 + 
 (df2[8,2]-df1[1,12])ˆ2)/(ncol(df1)-4)

So the desired output will be

output
        r1        r2
1  152.375    144.75
2 89140.25 88467.875
3 57822.75 57413.125
4 6195.125      6148
5 8007.375   7858.75
6   395.75   372.625
7   508.75   543.125
8    60.75    47.125
9     15.5     6.875

I apologize if this appears to be a repetitive question, but I tried and was unable to resolve it (as I am beginner and learning). It would be great to find a solution. Thank you in advance and looking for a positive response.


Solution

  • We could create a sequence column ('rn'), reshape to 'long' format with pivot_longer on the first data, join with the second data ('df2') and do a group by calculation on the 'r1', 'r2' columns in reframe

    library(dplyr) # version >= 1.1.0
    library(tidyr)
    df1 %>%
      mutate(rn = row_number()) %>%  
      pivot_longer(cols= starts_with("bam"), names_to = "bam_file") %>% 
      left_join(df2) %>%  
      reframe(across(r1:r2, ~ sum((value - .x)^2)/n()), .by = "rn")
    

    -output

    # A tibble: 9 × 3
         rn      r1       r2
      <int>   <dbl>    <dbl>
    1     1   152.    145.  
    2     2 89140.  88468.  
    3     3 57823.  57413.  
    4     4  6195.   6148   
    5     5  8007.   7859.  
    6     6   396.    373.  
    7     7   509.    543.  
    8     8    60.8    47.1 
    9     9    15.5     6.88
    

    data

    df1 <- structure(list(chr = c("chr1", "chr1", "chr1", "chr1", "chr1", 
    "chr1", "chr1", "chr1", "chr1"), start = c(3531569L, 3670538L, 
    4491645L, 4496542L, 4571267L, 4688213L, 4688800L, 4688800L, 4688800L
    ), end = c(3531966L, 3672624L, 4493854L, 4497750L, 4572265L, 
    4688719L, 4688919L, 4688919L, 4688919L), strand = c("-", "-", 
    "-", "-", "-", "-", "-", "-", "-"), bam1 = c(2L, 251L, 220L, 
    115L, 64L, 39L, 20L, 2L, 1L), bam2 = c(2L, 50L, 46L, 41L, 32L, 
    10L, 30L, 4L, 2L), bam3 = c(1L, 170L, 179L, 100L, 77L, 20L, 10L, 
    6L, 3L), bam4 = c(4L, 165L, 167L, 67L, 44L, 20L, 20L, 8L, 4L), 
        bam5 = c(8L, 294L, 275L, 114L, 76L, 14L, 14L, 10L, 5L), bam6 = c(36L, 
        259L, 332L, 69L, 130L, 23L, 55L, 12L, 6L), bam7 = c(21L, 
        665L, 414L, 42L, 179L, 25L, 17L, 14L, 7L), bam8 = c(1L, 86L, 
        77L, 63L, 27L, 17L, 20L, 16L, 8L)), class = "data.frame", row.names = c("1", 
    "2", "3", "4", "5", "6", "7", "8", "9"))
    
    df2 <- structure(list(bam_file = c("bam1", "bam2", "bam3", "bam4", "bam5", 
    "bam6", "bam7", "bam8"), r1 = c(2L, 9L, 1L, 1L, 1L, 8L, 3L, 3L
    ), r2 = c(1L, 3L, 4L, 5L, 1L, 6L, 7L, 2L)), class = "data.frame", row.names = c("1", 
    "2", "3", "4", "5", "6", "7", "8"))