Search code examples
awkbioinformaticsbed

awk merging files where col value is in range


I have a tab delim dataset1

NC_044998.1     14582   80739   LOC100221041
NC_044998.1     31388   68748   DCBLD2
NC_044998.1     80874   299341  CMSS1
NC_044998.1     112495  297570  FILIP1L
NC_044998.1     287349  289742  LOC116808959
NC_044998.1     300404  343805  TBC1D23
NC_044998.1     333622  344667  NIT2
NC_044998.1     346168  368957  TOMM70
NC_044998.1     371654  380427  LNP1
NC_044998.1     387231  413422  TMEM45A

and another tab delim dataset2

1       NC_044998.1     15001   6.040368        2.038993e-04
1       NC_044998.1     25002   0.000000        3.333334e-01
1       NC_044998.1     35003   2.309260        4.638924e-03
1       NC_044998.1     45004   3.438428        5.053365e-03
1       NC_044998.1     55005   1.086369        9.663565e-02
1       NC_044998.1     65006   3.250019        8.298793e-04
1       NC_044998.1     75007   1.081163        8.039542e-03
1       NC_044998.1     85008   0.186722        8.158607e-02
1       NC_044998.1     95009   2.236803        3.256445e-03
1       NC_044998.1     105010  0.089978        2.846438e-01

I want to do for each line of dataset2 if string in col 2 of dataset2 matches string in col1 of dataset1 and value in col 3 of dataset2 is between values in col2 and col3 of dataset1, print line of dataset2 followed by line of dataset1, else print line of dataset2 followed by "." in each column.

Im using

NR==FNR {
    q[++n] = $0
    f1[n] = $2
    f2[n] = $3
    next
}
# process file2
{
    for (i = 1; i <= n; i++) {
        if ($1 == f1[i] && (f2[i] > $2 && f2[i] < $3))
            print q[i]"\t"$0
        else
            print q[i]"\t"".""\t"".""\t"".""\t""."
    }
} 

but it's not working, as it just prints whichever file is first followed by the dot columns

awk -f annot.sh dataset2 dataset1

1       NC_044998.1     15001   6.040368        2.038993e-04    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     25002   0.000000        3.333334e-01    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     35003   2.309260        4.638924e-03    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     45004   3.438428        5.053365e-03    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     55005   1.086369        9.663565e-02    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     65006   3.250019        8.298793e-04    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     75007   1.081163        8.039542e-03    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     85008   0.186722        8.158607e-02    .       .       .       .       .
1       NC_044998.1     95009   2.236803        3.256445e-03    .       .       .       .       .
1       NC_044998.1     105010  0.089978        2.846438e-01    .       .       .       .       .

the expected output is

1       NC_044998.1     15001   6.040368        2.038993e-04    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     25002   0.000000        3.333334e-01    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     35003   2.309260        4.638924e-03    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     35003   2.309260        4.638924e-03    NC_044998.1     31388   68748   DCBLD2
1       NC_044998.1     45004   3.438428        5.053365e-03    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     45004   3.438428        5.053365e-03    NC_044998.1     31388   68748   DCBLD2
1       NC_044998.1     55005   1.086369        9.663565e-02    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     55005   1.086369        9.663565e-02    NC_044998.1     31388   68748   DCBLD2
1       NC_044998.1     65006   3.250019        8.298793e-04    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     65006   3.250019        8.298793e-04    NC_044998.1     31388   68748   DCBLD2
1       NC_044998.1     75007   1.081163        8.039542e-03    NC_044998.1     14582   80739   LOC100221041
1       NC_044998.1     85008   0.186722        8.158607e-02    NC_044998.1     80874   299341  CMSS1
1       NC_044998.1     95009   2.236803        3.256445e-03    NC_044998.1     80874   299341  CMSS1
1       NC_044998.1     105010  0.089978        2.846438e-01    NC_044998.1     80874   299341  CMSS1

Solution

  • Using any awk:

    $ cat tst.awk
    BEGIN { FS=OFS="\t" }
    NR == FNR {
        cnts[$1]++
        begs[$1,cnts[$1]] = $2
        ends[$1,cnts[$1]] = $3
        vals[$1,cnts[$1]] = $0
        gsub("[^"FS"]+",".")
        dflt = $0
        next
    }
    {
        val = dflt
        if ( $2 in cnts ) {
            for ( i=1; i<=cnts[$2]; i++ ) {
                if ( (begs[$2,i] <= $3) && ($3 <= ends[$2,i]) ) {
                    val = vals[$2,i]
                    break
                }
            }
        }
        print $0, val
    }
    

    $ awk -f tst.awk dataset1 dataset2
    1       NC_044998.1     15001   6.040368        2.038993e-04    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     25002   0.000000        3.333334e-01    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     35003   2.309260        4.638924e-03    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     45004   3.438428        5.053365e-03    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     55005   1.086369        9.663565e-02    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     65006   3.250019        8.298793e-04    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     75007   1.081163        8.039542e-03    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     85008   0.186722        8.158607e-02    NC_044998.1     80874   299341  CMSS1
    1       NC_044998.1     95009   2.236803        3.256445e-03    NC_044998.1     80874   299341  CMSS1
    1       NC_044998.1     105010  0.089978        2.846438e-01    NC_044998.1     80874   299341  CMSS1
    

    The above is assuming that the first match to a range is all that's required. If you do want all matches printed then change it to:

    $ cat tst.awk
    BEGIN { FS=OFS="\t" }
    NR == FNR {
        cnts[$1]++
        begs[$1,cnts[$1]] = $2
        ends[$1,cnts[$1]] = $3
        vals[$1,cnts[$1]] = $0
        gsub("[^"FS"]+",".")
        dflt = $0
        next
    }
    {
        found = 0
        if ( $2 in cnts ) {
            for ( i=1; i<=cnts[$2]; i++ ) {
                if ( (begs[$2,i] <= $3) && ($3 <= ends[$2,i]) ) {
                    print $0, vals[$2,i]
                    found = 1
                }
            }
        }
        if ( !found ) {
            print $0, dflt
        }
    }
    

    $ awk -f tst.awk dataset1 dataset2
    1       NC_044998.1     15001   6.040368        2.038993e-04    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     25002   0.000000        3.333334e-01    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     35003   2.309260        4.638924e-03    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     35003   2.309260        4.638924e-03    NC_044998.1     31388   68748   DCBLD2
    1       NC_044998.1     45004   3.438428        5.053365e-03    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     45004   3.438428        5.053365e-03    NC_044998.1     31388   68748   DCBLD2
    1       NC_044998.1     55005   1.086369        9.663565e-02    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     55005   1.086369        9.663565e-02    NC_044998.1     31388   68748   DCBLD2
    1       NC_044998.1     65006   3.250019        8.298793e-04    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     65006   3.250019        8.298793e-04    NC_044998.1     31388   68748   DCBLD2
    1       NC_044998.1     75007   1.081163        8.039542e-03    NC_044998.1     14582   80739   LOC100221041
    1       NC_044998.1     85008   0.186722        8.158607e-02    NC_044998.1     80874   299341  CMSS1
    1       NC_044998.1     95009   2.236803        3.256445e-03    NC_044998.1     80874   299341  CMSS1
    1       NC_044998.1     105010  0.089978        2.846438e-01    NC_044998.1     80874   299341  CMSS1