I have a tab delim dataset1
NC_044998.1 14582 80739 LOC100221041
NC_044998.1 31388 68748 DCBLD2
NC_044998.1 80874 299341 CMSS1
NC_044998.1 112495 297570 FILIP1L
NC_044998.1 287349 289742 LOC116808959
NC_044998.1 300404 343805 TBC1D23
NC_044998.1 333622 344667 NIT2
NC_044998.1 346168 368957 TOMM70
NC_044998.1 371654 380427 LNP1
NC_044998.1 387231 413422 TMEM45A
and another tab delim dataset2
1 NC_044998.1 15001 6.040368 2.038993e-04
1 NC_044998.1 25002 0.000000 3.333334e-01
1 NC_044998.1 35003 2.309260 4.638924e-03
1 NC_044998.1 45004 3.438428 5.053365e-03
1 NC_044998.1 55005 1.086369 9.663565e-02
1 NC_044998.1 65006 3.250019 8.298793e-04
1 NC_044998.1 75007 1.081163 8.039542e-03
1 NC_044998.1 85008 0.186722 8.158607e-02
1 NC_044998.1 95009 2.236803 3.256445e-03
1 NC_044998.1 105010 0.089978 2.846438e-01
I want to do for each line of dataset2 if string in col 2 of dataset2 matches string in col1 of dataset1 and value in col 3 of dataset2 is between values in col2 and col3 of dataset1, print line of dataset2 followed by line of dataset1, else print line of dataset2 followed by "." in each column.
Im using
NR==FNR {
q[++n] = $0
f1[n] = $2
f2[n] = $3
next
}
# process file2
{
for (i = 1; i <= n; i++) {
if ($1 == f1[i] && (f2[i] > $2 && f2[i] < $3))
print q[i]"\t"$0
else
print q[i]"\t"".""\t"".""\t"".""\t""."
}
}
but it's not working, as it just prints whichever file is first followed by the dot columns
awk -f annot.sh dataset2 dataset1
1 NC_044998.1 15001 6.040368 2.038993e-04 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 25002 0.000000 3.333334e-01 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 35003 2.309260 4.638924e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 45004 3.438428 5.053365e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 55005 1.086369 9.663565e-02 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 65006 3.250019 8.298793e-04 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 75007 1.081163 8.039542e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 85008 0.186722 8.158607e-02 . . . . .
1 NC_044998.1 95009 2.236803 3.256445e-03 . . . . .
1 NC_044998.1 105010 0.089978 2.846438e-01 . . . . .
the expected output is
1 NC_044998.1 15001 6.040368 2.038993e-04 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 25002 0.000000 3.333334e-01 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 35003 2.309260 4.638924e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 35003 2.309260 4.638924e-03 NC_044998.1 31388 68748 DCBLD2
1 NC_044998.1 45004 3.438428 5.053365e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 45004 3.438428 5.053365e-03 NC_044998.1 31388 68748 DCBLD2
1 NC_044998.1 55005 1.086369 9.663565e-02 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 55005 1.086369 9.663565e-02 NC_044998.1 31388 68748 DCBLD2
1 NC_044998.1 65006 3.250019 8.298793e-04 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 65006 3.250019 8.298793e-04 NC_044998.1 31388 68748 DCBLD2
1 NC_044998.1 75007 1.081163 8.039542e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 85008 0.186722 8.158607e-02 NC_044998.1 80874 299341 CMSS1
1 NC_044998.1 95009 2.236803 3.256445e-03 NC_044998.1 80874 299341 CMSS1
1 NC_044998.1 105010 0.089978 2.846438e-01 NC_044998.1 80874 299341 CMSS1
Using any awk:
$ cat tst.awk
BEGIN { FS=OFS="\t" }
NR == FNR {
cnts[$1]++
begs[$1,cnts[$1]] = $2
ends[$1,cnts[$1]] = $3
vals[$1,cnts[$1]] = $0
gsub("[^"FS"]+",".")
dflt = $0
next
}
{
val = dflt
if ( $2 in cnts ) {
for ( i=1; i<=cnts[$2]; i++ ) {
if ( (begs[$2,i] <= $3) && ($3 <= ends[$2,i]) ) {
val = vals[$2,i]
break
}
}
}
print $0, val
}
$ awk -f tst.awk dataset1 dataset2
1 NC_044998.1 15001 6.040368 2.038993e-04 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 25002 0.000000 3.333334e-01 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 35003 2.309260 4.638924e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 45004 3.438428 5.053365e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 55005 1.086369 9.663565e-02 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 65006 3.250019 8.298793e-04 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 75007 1.081163 8.039542e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 85008 0.186722 8.158607e-02 NC_044998.1 80874 299341 CMSS1
1 NC_044998.1 95009 2.236803 3.256445e-03 NC_044998.1 80874 299341 CMSS1
1 NC_044998.1 105010 0.089978 2.846438e-01 NC_044998.1 80874 299341 CMSS1
The above is assuming that the first match to a range is all that's required. If you do want all matches printed then change it to:
$ cat tst.awk
BEGIN { FS=OFS="\t" }
NR == FNR {
cnts[$1]++
begs[$1,cnts[$1]] = $2
ends[$1,cnts[$1]] = $3
vals[$1,cnts[$1]] = $0
gsub("[^"FS"]+",".")
dflt = $0
next
}
{
found = 0
if ( $2 in cnts ) {
for ( i=1; i<=cnts[$2]; i++ ) {
if ( (begs[$2,i] <= $3) && ($3 <= ends[$2,i]) ) {
print $0, vals[$2,i]
found = 1
}
}
}
if ( !found ) {
print $0, dflt
}
}
$ awk -f tst.awk dataset1 dataset2
1 NC_044998.1 15001 6.040368 2.038993e-04 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 25002 0.000000 3.333334e-01 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 35003 2.309260 4.638924e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 35003 2.309260 4.638924e-03 NC_044998.1 31388 68748 DCBLD2
1 NC_044998.1 45004 3.438428 5.053365e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 45004 3.438428 5.053365e-03 NC_044998.1 31388 68748 DCBLD2
1 NC_044998.1 55005 1.086369 9.663565e-02 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 55005 1.086369 9.663565e-02 NC_044998.1 31388 68748 DCBLD2
1 NC_044998.1 65006 3.250019 8.298793e-04 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 65006 3.250019 8.298793e-04 NC_044998.1 31388 68748 DCBLD2
1 NC_044998.1 75007 1.081163 8.039542e-03 NC_044998.1 14582 80739 LOC100221041
1 NC_044998.1 85008 0.186722 8.158607e-02 NC_044998.1 80874 299341 CMSS1
1 NC_044998.1 95009 2.236803 3.256445e-03 NC_044998.1 80874 299341 CMSS1
1 NC_044998.1 105010 0.089978 2.846438e-01 NC_044998.1 80874 299341 CMSS1