Search code examples
awkbioinformaticsbed

How to merge multiple files with two common columns, and name the added col as file name?


I'm trying to merge multiple .bed files by identifying the first two columns chr and start following this,

Merging multiple files with two common columns, and replace the blank to 0

However, I'm wondering how to make the file name a newly added column name.

$cat combineFWPS_02.sh

    BEGIN {
       for (k=1; k<ARGC; ++k)
          s = s " " 0
    }
    FNR == 1 {
       ++ARGIND
    }
    {
       key=$1 OFS $2
       if (!(key in map))
          map[key] = s
       split(map[key], a)
       a[ARGIND] = $3
       v = ""
       for (k=1; k<ARGC; ++k)
          v = v " " a[k]
       map[key]=v
    }
    END {
       for (k in map)
          print k map[k]
    }

$cat comRwps_02.sh

awkCOM="~/scripts/combineFWPS_02.sh"
## Run the jobs
time awk -f $awkCOM *.xyz.bed | sort -k1 >  13jLiC.xyz.txt 

The input files look like this:

FF85561.xyz.bed:

chr1 111001 234
chr2 22099  108
chr5 463100 219

FF85574.xyz.bed:

chr1 111001 42
chr1 430229 267
chr5 663800 319

FF85631.xyz.bed:

chr1 111001 92
chr3 22099  144
chr5 663800 311

FF85717.xyz.bed:

chr1 111001 129
chr1 157901 79
chr2 22099  442

The expected output file would be

$head 13jLiC.xyz.txt

chr    start    FF85561    FF85574    FF85631    FF85717
chr1   111001    234         42          92         129
chr1   157901      0          0           0          79
chr1   430229      0        267           0           0
chr2    22099    108          0           0         442
chr3    22099      0          0         144           0
chr5   463100    219          0           0           0
chr5   663800      0        319         311           0

Solution

  • Using any awk:

    $ cat tst.awk
    BEGIN {
        OFS = "\t"
        vals[++numRows,++numCols] = "chr"
        vals[numRows,++numCols] = "start"
    }
    FNR == 1 {
        val = FILENAME
        sub(/\..*/,"",val)
        vals[1,++numCols] = val
    }
    {
        key = $1 FS $2
        if ( !(key in key2rowNr) ) {
            key2rowNr[key] = ++numRows
            vals[numRows,1] = $1
            vals[numRows,2] = $2
        }
        rowNr = key2rowNr[key]
        vals[rowNr,numCols] = $3
    }
    END {
        for ( rowNr=1; rowNr<=numRows; rowNr++ ) {
            for ( colNr=1; colNr<=numCols; colNr++ ) {
                val = ( (rowNr,colNr) in vals ? vals[rowNr,colNr] : 0 )
                row = ( colNr>1 ? row OFS : "" ) val
            }
            print row
        }
    }
    

    $ awk -f tst.awk *.bed
    chr     start   FF85561 FF85574 FF85631 FF85717
    chr1    111001  234     42      92      129
    chr2    22099   108     0       0       442
    chr5    463100  219     0       0       0
    chr1    430229  0       267     0       0
    chr5    663800  0       319     311     0
    chr3    22099   0       0       144     0
    chr1    157901  0       0       0       79
    

    and if you want the rows sorted then you can apply a Decorate-Sort-Undecorate approach:

    $ awk -f tst.awk *.bed | awk -v OFS='\t' '{print (NR>1), $0}' | sort -k1,1n -k2,2 -k3,3n | cut -f2-
    chr     start   FF85561 FF85574 FF85631 FF85717
    chr1    111001  234     42      92      129
    chr1    157901  0       0       0       79
    chr1    430229  0       267     0       0
    chr2    22099   108     0       0       442
    chr3    22099   0       0       144     0
    chr5    463100  219     0       0       0
    chr5    663800  0       319     311     0
    

    If the chr<number> string can ever have more than 1 digit at the end and you want those sorted both alphabetically and numerically (e.g. so chr2 comes before chr10) then you'd have to change the DSU part to something like:

    $ awk -f tst.awk *.bed | awk -v OFS='\t' '{c=$1; sub(/[[:alpha:]]+/,"&" OFS,c); print (NR>1), c, $0}' | sort -k1,1n -k2,2 -k3,3 -k5,5n | cut -f4-
    chr     start   FF85561 FF85574 FF85631 FF85717
    chr1    111001  234     42      92      129
    chr1    157901  0       0       0       79
    chr1    430229  0       267     0       0
    chr2    22099   108     0       0       442
    chr3    22099   0       0       144     0
    chr5    463100  219     0       0       0
    chr5    663800  0       319     311     0