I'm trying to merge multiple .bed
files by identifying the first two columns chr
and start
following this,
Merging multiple files with two common columns, and replace the blank to 0
However, I'm wondering how to make the file name a newly added column name.
$cat combineFWPS_02.sh
BEGIN {
for (k=1; k<ARGC; ++k)
s = s " " 0
}
FNR == 1 {
++ARGIND
}
{
key=$1 OFS $2
if (!(key in map))
map[key] = s
split(map[key], a)
a[ARGIND] = $3
v = ""
for (k=1; k<ARGC; ++k)
v = v " " a[k]
map[key]=v
}
END {
for (k in map)
print k map[k]
}
$cat comRwps_02.sh
awkCOM="~/scripts/combineFWPS_02.sh"
## Run the jobs
time awk -f $awkCOM *.xyz.bed | sort -k1 > 13jLiC.xyz.txt
The input files look like this:
FF85561.xyz.bed:
chr1 111001 234
chr2 22099 108
chr5 463100 219
FF85574.xyz.bed:
chr1 111001 42
chr1 430229 267
chr5 663800 319
FF85631.xyz.bed:
chr1 111001 92
chr3 22099 144
chr5 663800 311
FF85717.xyz.bed:
chr1 111001 129
chr1 157901 79
chr2 22099 442
The expected output file would be
$head 13jLiC.xyz.txt
chr start FF85561 FF85574 FF85631 FF85717
chr1 111001 234 42 92 129
chr1 157901 0 0 0 79
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr3 22099 0 0 144 0
chr5 463100 219 0 0 0
chr5 663800 0 319 311 0
Using any awk:
$ cat tst.awk
BEGIN {
OFS = "\t"
vals[++numRows,++numCols] = "chr"
vals[numRows,++numCols] = "start"
}
FNR == 1 {
val = FILENAME
sub(/\..*/,"",val)
vals[1,++numCols] = val
}
{
key = $1 FS $2
if ( !(key in key2rowNr) ) {
key2rowNr[key] = ++numRows
vals[numRows,1] = $1
vals[numRows,2] = $2
}
rowNr = key2rowNr[key]
vals[rowNr,numCols] = $3
}
END {
for ( rowNr=1; rowNr<=numRows; rowNr++ ) {
for ( colNr=1; colNr<=numCols; colNr++ ) {
val = ( (rowNr,colNr) in vals ? vals[rowNr,colNr] : 0 )
row = ( colNr>1 ? row OFS : "" ) val
}
print row
}
}
$ awk -f tst.awk *.bed
chr start FF85561 FF85574 FF85631 FF85717
chr1 111001 234 42 92 129
chr2 22099 108 0 0 442
chr5 463100 219 0 0 0
chr1 430229 0 267 0 0
chr5 663800 0 319 311 0
chr3 22099 0 0 144 0
chr1 157901 0 0 0 79
and if you want the rows sorted then you can apply a Decorate-Sort-Undecorate approach:
$ awk -f tst.awk *.bed | awk -v OFS='\t' '{print (NR>1), $0}' | sort -k1,1n -k2,2 -k3,3n | cut -f2-
chr start FF85561 FF85574 FF85631 FF85717
chr1 111001 234 42 92 129
chr1 157901 0 0 0 79
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr3 22099 0 0 144 0
chr5 463100 219 0 0 0
chr5 663800 0 319 311 0
If the chr<number>
string can ever have more than 1 digit at the end and you want those sorted both alphabetically and numerically (e.g. so chr2
comes before chr10
) then you'd have to change the DSU part to something like:
$ awk -f tst.awk *.bed | awk -v OFS='\t' '{c=$1; sub(/[[:alpha:]]+/,"&" OFS,c); print (NR>1), c, $0}' | sort -k1,1n -k2,2 -k3,3 -k5,5n | cut -f4-
chr start FF85561 FF85574 FF85631 FF85717
chr1 111001 234 42 92 129
chr1 157901 0 0 0 79
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr3 22099 0 0 144 0
chr5 463100 219 0 0 0
chr5 663800 0 319 311 0