Search code examples
awkposix

implementing a `maxsplit` function in POSIX awk


I'm trying to implement an awk function for splitting a string into an array; the fundamental difference with the built-in split is that one is able to limit the number of "splits" that are performed, just like in Python's str.split

The prototype would be something like this:

maxsplit(s, a, n[, fs ])
Split the string s into array elements, performing at most n splits (thus, the array will have at most n+1 elements a[1], a[2], ..., a[n+1]), and return the actual number of elements in the array. All elements of the array shall be deleted before the split is performed. The separation shall be done with the ERE fs or with the field separator FS when fs is not provided.
At the moment, the effects of a null string as fs value, or a negative number as n value, are unspecified.

Here's the code I came-up with:
edit1: fixed the border cases pointed-out by @markp-fuso and @Daweo
edit2: added handling of non-provided fs argument (thanks @EdMorton)

function maxsplit(s,a,n,fs,    i,j) {
    delete a
    if (fs == 0 && fs == "")
        fs = FS
    if (fs == " ")
    {
        for (i = 1; (i <= n) && match(s,/[^[:space:]]+/); i++) {
            a[i] = substr(s,RSTART,RLENGTH)
            s = substr(s,RSTART+RLENGTH)
        }
        sub(/^[[:space:]]+/, "", s)
        if (s != "")
           a[i] = s
        else
           --i
    }
    else if (length(fs) == 1)
    {
        for (i = 1; (i <= n) && (j = index(s,fs)); i++) {
            a[i] = substr(s,1,j-1)
            s = substr(s,j+1)
        }
        a[i] = s
    }
    else
    {
        i = 1
        while ( (i <= n) && (s != "") && match(s,fs) ) {
            if (RLENGTH) {
                a[i] = a[i] substr(s,1,RSTART-1)
                s = substr(s,RSTART+RLENGTH)
                ++i
            } else {
                a[i] = a[i] substr(s,1,1)
                s = substr(s,2)
            }
        }
        a[i] = s
    }
    return i
}

As you can see, I wrote multiple code paths for handling the different types of fs. My question is: what other edge cases need to be handled?


Here's a set of test cases:
edit1: reordered the input/output fields and added a few more tests.
edit2: converted the input to a colon-delimited format (instead of xargs format), and added a few more tests.
note: I'm still looking for test-cases that could be tricky to handle.

#inputString:fieldSep:maxSplit
: :0
: :1
: :100
:,:0
:,:1
:,:100
:[ ]:0
:[ ]:1
:[ ]:100
,:,:0
,:,:1
,:,:2
,:,:100
  foo bar   baz    : :0
  foo bar   baz    : :1
  foo bar   baz    : :2
  foo bar   baz    : :3
  foo bar   baz    : :100
foo=bar=baz:=:0
foo=bar=baz:=:1
foo=bar=baz:=:2
foo=bar=baz:=:3
foo=bar=baz:=:100
foo|bar|baz|:|:0
foo|bar|baz|:|:1
foo|bar|baz|:|:2
foo|bar|baz|:|:3
foo|bar|baz|:|:4
foo|bar|baz|:|:100
 foo  bar :[ ]:0
 foo  bar :[ ]:1
 foo  bar :[ ]:2
 foo  bar :[ ]:3
 foo  bar :[ ]:4
 foo  bar :[ ]:5
 foo  bar :[ ]:6
 foo  bar :[ ]:100
_.{2}__.?_[0-9]*:_+:0
_.{2}__.?_[0-9]*:_+:1
_.{2}__.?_[0-9]*:_+:2
_.{2}__.?_[0-9]*:_+:3
_.{2}__.?_[0-9]*:_+:4
_.{2}__.?_[0-9]*:_+:100
foo:^[fo]:0
foo:^[fo]:1
foo:^[fo]:2
foo:^[fo]:100
foo_bar_:_?:0
foo_bar_:_?:1
foo_bar_:_?:2
foo_bar_:_?:3
foo_bar_:_?:100

And the code wrapper for processing the test-cases.txt:

awk -F: '
    function maxsplit(s,a,n,fs,    i,j) {
        # ...
    }
    function shell_quote(s) {
        gsub(/\047/,"\047\\\047\047")
        return "\047" s "\047"
    }
    function array_quote(a    ,s,i) {
        s=""
        for (i=1;i in a;i++)
            s = s " " shell_quote(a[i])
        return "(" s " )"
    }
    NR > 1 {
        s  = $1
        fs = $2
        n  = $3
        maxsplit(s,a,n,fs)
        print "s=" shell_quote(s),    \
             "fs=" shell_quote(fs),   \
              "n=" sprintf("%-3d",n), \
              "a=" array_quote(a)
    }
' test-cases.txt

My expected output is:

s='' fs=' ' n=0   a=( )
s='' fs=' ' n=1   a=( )
s='' fs=' ' n=100 a=( )
s='' fs=',' n=0   a=( )
s='' fs=',' n=1   a=( )
s='' fs=',' n=100 a=( )
s='' fs='[ ]' n=0   a=( )
s='' fs='[ ]' n=1   a=( )
s='' fs='[ ]' n=100 a=( )
s=',' fs=',' n=0   a=( ',' )
s=',' fs=',' n=1   a=( '' '' )
s=',' fs=',' n=2   a=( '' '' )
s=',' fs=',' n=100 a=( '' '' )
s='  foo bar   baz    ' fs=' ' n=0   a=( 'foo bar   baz    ' )
s='  foo bar   baz    ' fs=' ' n=1   a=( 'foo' 'bar   baz    ' )
s='  foo bar   baz    ' fs=' ' n=2   a=( 'foo' 'bar' 'baz    ' )
s='  foo bar   baz    ' fs=' ' n=3   a=( 'foo' 'bar' 'baz' )
s='  foo bar   baz    ' fs=' ' n=100 a=( 'foo' 'bar' 'baz' )
s='foo=bar=baz' fs='=' n=0   a=( 'foo=bar=baz' )
s='foo=bar=baz' fs='=' n=1   a=( 'foo' 'bar=baz' )
s='foo=bar=baz' fs='=' n=2   a=( 'foo' 'bar' 'baz' )
s='foo=bar=baz' fs='=' n=3   a=( 'foo' 'bar' 'baz' )
s='foo=bar=baz' fs='=' n=100 a=( 'foo' 'bar' 'baz' )
s='foo|bar|baz|' fs='|' n=0   a=( 'foo|bar|baz|' )
s='foo|bar|baz|' fs='|' n=1   a=( 'foo' 'bar|baz|' )
s='foo|bar|baz|' fs='|' n=2   a=( 'foo' 'bar' 'baz|' )
s='foo|bar|baz|' fs='|' n=3   a=( 'foo' 'bar' 'baz' '' )
s='foo|bar|baz|' fs='|' n=4   a=( 'foo' 'bar' 'baz' '' )
s='foo|bar|baz|' fs='|' n=100 a=( 'foo' 'bar' 'baz' '' )
s=' foo  bar ' fs='[ ]' n=0   a=( ' foo  bar ' )
s=' foo  bar ' fs='[ ]' n=1   a=( '' 'foo  bar ' )
s=' foo  bar ' fs='[ ]' n=2   a=( '' 'foo' ' bar ' )
s=' foo  bar ' fs='[ ]' n=3   a=( '' 'foo' '' 'bar ' )
s=' foo  bar ' fs='[ ]' n=4   a=( '' 'foo' '' 'bar' '' )
s=' foo  bar ' fs='[ ]' n=5   a=( '' 'foo' '' 'bar' '' )
s=' foo  bar ' fs='[ ]' n=6   a=( '' 'foo' '' 'bar' '' )
s=' foo  bar ' fs='[ ]' n=100 a=( '' 'foo' '' 'bar' '' )
s='_.{2}__.?_[0-9]*' fs='_+' n=0   a=( '_.{2}__.?_[0-9]*' )
s='_.{2}__.?_[0-9]*' fs='_+' n=1   a=( '' '.{2}__.?_[0-9]*' )
s='_.{2}__.?_[0-9]*' fs='_+' n=2   a=( '' '.{2}' '.?_[0-9]*' )
s='_.{2}__.?_[0-9]*' fs='_+' n=3   a=( '' '.{2}' '.?' '[0-9]*' )
s='_.{2}__.?_[0-9]*' fs='_+' n=4   a=( '' '.{2}' '.?' '[0-9]*' )
s='_.{2}__.?_[0-9]*' fs='_+' n=100 a=( '' '.{2}' '.?' '[0-9]*' )
s='foo' fs='^[fo]' n=0   a=( 'foo' )
s='foo' fs='^[fo]' n=1   a=( '' 'oo' )
s='foo' fs='^[fo]' n=2   a=( '' 'oo' )
s='foo' fs='^[fo]' n=100 a=( '' 'oo' )
s='foo_bar_' fs='_?' n=0   a=( 'foo_bar_' )
s='foo_bar_' fs='_?' n=1   a=( 'foo' 'bar_' )
s='foo_bar_' fs='_?' n=2   a=( 'foo' 'bar' '' )
s='foo_bar_' fs='_?' n=3   a=( 'foo' 'bar' '' )
s='foo_bar_' fs='_?' n=100 a=( 'foo' 'bar' '' )

Solution

  • A while match approach will fail with, for example, maxsplit("ab",_,2,/^[ab]/) because the loop will match a in the first iteration and b in the second one, which isn't what a user expects from a regex like ^[ab]. IMHO there's no simple way to fix this problem.

    Instead, here's a different approach, which is to build an ERE that matches the first N fields (after a call to awk's split):

    edit1: Duplicated the backslash in the regex of ere_escape, for compatibility with GNU awk (thanks @EdMorton)
    edit2: corrected a typo bug found by @EdMorton

    UPDATE: I'm using ere_parenthesize from here for enclosing fs in parentheses safely.

    function ere_parenthesize(...) {
        # see https://stackoverflow.com/a/78556520/3387716
    }
    function ere_escape(s) {
       gsub(/[.[\\(*^$+?{|]/, "\\\\&", s)
       return s
    }
    function maxsplit(str,arr,nf,fs    ,n,i,r) {
        if (fs == 0 && fs == "")
            fs = FS
        n = split(str, arr, fs)
        if (n > nf) {
            r = ""
            if (length(fs) == 1) {
                if (fs == " ") {
                    fs = "[[:space:]]+"
                    r = "^[[:space:]]*"
                } else
                    fs = ere_escape(fs)
            }
            fs = ere_parenthesize(fs)
            for (i = 1; i <= nf; i++) {
                r = r ere_escape(arr[i]) fs
            }
            sub(r,"",str)
            arr[nf+1] = str
            for (i = nf+2; i <= n; i++) {
                delete arr[i]
            }
            n = nf+1
        }
        return n
    }
    

    Here are the results:

    s='' fs=' ' n=0   a=( )
    s='' fs=' ' n=1   a=( )
    s='' fs=' ' n=100 a=( )
    s='' fs=',' n=0   a=( )
    s='' fs=',' n=1   a=( )
    s='' fs=',' n=100 a=( )
    s='' fs='[ ]' n=0   a=( )
    s='' fs='[ ]' n=1   a=( )
    s='' fs='[ ]' n=100 a=( )
    s=',' fs=',' n=0   a=( ',' )
    s=',' fs=',' n=1   a=( '' '' )
    s=',' fs=',' n=2   a=( '' '' )
    s=',' fs=',' n=100 a=( '' '' )
    s='  foo bar   baz    ' fs=' ' n=0   a=( 'foo bar   baz    ' )
    s='  foo bar   baz    ' fs=' ' n=1   a=( 'foo' 'bar   baz    ' )
    s='  foo bar   baz    ' fs=' ' n=2   a=( 'foo' 'bar' 'baz    ' )
    s='  foo bar   baz    ' fs=' ' n=3   a=( 'foo' 'bar' 'baz' )
    s='  foo bar   baz    ' fs=' ' n=100 a=( 'foo' 'bar' 'baz' )
    s='foo=bar=baz' fs='=' n=0   a=( 'foo=bar=baz' )
    s='foo=bar=baz' fs='=' n=1   a=( 'foo' 'bar=baz' )
    s='foo=bar=baz' fs='=' n=2   a=( 'foo' 'bar' 'baz' )
    s='foo=bar=baz' fs='=' n=3   a=( 'foo' 'bar' 'baz' )
    s='foo=bar=baz' fs='=' n=100 a=( 'foo' 'bar' 'baz' )
    s='foo|bar|baz|' fs='|' n=0   a=( 'foo|bar|baz|' )
    s='foo|bar|baz|' fs='|' n=1   a=( 'foo' 'bar|baz|' )
    s='foo|bar|baz|' fs='|' n=2   a=( 'foo' 'bar' 'baz|' )
    s='foo|bar|baz|' fs='|' n=3   a=( 'foo' 'bar' 'baz' '' )
    s='foo|bar|baz|' fs='|' n=4   a=( 'foo' 'bar' 'baz' '' )
    s='foo|bar|baz|' fs='|' n=100 a=( 'foo' 'bar' 'baz' '' )
    s=' foo  bar ' fs='[ ]' n=0   a=( ' foo  bar ' )
    s=' foo  bar ' fs='[ ]' n=1   a=( '' 'foo  bar ' )
    s=' foo  bar ' fs='[ ]' n=2   a=( '' 'foo' ' bar ' )
    s=' foo  bar ' fs='[ ]' n=3   a=( '' 'foo' '' 'bar ' )
    s=' foo  bar ' fs='[ ]' n=4   a=( '' 'foo' '' 'bar' '' )
    s=' foo  bar ' fs='[ ]' n=5   a=( '' 'foo' '' 'bar' '' )
    s=' foo  bar ' fs='[ ]' n=6   a=( '' 'foo' '' 'bar' '' )
    s=' foo  bar ' fs='[ ]' n=100 a=( '' 'foo' '' 'bar' '' )
    s='_.{2}__.?_[0-9]*' fs='_+' n=0   a=( '_.{2}__.?_[0-9]*' )
    s='_.{2}__.?_[0-9]*' fs='_+' n=1   a=( '' '.{2}__.?_[0-9]*' )
    s='_.{2}__.?_[0-9]*' fs='_+' n=2   a=( '' '.{2}' '.?_[0-9]*' )
    s='_.{2}__.?_[0-9]*' fs='_+' n=3   a=( '' '.{2}' '.?' '[0-9]*' )
    s='_.{2}__.?_[0-9]*' fs='_+' n=4   a=( '' '.{2}' '.?' '[0-9]*' )
    s='_.{2}__.?_[0-9]*' fs='_+' n=100 a=( '' '.{2}' '.?' '[0-9]*' )
    s='foo' fs='^[fo]' n=0   a=( 'foo' )
    s='foo' fs='^[fo]' n=1   a=( '' 'oo' )
    s='foo' fs='^[fo]' n=2   a=( '' 'oo' )
    s='foo' fs='^[fo]' n=100 a=( '' 'oo' )
    s='foo_bar_' fs='_?' n=0   a=( 'foo_bar_' )
    s='foo_bar_' fs='_?' n=1   a=( 'foo' 'bar_' )
    s='foo_bar_' fs='_?' n=2   a=( 'foo' 'bar' '' )
    s='foo_bar_' fs='_?' n=3   a=( 'foo' 'bar' '' )
    s='foo_bar_' fs='_?' n=100 a=( 'foo' 'bar' '' )