Search code examples
awkgawk

How do you convert an array to a string in awk?


The intent of this question is to provide a robust, flexible solution to a common problem.

A frequent situation when processing text is the need to split the input into fields, manipulate fields and then recombine for printing. For example given this input:

$ cat file
    A      7  C       3

If we want to ensure that every number is printed in %.2f format while retaining the spacing before/after/between fields, then we might write (using GNU awk for the 4th arg to split()):

$ cat tst.awk
{
    # Break the record into fields, remembering the exact separator text
    # around each one (seps[0] is whatever preceded the first field).
    numFlds = split($0, flds, FS, seps)

    # Any field that contains a digit is re-rendered with 2 decimal places.
    for (idx = 1; idx <= numFlds; idx++) {
        if (flds[idx] ~ /[0-9]/)
            flds[idx] = sprintf("%.2f", flds[idx])
    }

    # Rebuild the record: leading separator first, then every field
    # followed by the separator that originally came after it.
    rebuilt = seps[0]
    for (idx = 1; idx <= numFlds; idx++)
        rebuilt = rebuilt flds[idx] seps[idx]

    print rebuilt
}

$ awk -f tst.awk file
    A      7.00  C       3.00

That last loop where we flatten an array into a string for printing is common to many awk scripts. Sometimes the separators are stored in a different array as above, sometimes they're a specific character, sometimes they aren't needed. Also, the order we want flds[] printed in could be based on numerically ascending indices like the above, or it could be descending (e.g. to mimic the UNIX tool "rev") or it could be based on the flds[] values rather than their indices.

So - is there an awk utility function that converts an array to a string using provided separator(s) in a specified order?


Solution

  • We've been having discussions with GNU awk developers about providing a function as described in the question, but until (or unless) that arrives, the user-space function below will do the job. It is gawk-specific because it relies on PROCINFO["sorted_in"]. It takes care not to add elements to the flds, seps or PROCINFO arrays that did not exist before it was called. It can be used as follows:

    $ cat tst.awk
    {
        # Split the record, capturing the separators alongside the fields.
        numFlds = split($0, flds, FS, seps)

        # Re-render every digit-bearing field with 2 decimal places.
        for (idx = 1; idx <= numFlds; idx++)
            if (flds[idx] ~ /[0-9]/)
                flds[idx] = sprintf("%.2f", flds[idx])

        # Show the different ways the flds[] array can be flattened.
        print "arr2str() usage examples:"
        print "1)", arr2str(flds,OFS)
        print "2)", arr2str(flds,seps)
        print "3)", arr2str(flds,seps,"@ind_num_desc")
        print "4)", arr2str(flds,seps,"@val_str_asc")
        print "5)", arr2str(flds,",")
    }
    
    $ awk -f arr2str.awk -f tst.awk file
    arr2str() usage examples:
    1) A 7.00 C 3.00
    2)     A      7.00  C       3.00
    3) 3.00       C  7.00      A
    4)     3.007.00  A      C
    5) A,7.00,C,3.00
    

    .

    $ cat arr2str.awk
    # Usage:
    #    arr2str(flds[, seps[, sortOrder]])
    #
    # flds:
    #    This function converts the mandatory "flds" array argument into a string.
    #
    # seps:
    #    If "seps" is not present then the "flds" values will simply be concatenated
    #    in the returned string.
    #
    #    If "seps" is present and is a string then that "seps" value will be inserted
    #    between each "flds" value in the returned string.
    #
    #    If "seps" is present and is an array then each "seps" value with the same index
    #    as a "flds" index will be inserted in the returned string before or after
    #    (sort order dependent) the corresponding "flds" value with that same index.
    #    - All "seps" values that do not have an index in "flds" will be inserted in
    #      the returned string before or after all of the "flds" and other "seps" values.
    #      This ensures that a "seps" array that, for example, starts at zero as a result
    #      of a previous split(str,flds,re,seps) will have its zeroth entry included.
    #
    # sortOrder:
    #    If "sortOrder" is present then it will be used as the order the "flds" values
    #    are visited in, otherwise it uses PROCINFO["sorted_in"] if set, otherwise it
    #    uses ascending numeric indices.
    #    - If the sort order is descending (ends in "desc") and "seps" is an array then
    #      the "seps" values are inserted before each "flds" value, otherwise after them.
    #
    # Example:
    #    $ cat tst.awk
    #    BEGIN {
    #        orig = ",a+b:c-d="
    #        split(orig,flds,/[^[:alpha:]]/,seps)
    #
    #        printf "orig: <%s>\n", orig
    #        printf "asc:  <%s>\n", arr2str(flds,seps)
    #        printf "desc: <%s>\n", arr2str(flds,seps,"@ind_num_desc")
    #    }
    #    $ awk -f arr2str.awk -f tst.awk
    #    orig: <,a+b:c-d=>
    #    asc:  <,a+b:c-d=>
    #    desc: <=d-c:b+a,>
    
    # Flatten "flds" into a string; the arguments are described in the usage notes above.
    #
    # Everything after "sortOrder" in the parameter list is a local variable:
    #    sortedInPresent - set if PROCINFO["sorted_in"] existed when we were called
    #    sortedInValue   - the caller's PROCINFO["sorted_in"] value, restored on return
    #    currIdx         - the "flds" or "seps" index currently being visited
    #    descending      - set if "sortOrder" ends in "desc"
    #    idxCnt          - number of "flds" values already output (scalar "seps" only)
    #    outStr          - the string being built, which is the return value
    function arr2str(flds, seps, sortOrder,      sortedInPresent, sortedInValue, currIdx, descending, idxCnt, outStr) {

        # Save the caller's traversal order so it can be put back exactly as it
        # was, including the case where PROCINFO["sorted_in"] did not exist.
        if ( "sorted_in" in PROCINFO ) {
            sortedInPresent = 1
            sortedInValue = PROCINFO["sorted_in"]
        }

        if ( sortOrder == "" ) {
            sortOrder = (sortedInPresent ? sortedInValue : "@ind_num_asc")
        }
        PROCINFO["sorted_in"] = sortOrder

        if ( isarray(seps) ) {
            # An array of separators.
            descending = (sortOrder ~ /desc$/)

            if ( descending ) {
                # Descending: each separator goes before its field. The "in"
                # test avoids creating "seps" entries that did not exist.
                for (currIdx in flds) {
                    outStr = outStr (currIdx in seps ? seps[currIdx] : "") flds[currIdx]
                }
            }

            # Separators with no matching field (e.g. seps[0] from split()) go
            # after everything else when descending, before it otherwise.
            for (currIdx in seps) {
                if ( !(currIdx in flds) ) {
                    outStr = outStr seps[currIdx]
                }
            }

            if ( !descending ) {
                # Not descending: each separator goes after its field.
                for (currIdx in flds) {
                    outStr = outStr flds[currIdx] (currIdx in seps ? seps[currIdx] : "")
                }
            }
        }
        else {
            # Fixed scalar separator.
            # We would use this if we could distinguish an unset variable arg from a missing arg:
            #    seps = (magic_argument_present_test == true ? seps : OFS)
            # but we can't so just use whatever value was passed in.
            for (currIdx in flds) {
                outStr = outStr (idxCnt++ ? seps : "") flds[currIdx]
            }
        }

        # Put PROCINFO["sorted_in"] back the way the caller had it.
        if ( sortedInPresent ) {
            PROCINFO["sorted_in"] = sortedInValue
        }
        else {
            delete PROCINFO["sorted_in"]
        }

        return outStr
    }