Search code examples
awk, sed, text-processing

Parse text file, change some strings to camel case, add other strings - follow up question


Note that this is a follow-up to the question "Parse text file, change some strings to camel case, add other strings". The parsing rules are similar, but differ as follows:

  • The input order in the output is important.
  • The input records are separated by empty lines.
  • The identifiers immediately before the '=' character are considered similar when they share a common part at either the start or the end of the identifier.
  1. Replace the string "public static final String" with the string "export const" if that identifier occurs only once.
  2. Replace the string "public static final String" with the string "export enum" if similar identifiers (which have underscores) occur more than once. Change the part common to all similar identifiers to the camel-case string Str1. Append Str1 to the string "export enum".
  • Keep only the string difference Str2.
  • If a new string consists only of digits, prefix Str2 with the camel-case string Str1.
  • Enclose the new strings with "{}" only once.
  • Change ';' to ','.
  3. Replace the string "public static final int" with the string "export const" if that identifier occurs only once.
  4. Replace the string "public static final int" with the string "export enum" if similar identifiers (which have underscores) occur more than once.
  • Separate the parsed string by the first '_' character into two tokens. The first token is T1. The second token is changed to the camel case string Str1. Append Str1 to the string "export enum".
  • If a new string consists only of digits, prefix T1 with the camel-case string Str1.
  • Enclose the new strings with "{}" only once.
  • Change ';' to ','.

These are sample input and output.

input

    //Comment
    public static final String CUSTOMER_TYPE_CD_T_01 = "01";
    public static final String CUSTOMER_TYPE_CD_TB_02 = "02";
    public static final String CUSTOMER_TYPE_CD_TCC_03 = "03";
    public static final String CUSTOMER_TYPE_CD_TDDD_04 = "04";

    public static final String TEST_ING       = "TEST";

    //----------------------------------------
    //Comments
    //----------------------------------------
    public static final int    BEGIN_A_BB_C_D_EE_FFF_01      = 0;
    public static final int    END_A_BB_C_D_EE_FFF_01    = 2;

output

    //Comment
    export enum CustomerTypeCd {
        T_01 = "01",
        TB_02 = "02",
        TCC_03 = "03",
        TDDD_04 = "04",
    }

    export const TEST_ING = "TEST";

    //----------------------------------------
    //Comments
    //----------------------------------------
    export enum ABbCDEeFff01 {
        BEGIN = 0,
        END = 2,
    }

I modified the answer to "Parse text file, change some strings to camel case, add other strings" as follows. It handles rules 1 and 2, but fails to handle rules 3 and 4:

    # cap(s): return s with its first character kept as-is and every following
    # character lower-cased (e.g. "TYPE" -> "Type"). The first character is not
    # upper-cased, so the result is only capitalized if s already starts with
    # an upper-case letter.
    function cap(s) { return substr(s, 1, 1) tolower(substr(s, 2)) } # capitalization

    # cc(s, a): split s on "_" and fill the caller's array a:
    #   a[1] = cap() of every token except the last, concatenated (enum name)
    #   a[2] = cap() of the last token (key)
    # b, n and i are awk-style locals (extra parameters after the wide gap).
    # No rule in this listing calls cc(); the rules use cc2() and cc3().
    function cc(s, a,    b, n, i) { # return a[1] = enum name, a[2] = key
      n = split(s, b, /_/); a[1] = ""
      for(i = 1; i < n; i++) a[1] = a[1] cap(b[i]) # camel-case
      a[2] = cap(b[n]) # key
    }

    # cc2(s, a): split s on "_" and fill the caller's array a:
    #   a[1] = cap() of tokens 1 .. n-2, concatenated (enum name)
    #   a[2] = token n-1, "_", cap() of token n (key made of the last two tokens)
    # Example: "CUSTOMER_TYPE_CD_T_01" -> a[1] = "CustomerTypeCd", a[2] = "T_01".
    # b, n and i are awk-style locals (extra parameters after the wide gap).
    function cc2(s, a,    b, n, i) { # return a[1] = enum name, a[2] = key
      n = split(s, b, /_/); a[1] = ""
      for(i = 1; i < n - 1; i++) a[1] = a[1] cap(b[i]) # camel-case
      a[2] = b[n - 1] "_" cap(b[n]) # key
    }

    # cc3(s, a): split s on "_" and fill the caller's array a.
    # As written, the net result is:
    #   a[1] = first token of s, unchanged (the camel-cased value built by the
    #          loop is overwritten by the final assignment)
    #   a[2] = token n-1, "_", cap() of token n
    # enumkey is not in the parameter list, so it is a global variable.
    # b, n and i are awk-style locals (extra parameters after the wide gap).
    function cc3(s, a,    b, n, i) { # return a[1] = enum name, a[2] = key
      n = split(s, b, /_/);
      enumkey = b[1] # remember the first token
      a[1] = ""
      for(i = 1; i < n - 1; i++) a[1] = a[1] cap(b[i]) # camel-case of tokens 1 .. n-2
      a[2] = b[n - 1] "_" cap(b[n]) # key
      a[1] = enumkey # replaces the camel-cased string built above
    }

    # Rule for lines containing "public static final String".
    # Assuming default whitespace field splitting, $5 is the identifier and $NF
    # the value, as in:  public static final String NAME = "x";
    # Collects the entry into the global tables read by the END block:
    #   seen[e]            set once an enum name has been encountered
    #   ne                 number of enum/const items created so far
    #   ename[i], cname[i] enum name and original identifier of item i
    #   nk[i]              number of key/value pairs in item i
    #   key[i,j], val[i,j] j-th key and value of item i
    #   pfx[i]             prefix for item i's keys (set when a key is all digits)
    /public static final String/ {
      # compute enum name (e), key (k), value without final ";" (v)
      cc2($5, ek); e = ek[1]; k = ek[2]; v = $NF; sub(/;[[:space:]]*$/, "", v)
      # if new enum name
      if(!(e in seen)) { seen[e] = 1; ne += 1; ename[ne] = e; cname[ne] = $5 }
      # add key and value
      # (always appended to the most recently created item, index ne)
      nk[ne] += 1; key[ne,nk[ne]] = k; val[ne,nk[ne]] = v
      # key prefix if only-digits key
      if(k ~ /^[0-9]+$/) pfx[ne] = e
    }

    # Rule for lines containing "public static final int".
    # Same bookkeeping as the String rule, but the identifier is decomposed with
    # cc3() instead of cc2(). Assuming default whitespace field splitting, $5 is
    # the identifier and $NF the value, as in:  public static final int NAME = 0;
    /public static final int/ {
      # compute enum name (e), key (k), value without final ";" (v)
      cc3($5, ek); e = ek[1]; k = ek[2]; v = $NF; sub(/;[[:space:]]*$/, "", v)
        
      # if new enum name
      if(!(e in seen)) { seen[e] = 1; ne += 1; ename[ne] = e; cname[ne] = $5 }
      # add key and value
      # (always appended to the most recently created item, index ne)
      nk[ne] += 1; key[ne,nk[ne]] = k; val[ne,nk[ne]] = v
      # key prefix if only-digits key
      if(k ~ /^[0-9]+$/) pfx[ne] = e
    }

    # END: print every collected item in the order it was created.
    # An item with exactly one key/value pair is printed as
    #   export const <original identifier> = <value>;
    # any other item is printed as an "export enum <name> { ... }" block with one
    # tab-indented "<prefix><key> = <value>," line per pair.
    END {
      for(i = 1; i <= ne; i++) { # for all enum/const
        # if only one key-value pair => const
        if(nk[i] == 1) print sep "export const " cname[i] " = " val[i,1] ";"
        else { # enum
          print sep "export enum " ename[i] " {"
          # pfx[i] is empty unless a digits-only key was seen for this item
          for(j = 1; j <= nk[i]; j++) print "\t" pfx[i] key[i,j] " = " val[i,j] ","
          print "}"
        }
        # sep is unset (empty) for the first item and a newline afterwards, so
        # consecutive items are separated by one blank line
        sep = "\n"
      }
    }

Version (output of `awk -V`): GNU Awk 5.0.1, API: 2.0 (GNU MPFR 4.0.2, GNU MP 6.2.0)


---------
EDIT: here is the above code formatted legibly by `gawk -o-`:

    # Rule for lines containing "public static final String".
    # Assuming default whitespace field splitting, $5 is the identifier and $NF
    # the value. The entry is recorded in the global tables seen, ne, ename,
    # cname, nk, key, val and pfx, which the END block prints.
    /public static final String/ {
        # compute enum name (e), key (k), value without final ";" (v)
        cc2($5, ek)
        e = ek[1]
        k = ek[2]
        v = $NF
        sub(/;[[:space:]]*$/, "", v)
        # if new enum name
        if (! (e in seen)) {
            seen[e] = 1
            ne += 1
            ename[ne] = e
            cname[ne] = $5
        }
        # add key and value
        # (always appended to the most recently created item, index ne)
        nk[ne] += 1
        key[ne, nk[ne]] = k
        val[ne, nk[ne]] = v
        # key prefix if only-digits key
        if (k ~ /^[0-9]+$/) {
            pfx[ne] = e
        }
    }
    
    # Rule for lines containing "public static final int".
    # Same bookkeeping as the String rule, but the identifier ($5, assuming
    # default whitespace field splitting) is decomposed with cc3() instead of
    # cc2().
    /public static final int/ {
        # compute enum name (e), key (k), value without final ";" (v)
        cc3($5, ek)
        e = ek[1]
        k = ek[2]
        v = $NF
        sub(/;[[:space:]]*$/, "", v)
        # if new enum name
        if (! (e in seen)) {
            seen[e] = 1
            ne += 1
            ename[ne] = e
            cname[ne] = $5
        }
        # add key and value
        # (always appended to the most recently created item, index ne)
        nk[ne] += 1
        key[ne, nk[ne]] = k
        val[ne, nk[ne]] = v
        # key prefix if only-digits key
        if (k ~ /^[0-9]+$/) {
            pfx[ne] = e
        }
    }
    
    # END: print every collected item in creation order. An item with exactly
    # one key/value pair becomes "export const <identifier> = <value>;"; any
    # other item becomes an "export enum <name> { ... }" block.
    END {
        for (i = 1; i <= ne; i++) {    # for all enum/const
            # if only one key-value pair => const
            if (nk[i] == 1) {
                print sep "export const " cname[i] " = " val[i, 1] ";"
            } else {    # enum
                print sep "export enum " ename[i] " {"
                # pfx[i] is empty unless a digits-only key was seen for this item
                for (j = 1; j <= nk[i]; j++) {
                    print "\t" pfx[i] key[i, j] " = " val[i, j] ","
                }
                print "}"
            }
            # sep is unset (empty) for the first item and a newline afterwards,
            # so consecutive items are separated by one blank line
            sep = "\n"
        }
    }
    
    
    # cap(s): return s with its first character kept as-is and every following
    # character lower-cased (e.g. "TYPE" -> "Type").
    function cap(s)
    {
        return (substr(s, 1, 1) tolower(substr(s, 2)))
    }
    
    # cc(s, a): split s on "_" and fill the caller's array a with the enum name
    # (a[1]) and the key (a[2]). No rule in this listing calls cc().
    # (In the pretty-printed output a "capitalization" comment appeared at this
    # spot; it describes cap(), not cc().)
    function cc(s, a, b, n, i)
    {
        # return a[1] = enum name, a[2] = key
        n = split(s, b, /_/)
        a[1] = ""
        for (i = 1; i < n; i++) {
            a[1] = a[1] cap(b[i])    # camel-case
        }
        a[2] = cap(b[n])    # key
    }
    
    # cc2(s, a): split s on "_" and fill the caller's array a:
    #   a[1] = cap() of tokens 1 .. n-2, concatenated (enum name)
    #   a[2] = token n-1, "_", cap() of token n (key made of the last two tokens)
    # Example: "CUSTOMER_TYPE_CD_T_01" -> a[1] = "CustomerTypeCd", a[2] = "T_01".
    function cc2(s, a, b, n, i)
    {
        # return a[1] = enum name, a[2] = key
        n = split(s, b, /_/)
        a[1] = ""
        for (i = 1; i < n - 1; i++) {
            a[1] = a[1] cap(b[i])    # camel-case
        }
        a[2] = b[n - 1] "_" cap(b[n])    # key
    }
    
    # cc3(s, a): split s on "_" and fill the caller's array a.
    # As written, the net result is a[1] = first token of s (unchanged) and
    # a[2] = token n-1, "_", cap() of token n; the camel-cased string built by
    # the loop is overwritten by the final assignment.
    # enumkey is not in the parameter list, so it is a global variable.
    function cc3(s, a, b, n, i)
    {
        # return a[1] = enum name, a[2] = key
        n = split(s, b, /_/)
        enumkey = b[1]    # remember the first token
        a[1] = ""
        for (i = 1; i < n - 1; i++) {
            a[1] = a[1] cap(b[i])    # camel-case
        }
        a[2] = b[n - 1] "_" cap(b[n])    # key
        a[1] = enumkey    # replaces the camel-cased string built above
    }


Solution

  • I'd blame tiredness for those problems, because while the carefully crafted cc2 works as intended, cc3() seems to be just three typos away from working:

    • although the spec, the example, and the code that consumes cc3()'s return values all agree that public static final int is the opposite of public static final String (the unique key comes first, then the common enum name),
      the implementation inverts the two return values with a[1] = enumkey (shortly after the comment # return a[1] = enum name)
    • and, similarly, the loop concatenates (for the enum name) indices 1 to n - 2 (its condition is i < n - 1), while it should run from 2 to n
    • then there's this a[2] = b[n - 1] "_" cap(b[n]) # key which is a copy-paste from cc2(), with absolutely no use here

    So by reindexing the loop, removing the unwanted line, and assigning the key to a[2], you'll have your awk script working.

    Diff:

    -      for(i = 1; i < n - 1; i++) a[1] = a[1] cap(b[i]) # camel-case
    -      a[2] = b[n - 1] "_" cap(b[n]) # key
    -      a[1] = enumkey
    +      for(i = 2; i <= n; i++) a[1] = a[1] cap(b[i]) # camel-case
    +      a[2] = enumkey