Search code examples
Tags: bash, macos, awk, locale

Unable to print period thousands separators


I'm trying to output the total number of words for all input files with a period thousands separator. For the life of me, I cannot get the period thousands separator to print.

export LC_ALL=nl_NL.UTF-8

gwc -w "${i[@]}" | gtail -n 1 | awk '{printf "%\x27i\n", $1}'

Output: 13389

I also cannot get it to work for other locales that use a period. I can get it to work for locales that use a comma or space separator. If I just change the first line to export LC_ALL=en_US.UTF-8, the output becomes: 13,389.

I'm using a Mac, and it seems macOS just refuses to print period separators.


Solution

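  • Unrelated to your problem: use the octal escape \047, not the hex escape \x27, to represent a single quote (') in awk. Hex escapes are not defined by POSIX awk and behave differently across implementations - see http://awk.freeshell.org/PrintASingleQuote.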

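    As for your problem: pick a different locale. On my Mac the nl_NL locales define no thousands separator at all (they show up further down, among the locales whose output has no separator), so awk has nothing to insert. These are the locales on my Mac that do use a period for the separator: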

    $ for LC_ALL in $(locale -a | sort); do
        LC_ALL="$LC_ALL" awk 'BEGIN{printf "%s -> %\047i\n", ENVIRON["LC_ALL"], 13389}'
    done | grep '13\.389'
    af_ZA -> 13.389
    af_ZA.ISO8859-1 -> 13.389
    af_ZA.ISO8859-15 -> 13.389
    af_ZA.UTF-8 -> 13.389
    da_DK -> 13.389
    da_DK.ISO8859-1 -> 13.389
    da_DK.ISO8859-15 -> 13.389
    da_DK.UTF-8 -> 13.389
    el_GR -> 13.389
    el_GR.ISO8859-7 -> 13.389
    el_GR.UTF-8 -> 13.389
    fi_FI -> 13.389
    fi_FI.ISO8859-1 -> 13.389
    fi_FI.ISO8859-15 -> 13.389
    fi_FI.UTF-8 -> 13.389
    no_NO -> 13.389
    no_NO.ISO8859-1 -> 13.389
    no_NO.ISO8859-15 -> 13.389
    no_NO.UTF-8 -> 13.389
    pt_BR -> 13.389
    pt_BR.ISO8859-1 -> 13.389
    pt_BR.UTF-8 -> 13.389
    

    Just for future reference, here are the locales on my Mac that use commas:

    $ for LC_ALL in $(locale -a | sort); do LC_ALL="$LC_ALL" awk 'BEGIN{printf "%s -> %\047i\n", ENVIRON["LC_ALL"], 13389}'; done | grep '13,389'
    am_ET -> 13,389
    am_ET.UTF-8 -> 13,389
    en_AU -> 13,389
    en_AU.ISO8859-1 -> 13,389
    en_AU.ISO8859-15 -> 13,389
    en_AU.US-ASCII -> 13,389
    en_AU.UTF-8 -> 13,389
    en_CA -> 13,389
    en_CA.ISO8859-1 -> 13,389
    en_CA.ISO8859-15 -> 13,389
    en_CA.US-ASCII -> 13,389
    en_CA.UTF-8 -> 13,389
    en_GB -> 13,389
    en_GB.ISO8859-1 -> 13,389
    en_GB.ISO8859-15 -> 13,389
    en_GB.US-ASCII -> 13,389
    en_GB.UTF-8 -> 13,389
    en_IE -> 13,389
    en_IE.UTF-8 -> 13,389
    en_NZ -> 13,389
    en_NZ.ISO8859-1 -> 13,389
    en_NZ.ISO8859-15 -> 13,389
    en_NZ.US-ASCII -> 13,389
    en_NZ.UTF-8 -> 13,389
    en_US -> 13,389
    en_US.ISO8859-1 -> 13,389
    en_US.ISO8859-15 -> 13,389
    en_US.US-ASCII -> 13,389
    en_US.UTF-8 -> 13,389
    he_IL -> 13,389
    he_IL.UTF-8 -> 13,389
    ja_JP -> 13,389
    ja_JP.SJIS -> 13,389
    ja_JP.UTF-8 -> 13,389
    ja_JP.eucJP -> 13,389
    ko_KR -> 13,389
    ko_KR.CP949 -> 13,389
    ko_KR.UTF-8 -> 13,389
    ko_KR.eucKR -> 13,389
    zh_CN -> 13,389
    zh_CN.GB18030 -> 13,389
    zh_CN.GB2312 -> 13,389
    zh_CN.GBK -> 13,389
    zh_CN.UTF-8 -> 13,389
    zh_CN.eucCN -> 13,389
    zh_HK -> 13,389
    zh_HK.Big5HKSCS -> 13,389
    zh_HK.UTF-8 -> 13,389
    zh_TW -> 13,389
    zh_TW.Big5 -> 13,389
    zh_TW.UTF-8 -> 13,389
    

    and blanks:

    $ for LC_ALL in $(locale -a | sort); do LC_ALL="$LC_ALL" awk 'BEGIN{printf "%s -> %\047i\n", ENVIRON["LC_ALL"], 13389}'; done | grep '13 389'
    be_BY -> 13 389
    be_BY.CP1131 -> 13 389
    be_BY.CP1251 -> 13 389
    be_BY.ISO8859-5 -> 13 389
    be_BY.UTF-8 -> 13 389
    bg_BG -> 13 389
    bg_BG.CP1251 -> 13 389
    bg_BG.UTF-8 -> 13 389
    cs_CZ -> 13 389
    cs_CZ.ISO8859-2 -> 13 389
    cs_CZ.UTF-8 -> 13 389
    et_EE -> 13 389
    et_EE.ISO8859-15 -> 13 389
    et_EE.UTF-8 -> 13 389
    hu_HU -> 13 389
    hu_HU.ISO8859-2 -> 13 389
    hu_HU.UTF-8 -> 13 389
    hy_AM -> 13 389
    hy_AM.ARMSCII-8 -> 13 389
    hy_AM.UTF-8 -> 13 389
    is_IS -> 13 389
    is_IS.ISO8859-1 -> 13 389
    is_IS.ISO8859-15 -> 13 389
    is_IS.UTF-8 -> 13 389
    kk_KZ -> 13 389
    kk_KZ.PT154 -> 13 389
    kk_KZ.UTF-8 -> 13 389
    lt_LT -> 13 389
    lt_LT.ISO8859-13 -> 13 389
    lt_LT.ISO8859-4 -> 13 389
    lt_LT.UTF-8 -> 13 389
    pl_PL -> 13 389
    pl_PL.ISO8859-2 -> 13 389
    pl_PL.UTF-8 -> 13 389
    ro_RO -> 13 389
    ro_RO.ISO8859-2 -> 13 389
    ro_RO.UTF-8 -> 13 389
    ru_RU -> 13 389
    ru_RU.CP1251 -> 13 389
    ru_RU.CP866 -> 13 389
    ru_RU.ISO8859-5 -> 13 389
    ru_RU.KOI8-R -> 13 389
    ru_RU.UTF-8 -> 13 389
    sk_SK -> 13 389
    sk_SK.ISO8859-2 -> 13 389
    sk_SK.UTF-8 -> 13 389
    sr_YU -> 13 389
    sr_YU.ISO8859-5 -> 13 389
    sr_YU.UTF-8 -> 13 389
    sv_SE -> 13 389
    sv_SE.ISO8859-1 -> 13 389
    sv_SE.ISO8859-15 -> 13 389
    sv_SE.UTF-8 -> 13 389
    uk_UA -> 13 389
    uk_UA.ISO8859-5 -> 13 389
    uk_UA.KOI8-U -> 13 389
    uk_UA.UTF-8 -> 13 389
    

    and no separator at all:

    $ for LC_ALL in $(locale -a | sort); do LC_ALL="$LC_ALL" awk 'BEGIN{printf "%s -> %\047i\n", ENVIRON["LC_ALL"], 13389}'; done | grep '13389'
    C -> 13389
    POSIX -> 13389
    ca_ES -> 13389
    ca_ES.ISO8859-1 -> 13389
    ca_ES.ISO8859-15 -> 13389
    ca_ES.UTF-8 -> 13389
    de_AT -> 13389
    de_AT.ISO8859-1 -> 13389
    de_AT.ISO8859-15 -> 13389
    de_AT.UTF-8 -> 13389
    de_CH -> 13389
    de_CH.ISO8859-1 -> 13389
    de_CH.ISO8859-15 -> 13389
    de_CH.UTF-8 -> 13389
    de_DE -> 13389
    de_DE-A.ISO8859-1 -> 13389
    de_DE.ISO8859-1 -> 13389
    de_DE.ISO8859-15 -> 13389
    de_DE.UTF-8 -> 13389
    es_ES -> 13389
    es_ES.ISO8859-1 -> 13389
    es_ES.ISO8859-15 -> 13389
    es_ES.UTF-8 -> 13389
    eu_ES -> 13389
    eu_ES.ISO8859-1 -> 13389
    eu_ES.ISO8859-15 -> 13389
    eu_ES.UTF-8 -> 13389
    fr_BE -> 13389
    fr_BE.ISO8859-1 -> 13389
    fr_BE.ISO8859-15 -> 13389
    fr_BE.UTF-8 -> 13389
    fr_CA -> 13389
    fr_CA.ISO8859-1 -> 13389
    fr_CA.ISO8859-15 -> 13389
    fr_CA.UTF-8 -> 13389
    fr_CH -> 13389
    fr_CH.ISO8859-1 -> 13389
    fr_CH.ISO8859-15 -> 13389
    fr_CH.UTF-8 -> 13389
    fr_FR -> 13389
    fr_FR.ISO8859-1 -> 13389
    fr_FR.ISO8859-15 -> 13389
    fr_FR.UTF-8 -> 13389
    hr_HR -> 13389
    hr_HR.ISO8859-2 -> 13389
    hr_HR.UTF-8 -> 13389
    it_CH -> 13389
    it_CH.ISO8859-1 -> 13389
    it_CH.ISO8859-15 -> 13389
    it_CH.UTF-8 -> 13389
    it_IT -> 13389
    it_IT.ISO8859-1 -> 13389
    it_IT.ISO8859-15 -> 13389
    it_IT.UTF-8 -> 13389
    nl_BE -> 13389
    nl_BE.ISO8859-1 -> 13389
    nl_BE.ISO8859-15 -> 13389
    nl_BE.UTF-8 -> 13389
    nl_NL -> 13389
    nl_NL.ISO8859-1 -> 13389
    nl_NL.ISO8859-15 -> 13389
    nl_NL.UTF-8 -> 13389
    pt_PT -> 13389
    pt_PT.ISO8859-1 -> 13389
    pt_PT.ISO8859-15 -> 13389
    pt_PT.UTF-8 -> 13389
    sl_SI -> 13389
    sl_SI.ISO8859-2 -> 13389
    sl_SI.UTF-8 -> 13389
    sr_YU.ISO8859-2 -> 13389
    tr_TR -> 13389
    tr_TR.ISO8859-9 -> 13389
    tr_TR.UTF-8 -> 13389
    

    and anything else (output that matches none of the period, comma, blank, or no-separator patterns above):

    $ for LC_ALL in $(locale -a | sort); do LC_ALL="$LC_ALL" awk 'BEGIN{printf "%s -> %\047i\n", ENVIRON["LC_ALL"], 13389}'; done | grep -Ev '13[,. ]?389'
    hi_IN.ISCII-DEV -> 133,89