Search code examples
arrayslinuxbashperlawk

How can I improve the execution speed of my script


I have a file containing about 15 million records. Below is a sample of the data

99001597,555555555555,3211,Njro_Kaniani,test,NORTH,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,IN2017,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001679,555555555555,1756,Bnju_HTT,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2012,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001680,555555555555,1108,Temoni_Kiara,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2028,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001683,555555555555,1604,Blue_Bay,Nzindo,,Y,COAST,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1820,Sgerea_Makuka,Salaam,,N,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1184,Makka,Salaam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1381,Leaders_Club,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1037,Mbez,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1041,Ngano,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1313,Kichangani,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001684,555555555555,4975,Nyugusu Campp2,Test,test,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,1041,Ngano,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,1420,Airport_Macro,Salaam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,3147,Technical_Nzoti,test,ORTH,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,4488,Lumala,Mwnza,,Y,Nyeka,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,4975,Nyarugusu Campp2,Kigoma,,Y,Nyeka,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google

I am using the below script to count the occurrence of lines matching certain conditions. The problem is this script is very slow. I get about 200 output lines in a day. Currently, my program will read the 15 million record file 36,000 times. This is very inefficient (slow!!). How can I rework my script to just read the very large file just one time?

Desired output

1037,0,0,1,1,1,1,1,1,1,1,1,1
1041,0,0,2,2,2,2,2,2,2,2,2,2
1108,0,0,1,1,1,1,1,1,1,1,1,1
1184,0,0,1,1,1,1,1,1,1,1,1,1
1313,0,0,1,1,1,1,1,1,1,1,1,1
1381,0,0,1,1,1,1,1,1,1,1,1,1
1420,0,0,1,1,1,1,1,1,1,1,1,1
1604,0,0,1,1,1,1,1,1,1,1,1,1
1756,0,0,1,1,1,1,1,1,1,1,1,1
1820,0,0,1,1,1,1,1,1,1,1,1,1
3147,0,0,1,1,1,1,1,0,0,0,0,1
3211,0,0,1,1,1,1,1,0,0,0,0,1
4488,0,0,1,1,1,1,1,1,1,1,1,1
4975,0,0,2,2,2,2,2,1,1,0,0,1

The IDs_file file contains about 3000 records each a 4 digit number

while read i
do

twog=$(cat combined_marketing_sadm_report.csv|awk -v src=$i -F, '{if ((($10 == "Yes")||($10 == "No")) && ($3 == src) && ($9 == "No")&& ($11 == "No")) print $0;}'|wc -l)

threeg=$(cat combined_marketing_sadm_report.csv|awk -v src=$i  -F,'{if (($3 == src) &&($9 == "Yes")&& ($11 == "No")) print $0;}'|wc -l)

fourg=$(cat combined_marketing_sadm_report.csv|awk -v src=$i -F,'{if (($11 == "Yes") && ($3 == src)) print $0;}'|wc -l)
lte2100=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($13 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte800=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($14 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte700=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($15 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte1800=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($16 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte2600=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($17 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte900=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($18 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
threeg2100=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($24 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
threeg900=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($25 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
volte=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($23 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)

echo $i,$twog,$threeg,$fourg,$lte2100,$lte800,$lte700,$lte1800,$lte2600,$lte900,$threeg2100,$threeg900,$volte>>Raw_data_for_report.csv
done < IDs_file


Solution

  • Solution: put all your loops in one awk program:

    #! /usr/bin/awk -f
    
    BEGIN {
        FS=OFS=","
        if (src=="") {
            exit
        }
        split(src,arr_src,",")
        for (i in arr_src) {
            src=arr_src[i]
            twog[src]=threeg[src]=fourg[src]=lte2100[src]=lte800[src]=lte700[src]=lte1800[src]=lte2600[src]=lte900[src]=threeg2100[src]=threeg900[src]=volte[src]=0
        }
    }
    {
        for (i in arr_src) {
            src=arr_src[i]
            if ($3 == src) {
                if (($10 == "Yes" || $10 == "No") && $9 == "No" && $11 == "No") twog[src]++
                if ($9  == "Yes" && $11 == "No") threeg[src]++
                if ($11 == "Yes") fourg[src]++
                if ($13 == "Yes") lte2100[src]++
                if ($14 == "Yes") lte800[src]++
                if ($15 == "Yes") lte700[src]++
                if ($16 == "Yes") lte1800[src]++
                if ($17 == "Yes") lte2600[src]++
                if ($18 == "Yes") lte900[src]++
                if ($24 == "Yes") threeg2100[src]++
                if ($25 == "Yes") threeg900[src]++
                if ($23 == "Yes") volte[src]++
            }
        }
    }
    END {
        for (i in arr_src) {
            src=arr_src[i]
            print src,twog[src],threeg[src],fourg[src],lte2100[src],lte800[src],lte700[src],lte1800[src],lte2600[src],lte900[src],threeg2100[src],threeg900[src],volte[src]
        }
    }
    

    Call:

    ./counter.awk -v src=1037,1041,4975 combined_marketing_sadm_report.csv
    

    UPDATE

    If your src values are in a file, the script (counter-v2.awk) became:

    #! /usr/bin/awk -f
    
    BEGIN {
        FS=OFS=","
    }
    FNR == NR {
        i++
        arr_src[i] = $0
        next
    }
    FNR == 1 {
        for (i in arr_src) {
            src=arr_src[i]
            twog[src]=threeg[src]=fourg[src]=lte2100[src]=lte800[src]=lte700[src]=lte1800[src]=lte2600[src]=lte900[src]=threeg2100[src]=threeg900[src]=volte[src]=0
        }
    }
    {
        for (i in arr_src) {
            src=arr_src[i]
            if ($3 == src) {
                if (($10 == "Yes" || $10 == "No") && $9 == "No" && $11 == "No") twog[src]++
                if ($9  == "Yes" && $11 == "No") threeg[src]++
                if ($11 == "Yes") fourg[src]++
                if ($13 == "Yes") lte2100[src]++
                if ($14 == "Yes") lte800[src]++
                if ($15 == "Yes") lte700[src]++
                if ($16 == "Yes") lte1800[src]++
                if ($17 == "Yes") lte2600[src]++
                if ($18 == "Yes") lte900[src]++
                if ($24 == "Yes") threeg2100[src]++
                if ($25 == "Yes") threeg900[src]++
                if ($23 == "Yes") volte[src]++
            }
        }
    }
    END {
        for (i in arr_src) {
            src=arr_src[i]
            print src,twog[src],threeg[src],fourg[src],lte2100[src],lte800[src],lte700[src],lte1800[src],lte2600[src],lte900[src],threeg2100[src],threeg900[src],volte[src]
        }
    }
    

    File IDSs_file:

    1037
    1041
    1108
    1184
    1313
    1381
    1420
    1604
    1756
    1820
    3147
    3211
    4488
    4975
    

    Execute like this (Warning: Files order is very important):

    ./counter-v2.awk IDSs_file combined_marketing_sadm_report.csv
    

    Output:

    1037,0,0,1,1,1,1,1,1,1,1,1,1
    1041,0,0,2,2,2,2,2,2,2,2,2,2
    1108,0,0,1,1,1,1,1,1,1,1,1,1
    1184,0,0,1,1,1,1,1,1,1,1,1,1
    1313,0,0,1,1,1,1,1,1,1,1,1,1
    1381,0,0,1,1,1,1,1,1,1,1,1,1
    1420,0,0,1,1,1,1,1,1,1,1,1,1
    1604,0,0,1,1,1,1,1,1,1,1,1,1
    1756,0,0,1,1,1,1,1,1,1,1,1,1
    1820,0,0,1,1,1,1,1,1,1,1,1,1
    3147,0,0,1,1,1,1,1,0,0,0,0,1
    3211,0,0,1,1,1,1,1,0,0,0,0,1
    4488,0,0,1,1,1,1,1,1,1,1,1,1
    4975,0,0,2,2,2,2,2,1,1,0,0,1