Search code examples
pythonperformanceloopstiming

How can I make my python code run faster


I am working on code that loops over multiple netcdf files (large ~28G). The netcdf files have multiple 4D variables[time, east-west, south-north, height] throughout a domain. The goal is to loop over these files and to loop over each location of all of these variables in the domain and pull certain variables to store into a large array. When there is missing or incomplete files I fill the values with 99.99. Right now I am just testing by looping over 2 daily netcdf files but for some reason it is taking forever (~14 hours). I am not sure if there is a way to optimize this code. I don't think that python should take this long for this task but maybe it is a problem with python or my code. Below is my code hopefully it is readable and any suggestions on how to make this faster is greatly appreciated:

#Domain to loop over
k_space = np.arange(0,37)
j_space = np.arange(80,170)
i_space = np.arange(200,307)

predictors_wrf=[]
names_wrf=[]

counter = 0
cdate = start_date
while cdate <= end_date:
    if cdate.month not in month_keep:
        cdate+=inc
        continue
    yy = cdate.strftime('%Y')        
    mm = cdate.strftime('%m')
    dd = cdate.strftime('%d')
    filename = wrf_path+'\wrfoutRED_d01_'+yy+'-'+mm+'-'+dd+'_'+hour_str+'_00_00'
    for i in i_space:
        for j in j_space:
            for k in k_space:
                    if os.path.isfile(filename):
                        f = nc.Dataset(filename,'r')
                        times = f.variables['Times'][1:]
                        num_lines = times.shape[0]
                        if num_lines == 144:
                            u = f.variables['U'][1:,k,j,i]
                            v = f.variables['V'][1:,k,j,i]
                            wspd = np.sqrt(u**2.+v**2.)
                            w = f.variables['W'][1:,k,j,i]
                            p = f.variables['P'][1:,k,j,i]
                            t = f.variables['T'][1:,k,j,i]
                        if num_lines < 144:
                            print "partial files for WRF: "+ filename
                            u = np.ones((144,))*99.99
                            v = np.ones((144,))*99.99
                            wspd = np.ones((144,))*99.99
                            w = np.ones((144,))*99.99
                            p = np.ones((144,))*99.99
                            t = np.ones((144,))*99.99
                    else:
                        u = np.ones((144,))*99.99
                        v = np.ones((144,))*99.99
                        wspd = np.ones((144,))*99.99
                        w = np.ones((144,))*99.99
                        p = np.ones((144,))*99.99
                        t = np.ones((144,))*99.99
                        counter=counter+1
                    predictors_wrf.append(u)
                    predictors_wrf.append(v)
                    predictors_wrf.append(wspd)
                    predictors_wrf.append(w)
                    predictors_wrf.append(p)
                    predictors_wrf.append(t)
                    u_names = 'u_'+str(k)+'_'+str(j)+'_'+str(i)
                    v_names = 'v_'+str(k)+'_'+str(j)+'_'+str(i)
                    wspd_names = 'wspd_'+str(k)+'_'+str(j)+'_'+str(i)
                    w_names = 'w_'+str(k)+'_'+str(j)+'_'+str(i)
                    p_names = 'p_'+str(k)+'_'+str(j)+'_'+str(i)
                    t_names = 't_'+str(k)+'_'+str(j)+'_'+str(i)
                    names_wrf.append(u_names)
                    names_wrf.append(v_names)
                    names_wrf.append(wspd_names)
                    names_wrf.append(w_names)
                    names_wrf.append(p_names)
                    names_wrf.append(t_names)
    cdate+=inc

Solution

  • This is a lame first pass to tighten up your forloops. Since you only use the file shape once per file, you can move the handling outside the loop which should reduce the amount of loading of data in interrupting processing. I still don't get what counter and inc do as they don't seem to be updated in the loop. You definitely want to look into repeated string concatenation performance, or how the performance of your appending to predictors_wrf and names_wrf looks as starting points

    k_space = np.arange(0,37)
    j_space = np.arange(80,170)
    i_space = np.arange(200,307)
    
    predictors_wrf=[]
    names_wrf=[]
    
    counter = 0
    cdate = start_date
    while cdate <= end_date:
        if cdate.month not in month_keep:
            cdate+=inc
            continue
        yy = cdate.strftime('%Y')        
        mm = cdate.strftime('%m')
        dd = cdate.strftime('%d')
        filename = wrf_path+'\wrfoutRED_d01_'+yy+'-'+mm+'-'+dd+'_'+hour_str+'_00_00'
        file_exists = os.path.isfile(filename)
        if file_exists:
            f = nc.Dataset(filename,'r')
            times = f.variables['Times'][1:]
            num_lines = times.shape[0]
        for i in i_space:
            for j in j_space:
                for k in k_space:
                        if file_exists:    
                            if num_lines == 144:
                                u = f.variables['U'][1:,k,j,i]
                                v = f.variables['V'][1:,k,j,i]
                                wspd = np.sqrt(u**2.+v**2.)
                                w = f.variables['W'][1:,k,j,i]
                                p = f.variables['P'][1:,k,j,i]
                                t = f.variables['T'][1:,k,j,i]
                            if num_lines < 144:
                                print "partial files for WRF: "+ filename
                                u = np.ones((144,))*99.99
                                v = np.ones((144,))*99.99
                                wspd = np.ones((144,))*99.99
                                w = np.ones((144,))*99.99
                                p = np.ones((144,))*99.99
                                t = np.ones((144,))*99.99
                        else:
                            u = np.ones((144,))*99.99
                            v = np.ones((144,))*99.99
                            wspd = np.ones((144,))*99.99
                            w = np.ones((144,))*99.99
                            p = np.ones((144,))*99.99
                            t = np.ones((144,))*99.99
                            counter=counter+1
                        predictors_wrf.append(u)
                        predictors_wrf.append(v)
                        predictors_wrf.append(wspd)
                        predictors_wrf.append(w)
                        predictors_wrf.append(p)
                        predictors_wrf.append(t)
                        u_names = 'u_'+str(k)+'_'+str(j)+'_'+str(i)
                        v_names = 'v_'+str(k)+'_'+str(j)+'_'+str(i)
                        wspd_names = 'wspd_'+str(k)+'_'+str(j)+'_'+str(i)
                        w_names = 'w_'+str(k)+'_'+str(j)+'_'+str(i)
                        p_names = 'p_'+str(k)+'_'+str(j)+'_'+str(i)
                        t_names = 't_'+str(k)+'_'+str(j)+'_'+str(i)
                        names_wrf.append(u_names)
                        names_wrf.append(v_names)
                        names_wrf.append(wspd_names)
                        names_wrf.append(w_names)
                        names_wrf.append(p_names)
                        names_wrf.append(t_names)
        cdate+=inc