Tags: python, numpy, numpy-ndarray, python-iris

Why is reading from a NumPy array so slow for a netCDF file?


I have a simple netCDF file containing a cube of data, i.e. LAT, LONG, TIME as its three dimensions, and it stores temperature. The data is held as a NumPy masked array. The code below extracts it to CSV format, but it is very slow for a 20 MB file: each iteration takes about 20 seconds, and there are 4 × 548 of them, so in total 4 × 548 × 20 s = 43,840 s ≈ 731 min ≈ 12 hours.

If you look at the line with the comment TAKES_LONG_TIME, that is where most of the time goes. I believe that for each cell there is a switch from Python to C code inside NumPy. I am not sure how to resolve this in the scenario below. Please advise. Thanks.

# conda install -y -c conda-forge iris

import iris
import cf_units as unit
import numpy as np
import datetime
import urllib.request
from os import path


def make_data_object_name(dataset_name, year, month, day, hour, realization, forecast_period):
    template_string = "prods_op_{}_{:02d}{:02d}{:02d}_{:02d}_{:02d}_{:03d}.nc"
    return template_string.format(dataset_name, year, month, day, hour, realization, forecast_period)


def download_data_object(dataset_name, data_object_name):
    url = "https://s3.eu-west-2.amazonaws.com/" + dataset_name + "/" + data_object_name
    urllib.request.urlretrieve(url, data_object_name)  # save in this directory with same name


def load_data():
    filename = 'prods_op_mogreps-uk_20140101_03_02_003.nc'
    if not path.exists(filename):
        # obj_name = make_data_object_name('mogreps-uk', 2014, 1, 1, 3, 2, 3)
        download_data_object('mogreps-uk', filename)

    listofcubes = iris.load(filename)
    air_temps = listofcubes.extract('air_temperature')
    surface_temp = air_temps[0]
    dim_time, dim_lat, dim_long = "time", "grid_latitude", "grid_longitude"

    time_cords = surface_temp.coord(dim_time).points
    time_since = str(surface_temp.coord(dim_time).units)
    lat_cords = surface_temp.coord(dim_lat).points
    long_cords = surface_temp.coord(dim_long).points

    time_records = [str(unit.num2date(time_cords[i], time_since, unit.CALENDAR_STANDARD)) for i in
                    range(len(time_cords))]
    lat_records = [lat_cords[lat_recorded] for lat_recorded in range(len(lat_cords))]
    long_records = [long_cords[long_recorded] for long_recorded in range(len(long_cords))]

    print(len(time_records), len(lat_records), len(long_records))
    print(surface_temp.shape)
    data_size = len(surface_temp.shape)
    print(" File write start -->  ", datetime.datetime.now())
    with open(filename + '.curated', 'w') as filehandle:
        for t, time_record in enumerate(time_records):  # Iterate TIME - 4
            t_a = surface_temp[t] if data_size == 3 else surface_temp[t][0]
            for lat, lat_record in enumerate(lat_records):  # Iterate LAT - 548
                lat_a = t_a[lat]
                iter_start_time = datetime.datetime.now()
                lat_lines = list()
                for lng, long_record in enumerate(long_records):  # Iterate Long 421
                    data = str(lat_a[lng].data.min()) # TAKES_LONG_TIME
                    lat_lines.append(time_record + ',' + str(lat_record) + ',' + str(long_record) + ',' + data + '\n')
                filehandle.writelines(lat_lines)
                print(t, time_record, lat, lat_record, " time taken in seconds -> ",
                      (datetime.datetime.now() - iter_start_time).seconds)



if __name__ == "__main__":
    load_data()



Solution

  • When you first read in your cubes using iris.load, the actual data arrays do not get loaded into memory (see Real and lazy data in the Iris User Guide). Because you are slicing up your cube before accessing subcube.data, the actual loading is happening individually for each slice. So you are going back and accessing the NetCDF file every time your "TAKES_LONG_TIME" line is executed.
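
    For illustration (a minimal sketch, assuming the same surface_temp cube as in the question and a reasonably recent Iris version), the laziness can be observed like this:

    print(surface_temp.has_lazy_data())  # True: nothing has been read from disk yet
    sub_cube = surface_temp[0][10]       # slicing keeps the data lazy
    print(sub_cube.has_lazy_data())      # still True
    value = sub_cube.data.min()          # touching .data triggers a read from the NetCDF file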

    To load everything into memory before you start your loops, simply add a line that says

    surface_temp.data
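
    For example, a possible placement inside load_data() (a sketch, not from the original answer) would be right after the cube is selected:

    surface_temp = air_temps[0]
    surface_temp.data  # forces Iris to realise the whole array now, in a single read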
    

    This should speed things up, but may not be desirable depending on how much memory you have available. A compromise could be found by choosing a different level in your for loop to realise the data (e.g. t_a.data or lat_a.data), as sketched below.
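
    As an illustration (a sketch, not from the original answer), the compromise could look like this, reusing the variable names and surrounding setup from the question and realising one time slice per outer iteration:

    for t, time_record in enumerate(time_records):  # Iterate TIME - 4
        t_a = surface_temp[t] if data_size == 3 else surface_temp[t][0]
        t_data = t_a.data  # realise this whole time slice once (a single NetCDF read)
        for lat, lat_record in enumerate(lat_records):  # Iterate LAT - 548
            lat_lines = list()
            for lng, long_record in enumerate(long_records):  # Iterate LONG - 421
                value = str(t_data[lat, lng])  # plain NumPy indexing, no file access
                lat_lines.append(time_record + ',' + str(lat_record) + ',' + str(long_record) + ',' + value + '\n')
            filehandle.writelines(lat_lines)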