Search code examples
python-3.x netcdf netcdf4 measurement cdo-climate

NETCDF4 file doesn't grow beyond 2GB


I have a NETCDF4 file which doesn't grow beyond 2GB.

I am converting over 200 txt files into a single NetCDF4 file. A sample of the input data:

STATIONS_ID;MESS_DATUM;  QN;FF_10;DD_10;eor
       3660;201912150000;    3;   4.6; 170;eor
       3660;201912150010;    3;   4.2; 180;eor
       3660;201912150020;    3;   4.3; 190;eor
       3660;201912150030;    3;   5.2; 190;eor
       3660;201912150040;    3;   5.1; 190;eor
       3660;201912150050;    3;   4.8; 190;eor

The code looks like:

# Convert the text files in the current working directory to 'v5.nc'.
# Every regular file found here is treated as an input file.
files = [f for f in os.listdir('.') if os.path.isfile(f)]
count = 0 
for f in files:

    filecp = open(f, "r", encoding="ISO-8859-1")
    
    
# NC file setup
    # NOTE: this setup sits inside the loop, so it runs once per input file.
    # Mode 'w' creates 'v5.nc' anew on every iteration, so whatever an earlier
    # iteration wrote is not kept.
    mydata = netCDF4.Dataset('v5.nc', 'w', format='NETCDF4')
    
    mydata.description = 'Measurement Data'
    
    # One unlimited (size None) dimension per column of the text files.
    mydata.createDimension('STATION_ID',None)
    mydata.createDimension('MESS_DATUM',None)
    mydata.createDimension('QN',None)
    mydata.createDimension('FF_10',None)
    mydata.createDimension('DD_10',None)
    
    # One 1-D variable per column, each defined on its own dimension.
    STATION_ID = mydata.createVariable('STATION_ID',np.short,('STATION_ID'))
    MESS_DATUM = mydata.createVariable('MESS_DATUM',np.long,('MESS_DATUM'))
    QN = mydata.createVariable('QN',np.byte,('QN'))
    FF_10 = mydata.createVariable('FF_10',np.float64,('FF_10'))
    DD_10 = mydata.createVariable('DD_10',np.short,('DD_10'))
    
    STATION_ID.units = ''
    MESS_DATUM.units = 'Central European Time yyyymmddhhmi'
    QN.units = ''
    FF_10.units = 'meters per second'
    DD_10.units = "degree"
    
    # Parse the ';'-separated file; .values turns the table into a NumPy array.
    txtdata = pd.read_csv(filecp, delimiter=';').values
    
    #txtdata = np.genfromtxt(filecp, dtype=None, delimiter=';', names=True, encoding=None)
    if len(txtdata) > 0:
        
        # Re-wrap the array so that columns can be addressed by position (0..4).
        df = pd.DataFrame(txtdata)

        sh = txtdata.shape
        print("txtdata shape is ", sh)
    
        # [:] assigns the whole column of this one input file to each variable.
        mydata['STATION_ID'][:] = df[0]
        mydata['MESS_DATUM'][:] = df[1]
        mydata['QN'][:] = df[2]
        mydata['FF_10'][:] = df[3]
        mydata['DD_10'][:] = df[4]
    
        
    # Close the dataset and the input file at the end of each iteration.
    mydata.close()
    filecp.close()
    # Number of input files handled so far; not read anywhere in the visible code.
    count +=1

Solution

  • Your problem is that you re-create the same file on every pass through the loop, so each pass overwrites what the previous one wrote. The output file can therefore never hold more than the data of a single input file.

    Open the file once, before the loop, and append each new batch of data to the end of the netCDF data arrays.

    If you get 124 values in the first file, you put:

    mydata['STATION_ID'][0:124] = df[0]

    and if you then get 224 values from the second file, you put:

    mydata['STATION_ID'][124:124+224] = df[0]

    So, assuming the data files have been downloaded from https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/recent/ into <text file path>, the complete script is:

    import netCDF4
    import codecs
    import pandas as pd
    import os
    import numpy as np
    
    
    # Create the output file exactly once, before any input file is read, so
    # that every input file can be appended to the same dataset.
    mydata = netCDF4.Dataset('v5.nc', 'w', format='NETCDF4')
    mydata.description = 'Wind Measurement Data'

    # One unlimited (size None) dimension per column, so each variable can keep
    # growing as further input files are appended.
    for dim_name in ('STATION_ID', 'MESS_DATUM', 'QN', 'FF_10', 'DD_10'):
        mydata.createDimension(dim_name, None)

    # Dimensions are passed as real one-element tuples: ('NAME') without the
    # trailing comma is just a string.
    STATION_ID = mydata.createVariable('STATION_ID', np.short, ('STATION_ID',))
    # Timestamps such as 201912150000 do not fit into 32 bits. np.long is not
    # available in every NumPy release and its width is platform dependent, so
    # the 64-bit type is requested explicitly.
    MESS_DATUM = mydata.createVariable('MESS_DATUM', np.int64, ('MESS_DATUM',))
    QN = mydata.createVariable('QN', np.byte, ('QN',))
    FF_10 = mydata.createVariable('FF_10', np.float64, ('FF_10',))
    DD_10 = mydata.createVariable('DD_10', np.short, ('DD_10',))

    STATION_ID.units = ''
    MESS_DATUM.units = 'Central European Time yyyymmddhhmi'
    QN.units = ''
    FF_10.units = 'meters per second'
    DD_10.units = "degree"
    # Append every text file found in `fpath` to the already opened dataset
    # `mydata`, one file after the other.
    # NOTE(review): `mydata` should be closed with mydata.close() once all files
    # have been appended, unless the surrounding script already does so.

    # Placeholder: set this to the directory holding the downloaded text files.
    fpath = '<text file path>'
    # Keep regular files only (open() fails on a sub-directory) and sort them,
    # because os.listdir() returns names in arbitrary order.
    files = sorted(
        f for f in os.listdir(fpath) if os.path.isfile(os.path.join(fpath, f))
    )
    count = 0 
    mydata_startindex = 0  # next free index in the netCDF variables
    # netCDF variable name -> column header in the text files (padding removed).
    column_for_variable = {
        'STATION_ID': 'STATIONS_ID',
        'MESS_DATUM': 'MESS_DATUM',
        'QN': 'QN',
        'FF_10': 'FF_10',
        'DD_10': 'DD_10',
    }
    for f in files:
        # os.path.join supplies the path separator that plain `fpath + f` loses
        # when fpath has no trailing slash; the with-block closes the handle.
        with open(os.path.join(fpath, f), "r", encoding="ISO-8859-1") as filecp:
            txtdata = pd.read_csv(filecp, delimiter=';')
        chunksize = len(txtdata)
        if chunksize > 0:
            # Headers are space-padded in the files (e.g. '  QN'); strip them so
            # the lookup does not depend on the exact amount of padding.
            txtdata.columns = txtdata.columns.str.strip()
            mydata_stopindex = mydata_startindex + chunksize
            for varname, column in column_for_variable.items():
                # Writing past the current end extends the unlimited dimension.
                mydata[varname][mydata_startindex:mydata_stopindex] = txtdata[column].to_numpy()
            # The next file continues where this one stopped.
            mydata_startindex = mydata_stopindex