I have a NetCDF4 file which doesn't grow beyond 2 GB.
I am converting over 200 txt files into a single NetCDF4 file. The input data looks like this:
STATIONS_ID;MESS_DATUM; QN;FF_10;DD_10;eor
3660;201912150000; 3; 4.6; 170;eor
3660;201912150010; 3; 4.2; 180;eor
3660;201912150020; 3; 4.3; 190;eor
3660;201912150030; 3; 5.2; 190;eor
3660;201912150040; 3; 5.1; 190;eor
3660;201912150050; 3; 4.8; 190;eor
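Note that pandas keeps the leading spaces from this header in the column names, so the quality column comes out as ' QN', not 'QN'. A quick check (io.StringIO stands in for one of the txt files):

import io
import pandas as pd

sample = ("STATIONS_ID;MESS_DATUM; QN;FF_10;DD_10;eor\n"
          "3660;201912150000;   3;   4.6; 170;eor\n")
print(pd.read_csv(io.StringIO(sample), delimiter=';').columns.tolist())
# ['STATIONS_ID', 'MESS_DATUM', ' QN', 'FF_10', 'DD_10', 'eor']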
The code looks like:
files = [f for f in os.listdir('.') if os.path.isfile(f)]
count = 0
for f in files:
    filecp = open(f, "r", encoding="ISO-8859-1")

    # NC file setup
    mydata = netCDF4.Dataset('v5.nc', 'w', format='NETCDF4')
    mydata.description = 'Measurement Data'

    mydata.createDimension('STATION_ID', None)
    mydata.createDimension('MESS_DATUM', None)
    mydata.createDimension('QN', None)
    mydata.createDimension('FF_10', None)
    mydata.createDimension('DD_10', None)

    STATION_ID = mydata.createVariable('STATION_ID', np.short, ('STATION_ID',))
    MESS_DATUM = mydata.createVariable('MESS_DATUM', np.long, ('MESS_DATUM',))
    QN = mydata.createVariable('QN', np.byte, ('QN',))
    FF_10 = mydata.createVariable('FF_10', np.float64, ('FF_10',))
    DD_10 = mydata.createVariable('DD_10', np.short, ('DD_10',))

    STATION_ID.units = ''
    MESS_DATUM.units = 'Central European Time yyyymmddhhmi'
    QN.units = ''
    FF_10.units = 'meters per second'
    DD_10.units = 'degree'

    txtdata = pd.read_csv(filecp, delimiter=';').values
    # txtdata = np.genfromtxt(filecp, dtype=None, delimiter=';', names=True, encoding=None)
    if len(txtdata) > 0:
        df = pd.DataFrame(txtdata)
        sh = txtdata.shape
        print("txtdata shape is ", sh)
        mydata['STATION_ID'][:] = df[0]
        mydata['MESS_DATUM'][:] = df[1]
        mydata['QN'][:] = df[2]
        mydata['FF_10'][:] = df[3]
        mydata['DD_10'][:] = df[4]
    mydata.close()
    filecp.close()
    count += 1
Your problem is that you re-create the same file on every iteration of the loop: opening 'v5.nc' in 'w' mode truncates it, so each text file overwrites the previous one and the result can never hold more than one input file's worth of data.
Open the file once, before the loop, and append each file's rows to the end of the netCDF variables.
If the first file yields 124 rows, you write
mydata['STATION_ID'][0:124] = df[0]
and if the second file yields 224 rows, you write
mydata['STATION_ID'][124:124+224] = df[0]
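Variables on an unlimited dimension grow automatically whenever you assign past their current end, so no explicit resizing is needed. A minimal standalone sketch (demo.nc is just a throwaway file name):

import numpy as np
import netCDF4

ds = netCDF4.Dataset('demo.nc', 'w', format='NETCDF4')
ds.createDimension('row', None)                  # unlimited dimension
v = ds.createVariable('v', np.float64, ('row',))
v[0:3] = [1.0, 2.0, 3.0]                         # length is now 3
v[3:5] = [4.0, 5.0]                              # appended, length is now 5
print(len(ds.dimensions['row']))                 # -> 5
ds.close()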
So, assuming the data files have been downloaded from https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/recent/ to <text file path>, the whole conversion becomes:
import netCDF4
import os
import numpy as np
import pandas as pd

mydata = netCDF4.Dataset('v5.nc', 'w', format='NETCDF4')
mydata.description = 'Wind Measurement Data'

mydata.createDimension('STATION_ID', None)
mydata.createDimension('MESS_DATUM', None)
mydata.createDimension('QN', None)
mydata.createDimension('FF_10', None)
mydata.createDimension('DD_10', None)

STATION_ID = mydata.createVariable('STATION_ID', np.short, ('STATION_ID',))
# fixed 64-bit int: yyyymmddhhmi stamps overflow a 32-bit C long,
# and np.long is platform-dependent (removed entirely in NumPy 1.24)
MESS_DATUM = mydata.createVariable('MESS_DATUM', np.int64, ('MESS_DATUM',))
QN = mydata.createVariable('QN', np.byte, ('QN',))
FF_10 = mydata.createVariable('FF_10', np.float64, ('FF_10',))
DD_10 = mydata.createVariable('DD_10', np.short, ('DD_10',))

STATION_ID.units = ''
MESS_DATUM.units = 'Central European Time yyyymmddhhmi'
QN.units = ''
FF_10.units = 'meters per second'
DD_10.units = 'degree'

fpath = '<text file path>'
files = [f for f in os.listdir(fpath)]
mydata_startindex = 0
for f in files:
    filecp = open(os.path.join(fpath, f), "r", encoding="ISO-8859-1")
    txtdata = pd.read_csv(filecp, delimiter=';')
    filecp.close()
    chunksize = len(txtdata)
    if chunksize > 0:
        # pandas keeps the leading space from the header, hence ' QN'
        mydata['STATION_ID'][mydata_startindex:mydata_startindex+chunksize] = txtdata['STATIONS_ID']
        mydata['MESS_DATUM'][mydata_startindex:mydata_startindex+chunksize] = txtdata['MESS_DATUM']
        mydata['QN'][mydata_startindex:mydata_startindex+chunksize] = txtdata[' QN']
        mydata['FF_10'][mydata_startindex:mydata_startindex+chunksize] = txtdata['FF_10']
        mydata['DD_10'][mydata_startindex:mydata_startindex+chunksize] = txtdata['DD_10']
        mydata_startindex += chunksize
mydata.close()
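As a quick sanity check, you can re-open the file read-only and confirm that the unlimited dimension now spans the rows of all input files:

check = netCDF4.Dataset('v5.nc', 'r')
print(len(check.dimensions['STATION_ID']), 'rows written')
check.close()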