Search code examples
pythonpandasnetcdfpython-xarraytxt

how to open several nc files, filter a desired period and write the result in a single txt file?


I have several nc files, I need to open them all. Filter the desired period and write the result of all in a single txt file.

The nc files correspond to a month (Jan, Feb, Mar...) and have four variables (temperature, dew point, u and v).

I need to assemble a table, with all the variables side by side for a specific period. For example, from January to October. The first column being temperature, the second dew point, third u and lastly v.

from netCDF4 import MFDataset
import pandas as pd
import xarray as xr
import csv
import tempfile




ds=xr.open_mfdataset('/home/milena/Documentos/dados_obs_haroldo/media_horaria/MEDIA_HORARIA_*.nc')


lat = ds.variables['lat'][:]
lon = ds.variables['lon'][:]
t2mj = ds.variables['t2mj'][:]
td2mj = ds.variables['td2mj'][:]
u10mj = ds.variables['u10mj'][:]
v10mj = ds.variables['v10mj'][:]



#Brasilia
t2mj_txt=ds.t2mj.isel(lat=153, lon=117).to_dataframe().to_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/t2mj.csv')
td2mj_txt=ds.td2mj.isel(lat=153, lon=117).to_dataframe().to_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/td2mj.csv')
u10mj_txt=ds.u10mj.isel(lat=153, lon=117).to_dataframe().to_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/u10mj.csv')
v10mj_txt=ds.v10mj.isel(lat=153, lon=117).to_dataframe().to_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/v10mj.csv')
#print(t2mj_txt)


#opem csv

t2mj_csv = pd.read_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/t2mj.csv', skipinitialspace=True)
td2mj_csv = pd.read_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/td2mj.csv', skipinitialspace=True)
u10mj_csv = pd.read_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/u10mj.csv', skipinitialspace=True)
v10mj_csv = pd.read_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/v10mj.csv', skipinitialspace=True)
#print(t2mj_csv)


#filter desired period
t2mj_date=t2mj_csv[(t2mj_csv['time'])<"2022-12-01"]
td2mj_date=td2mj_csv[(td2mj_csv['time'])<"2022-12-01"]
u10mj_date=u10mj_csv[(u10mj_csv['time'])<"2022-12-01"]
v10mj_date=v10mj_csv[(v10mj_csv['time'])<"2022-12-01"]


arquivo = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/t2mj_filter.txt", "w")
arquivo.write(t2mj_date['t2mj'].to_string())
arquivo.close()

arquivo2 = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/td2mj_filter.txt", "w")
arquivo2.write(td2mj_date['td2mj'].to_string())
arquivo2.close()

arquivo3 = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/u10mj_filter.txt", "w")
arquivo3.write(u10mj_date['u10mj'].to_string())
arquivo3.close()

arquivo4 = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/v10mj_filter.txt", "w")
arquivo4.write(v10mj_date['v10mj'].to_string())
arquivo4.close()


file_list=['/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/t2mj_filter.txt', '/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/td2mj_filter.txt', '/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/u10mj_filter.txt', '/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/v10mj_filter.txt']
dfe = pd.DataFrame()
for file in file_list:
    temp_dfe = pd.read_csv(file, header=None, names=[file[:-4]])
    dfe = pd.concat([dfe, temp_dfe], axis=1)
    

arquivo5 = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/teste.txt", "w")
arquivo5.write(dfe.to_string())
arquivo5.close()

my result looks like this:

enter image description here

I would like it to look like this:

enter image description here


Solution

  • The beauty of Xarray is that you don't need to do all this extra IO to make this problem work. Just use xarray's selection logic then convert the dataset to a dataframe before writing a multi-column text file.

    ds = xr.open_mfdataset(...)
    
    # select point, select time slice, drop lat/lon variables
    ds_point = ds.isel(lat=153, lon=117).sel(time=slice(None, '2022-12-01')).drop(['lat', 'lon'])
    
    # convert to dataframe
    df_point = ds_point.to_dataframe()
    
    # order columns then write to text file with tab separator
    variables = ['t2mj', 'td2mj', 'u10mj', 'v10mj']
    df_point[variables].to_csv(..., sep='\t')