Search code examples
pythonnetcdfpython-xarray

Dropping nan values from an xarray dataset not working, possibly due to smoothing with a rolling mean


A lot of text, but mostly it is self contained examples. Any help is appreciated.

I have an xarray dataset with Range and time coordinates, and for each time I want to find the Range where the backscatter gradient is the minimum. However, I am running into the ValueError: All-NaN slice encountered, I think this might be because I am smoothing my data first with a rolling mean, but I am not certain. My approach is as follows:

  1. create a new data variable for the gradient of backscatter using da.differentiate(coord = 'Range')
  2. Get the locations of the minimums in this gradient using min_height = ds['bs_grad'].argmin(dim='Range').values, and min_height = ds['Range'].isel(Range=min_height). This throws the error. To address this I tried da.dropna() along both time and Range coordinates individually with no success. Here are two examples, the first one works, the second one replicates the problem I am encountering in my actual data set.

The approach works fine on a simple xarray, as seen below.

import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

# create a sample DataArray
ds = xr.DataArray([[7, 6, 8, np.nan, 9],
                [9, 6, 2, np.nan, 4],
                [3, 4, 1, np.nan, 1]],
    dims=['Range', 'time'],
    coords={
        'Range': [10, 20, 30],
        'time': ['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04', '2022-01-05']
    }
)


plt.pcolormesh(ds['time'], ds['Range'], ds, shading = 'auto')
plt.colorbar()
plt.gcf().autofmt_xdate()
plt.title('xarray')
plt.show()

ds['bs_grad'] = ds.differentiate(coord = 'Range')

plt.pcolormesh(ds['time'], ds['Range'], ds['bs_grad'], shading = 'auto')
plt.colorbar()
plt.gcf().autofmt_xdate()
plt.title('gradient')
plt.show()


ds = ds.dropna(dim = 'time', how = 'all')  
ds = ds.dropna(dim = 'Range', how = 'all')  

plt.pcolormesh(ds['time'], ds['Range'], ds['bs_grad'], shading = 'auto')
plt.colorbar()
plt.gcf().autofmt_xdate()
plt.title('grad, nan dropped')
plt.show()

## find the Range of the minimum backscatter for each time step
min_Range = ds['bs_grad'].argmin(dim='Range').values  # get the index of the minimum backscatter
min_Range = ds['Range'].isel(Range=min_Range)  # extract the corresponding Range


plt.scatter(ds['time'], min_Range),
plt.gcf().autofmt_xdate()
plt.show()

However, the same approach fails for the example below.

import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

min_date = "2023-03-01"
max_date = "2023-04-20"
dates = pd.date_range(min_date, max_date)

# Define the dimensions
Range = np.arange(0, 1000, 50)
time = dates

# Create the data array with variables for backscatter, temperature, and humidity
data = xr.DataArray(
    np.random.rand(len(Range), len(time)),  # random data for demonstration purposes
    dims=("Range", "time"),
    coords={"Range": Range, "time": time},
    attrs={"long_name": "example data array"},
)

backscatter = xr.DataArray(
    np.random.rand(len(Range), len(time)),  # random data for demonstration purposes
    dims=("Range", "time"),
    coords={"Range": Range, "time": time},
    attrs={"long_name": "backscatter", "units": "dB"},
)

temperature = xr.DataArray(
    np.random.rand(len(Range), len(time))*20,  # random data for demonstration purposes
    dims=("Range", "time"),
    coords={"Range": Range, "time": time},
    attrs={"long_name": "temperature", "units": "K"},
)

humidity = xr.DataArray(
    np.random.rand(len(Range), len(time))*100,  # random data for demonstration purposes
    dims=("Range", "time"),
    coords={"Range": Range, "time": time},
    attrs={"long_name": "humidity", "units": "%"},
)

# Combine the data arrays into a single xarray dataset
ds = xr.Dataset(
    {"data": data, "backscatter": backscatter, "temperature": temperature, "humidity": humidity}
)

plt.pcolormesh(ds['time'], ds['Range'], ds['backscatter'], shading = 'auto', vmin = 0)
plt.colorbar()
plt.gcf().autofmt_xdate()
plt.xlabel('time')
plt.ylabel('Range')
plt.show()

## Smooth the data with a rolling mean 
ds['backscatter'] = ds['backscatter'].rolling(
    Range = 5, center=True).mean().rolling(
    time = 5, center=True).mean()

plt.pcolormesh(ds['time'], ds['Range'], ds['backscatter'], shading = 'auto', vmin = 0)
plt.colorbar()
plt.gcf().autofmt_xdate()
plt.xlabel('time')
plt.ylabel('Range')
plt.show()

## Artificially add some nan columns, rows, and min gradient
## Set a clear min gradient
ds = ds.where(ds['Range'] != 800, other= -20000)


## Set values equal to nan to siumulate problem I'm running into
ds = ds.where(ds['time'] != pd.to_datetime('2023-04-01'), np.nan)
ds = ds.where(ds['time'] != pd.to_datetime('2023-04-03'), np.nan)
ds = ds.where(ds['time'] != pd.to_datetime('2023-03-12'), np.nan)

ds = ds.where(ds['Range'] != 550, np.nan)
ds = ds.where(ds['Range'] != 300, np.nan)

plt.pcolormesh(ds['time'], ds['Range'], ds['backscatter'], shading = 'auto', vmin = 0)
plt.colorbar()
plt.gcf().autofmt_xdate()
plt.xlabel('time')
plt.ylabel('Range')
plt.show()

## Calculate the gradient
ds['bs_grad'] = ds['backscatter'].differentiate(coord = 'Range')

plt.pcolormesh(ds['time'], ds['Range'], ds['bs_grad'], shading = 'auto')
plt.colorbar()
plt.gcf().autofmt_xdate()
plt.xlabel('time')
plt.ylabel('Range')
plt.show()

## Drop nan values
ds = ds.dropna(dim = 'time', how = 'all') ## Works
ds = ds.dropna(dim = 'Range', how = 'all') ## Does nothing ? 

plt.pcolormesh(ds['time'], ds['Range'], ds['bs_grad'], shading = 'auto')
plt.colorbar()
plt.gcf().autofmt_xdate()
plt.xlabel('time')
plt.ylabel('Range')
plt.show()

## Try to find location of the minimum values as in example 1.
min_height = ds['bs_grad'].argmin(dim='Range').values  # get the index of the minimum backscatter
display(min_height)
min_height = ds['Range'].isel(Range=min_height)  # extract the corresponding height
display(min_height)

plt.scatter(ds['time'], min_height),
plt.gcf().autofmt_xdate()
plt.show()

Thanks, and have a nice day.


Solution

  • You can take the argmin of an array with nans by first filling nans with a dummy value (e.g. np.inf):

    da.fillna(np.inf).argmin(dim=dimname)
    

    The result will return the first value for any slices with all nans, so you may want to mask the result to exclude these values:

    mins = da.fillna(np.inf).argmin(dim=dimname)
    mins = mins.where(da.notnull().any(dim=dimname))