Is there a way to vectorize the following three-nested loop that calcuate the daily mean of hourly data? The function below loops first over the year, then months, and finally over days. It also check for the last month and day to ensure that the loop does not go beyond the last month or day of the data.
def hourly2daily(my_var,my_periods):
import pandas as pd
import numpy as np
import sys
print('######### Daily2monthly function ##################')
Frs_year =my_periods[0].year
Frs_month =my_periods[0].month
Frs_day =my_periods[0].day
Frs_hour =my_periods[0].hour
Last_year =my_periods[-1].year
Last_month =my_periods[-1].month
Last_day =my_periods[-1].day
Last_hour =my_periods[-1].hour
print('First year is '+str(Frs_year) +'\n'+\
'First months is '+str(Frs_month)+'\n'+\
'First day is '+str(Frs_day)+'\n'+\
'First hour is '+str(Frs_hour))
print(' ')
print('Last year is '+str(Last_year)+'\n'+\
'Last months is '+str(Last_month)+'\n'+\
'Last day is '+str(Last_day)+'\n'+\
'Last hour is '+str(Last_hour))
Frs = str(Frs_year)+'/'+str(Frs_month)+'/'+str(Frs_day)+' '+str(Frs_hour)+":00"
Lst = str(Last_year)+'/'+str(Last_month)+'/'+str(Last_day)+' '+str(Last_hour)+":00"
my_daily_time=pd.date_range(Frs,Lst,freq='D')
## END of the data_range tricks ###########
nt_days=len(my_daily_time)
nd=np.ndim(my_var)
if (nd == 1): # only time series
var_mean=np.full((nt_days),np.nan)
if (nd == 2): # e.g., time, lat or lon or lev
n1=np.shape(my_var)[1]
var_mean=np.full((nt_days,n1),np.nan)
if (nd == 3): # e.g., time, lat, lon
n1=np.shape(my_var)[1]
n2=np.shape(my_var)[2]
var_mean=np.full((nt_days,n1,n2),np.nan)
if (nd == 4): # e.g., time, lat , lon, lev
n1=np.shape(my_var)[1]
n2=np.shape(my_var)[2]
n3=np.shape(my_var)[3]
var_mean=np.full((nt_days,n1,n2,n3),np.nan)
end_mm=12
k=0
####### loop over years ################
for yy in np.arange(Frs_year,Last_year+1):
print('working on the '+str(yy))
# in case the last month is NOT 12
if (yy == Last_year):
end_mm=Last_month
print('The last month is '+str(end_mm))
## Loop over months ################
for mm in np.arange(1,end_mm+1):
end_day=pd.Period(str(yy)+'-'+str(mm)).days_in_month
# in case the last day is not at the end of the month.
if ((yy == Last_year) & (mm == Last_month)):
end_day=Last_day
#### loop over days ###############
for dd in np.arange(1,end_day+1):
print(str(yy)+'-'+str(mm)+'-'+str(dd))
#list all days of the month and year.
I=np.where((my_periods.year == yy) &\
(my_periods.month == mm) &\
(my_periods.day == dd ))[0]
print(I)
# if there is a discontinuity in time.
# I will be empty and then you have to quit.
# you have first to reindex the data.
if len(I) == 0 :
print('Warning time shift here >>')
print('Check the continuity of your time sequence')
sys.exit()
var_mean[k,...]=np.nanmean(my_var[I,...],0)
k=k+1
return var_mean,my_daily_time
Here is, perhaps, easy and quick way to call this function. Note that you may be asked to install Pooch
import numpy as np
import xarray as xr
x = xr.tutorial.load_dataset("air_temperature")
time = x['time'] # reading the time
period=time.to_index().to_period('h')
bb0,bb1=hourly2daily(x['air'],period)
I am aware that there is another way to implement this; for example, I can do the previous calculation in one single loop as shown below, but it won’t help for data with discontinues in time.
daily_tem2m = np.full((int(len_time/24),len_lat,len_lon),np.nan,float)
counter=0
timemm=[]
for i in np.arange(0,len_time,24):
print(period[i])
timemm.append(period[i])
daily_tem2m[counter,:,:]=np.nanmean(cleaned_tem2m_celsius.data[i:i+24,:,:],0)
counter=counter+1
Per this documentation, to group the data in daily buckets, you can use similar syntax to the pandas option:
import numpy as np
import xarray as xr
x = (
xr
.tutorial
.load_dataset("air_temperature")
.groupby("time.day")
.mean()
)
Since you want two things, one to see if there are any non-contiguous dates and then to do the analysis, here's how that can be accomplished:
import pandas as pd
import xarray as xr
x = (
xr
.tutorial
.load_dataset("air_temperature")
.resample(time='D')
)
idx = pd.Series(x.groups).index
# Check that all of the range is contiguous, raise an error otherwise
# dropna because the first element will always be NaT
if not (idx.diff().dropna() == '1 days').all():
raise ValueError("Got a non-contiguous date range!")
# do the calculation
x.mean()
And to show that the index check will fail on a non-contiguous date range:
import pandas as pd
a = pd.date_range('2023-01-01', periods=5)
b = pd.date_range('2024-01-01', periods=5)
if not (a.union(b).diff().dropna() == '1 days').all():
raise Exception("Broken date range!")
Exception Traceback (most recent call last)
Cell In[8], line 5
2 b = pd.date_range('2024-01-01', periods=5)
4 if not (a.union(b).diff().dropna() == '1 days').all():
----> 5 raise Exception("Broken date range!")
Exception: Broken date range!