I am working on a problem where I need to crop and mask a rasterstack to a series of shapefiles. Here is a reproducible example:
# Example data ------------------------------------------------------------
#create example raster stack
r1 = raster(nrows=1000,ncol=1000,xmn=60,xmx=90,ymn=0,ymx=25)
rr = lapply(1:10, function(i) setValues(r1,runif(ncell(r1))))
rrstack=stack()
for (i in 1:length(rr)){
stacknext=rr[[i]]
rrstack=stack(rrstack,stacknext)
}
#create example shapefile list
lats=runif(10,min=0,max=25)
lons=runif(10,min=60,max=90)
exnames=paste0("city_",letters[1:10])
coords=data.frame(names=exnames,lats=lats,lons=lons)
coords_sf = st_as_sf(coords,coords=c("lons","lats"),crs=4326,dim ="XY")
circle1=st_buffer(coords_sf, 1E3)
circle100=st_buffer(coords_sf,1E5)
circle500=st_buffer(coords_sf,5E5)
circlist=list(circle1=circle1,circle100=circle100,circle500=circle500)
circlist_reproj=lapply(circlist,function(x) st_transform(x,crs(rrstack[[1]])))
I developed a series of nested for loops to crop and mask the rasterstack to the shapefiles:
start <- proc.time()
citlist <- vector(mode='list',length=nrow(circlist_reproj[[1]]))
dellist <- vector(mode='list',length=length(circlist_reproj))
mystack <- stack()
for(k in 1:nrow(circlist_reproj[[1]])) {
for(j in 1:length(circlist_reproj)) {
for (i in 1:nlayers(rrstack)){
maskraster <- raster::mask(rrstack[[i]],circlist_reproj[[j]][k,])
maskraster <- raster::crop(maskraster,circlist_reproj[[j]][k,])
mystack <- stack(mystack,maskraster)
}
dellist[[j]] <- mystack
mystack <- stack()
}
citlist[[k]] <- dellist
dellist <- vector(mode='list',length=length(circlist_reproj))
}
basetime <- proc.time()-start
For my actual dataset, this takes days to complete. I developed a parallel process to speed up the above code
cores <- detectCores()
cl <- makeCluster(cores[1]-2) #not to overload your computer
registerDoParallel(cl)
parstart=proc.time()
citlist <- vector(mode='list',length=nrow(circlist_reproj[[1]]))
dellist <- vector(mode='list',length=length(circlist_reproj))
for(k in 1:nrow(circlist_reproj[[1]])) {
for(j in 1:length(circlist_reproj)) {
parrasterstack <- foreach(i=1:nlayers(rrstack),.packages=c('raster','sf')) %dopar% {
maskraster <- raster::mask(rrstack[[i]],circlist_reproj[[j]][k,])
raster::crop(maskraster,circlist_reproj[[j]][k,])
}
parrasterstack <- stack(parrasterstack)
dellist[[j]] <- parrasterstack
parrasterstack <- NULL
}
citlist[[k]] <- dellist
dellist <- vector(mode='list',length=length(circlist_reproj))
}
stopCluster(cl)
Running both methods of processing, I discovered that the sequential process is faster than the parallel process.
> basetime #sequential process
user system elapsed
19.858 4.124 24.283
> partime #parallel process
user system elapsed
41.415 10.153 132.262
I repeated the code on a different machine with more cores and got a wider difference. Why is the parallel process slower in all aspects, and is there a way to speed up the processing?
With your sequential approach I get
proc.time()-start
# user system elapsed
# 18.86 2.57 21.44
Cutting an unnecessary loop, and using crop before mask makes this go about 8x faster
start <- proc.time()
citlist <- vector(mode='list',length=nrow(circlist_reproj[[1]]))
for(k in 1:nrow(circlist_reproj[[1]])) {
for(j in 1:length(circlist_reproj)) {
r <- crop(rrstack, circlist_reproj[[j]][k,])
citlist[[k]][[j]] <- mask(r, circlist_reproj[[j]][k,])
}
}
proc.time()-start
# user system elapsed
# 2.41 0.30 2.70
And terra
is 3x faster than raster
library(terra)
rrstack <- rast(rrstack)
circlist_reproj <- lapply(circlist_reproj, vect)
start <- proc.time()
citlist <- vector(mode='list',length=nrow(circlist_reproj[[1]]))
for(k in 1:nrow(circlist_reproj[[1]])) {
for(j in 1:length(circlist_reproj)) {
r <- crop(rrstack, circlist_reproj[[j]][k,])
citlist[[k]][[j]] <- mask(r, circlist_reproj[[j]][k,])
}
}
proc.time()-start
# user system elapsed
# 0.70 0.19 0.89
So that is 24x faster.
Also, the loop should really be written like this to be safe:
start <- proc.time()
citlist <- vector(mode='list',length=length(circlist_reproj))
for(i in 1:length(circlist_reproj)) {
for(j in 1:nrow(circlist_reproj[[i]])) {
r <- crop(rrstack, circlist_reproj[[i]][j,])
citlist[[i]][[j]] <- mask(r, circlist_reproj[[i]][j,])
}
}
proc.time()-start
And with terra you can do crop and mask in one step, and using that may be as much as 35x faster.
start <- proc.time()
citlist <- vector(mode='list',length=length(circlist_reproj))
for(i in 1:length(circlist_reproj)) {
for(j in 1:nrow(circlist_reproj[[i]])) {
citlist[[i]][[j]] <- crop(rrstack, circlist_reproj[[i]][j,], mask=TRUE)
}
}
proc.time()-start
# user system elapsed
# 0.46 0.11 0.56
You can now again see if the parallelization can help if that is still necessary (it is rarely worth it for this type of problem). Perhaps do that for the outer-most loop, to keep the overhead limited. But note that while you can do that with raster
, but you cannot do that with terra
(that will eventually be available "under the hood").
Finally, you provide no context, but what you are doing looks odd. Do you want to use extract
instead? You could do, for example:
e <- lapply(circlist_reproj, function(v) extract(rrstack, v))
That is not the fastest method, but perhaps the clearest code.