Search code examples
rgeospatialspatialspdep

Map of bivariate spatial correlation in R (bivariate LISA)


I would like to create a map showing the bi-variate spatial correlation between two variables. This could be done either by doing a LISA map of bivariate Moran's I spatial correlation or using the L index proposed by Lee (2001).

The bi-variate Moran's I is not implemented in the spdep library, but the L index is, so here is what I've tried without success using the L index. A answer showing a solution based on Moran's I would also be very welcomed !

As you can see from the reproducible example below, I've manged so far to calculate the local L indexes. What I would like to do is to estimate the pseudo p-values and create a map of the results like those maps we use in LISA spatial clusters with high-high, high-low, ..., low-low.

In this example, the goal is to create a map with bi-variate Lisa association between black and white population. The map should be created in ggplot2 , showing the clusters:

  • High-presence of black and High-presence of white people
  • High-presence of black and Low-presence of white people
  • Low-presence of black and High-presence of white people
  • Low-presence of black and Low-presence oh white people

Reproducible example

library(UScensus2000tract)
library(ggplot2)
library(spdep)
library(sf)

# load data
  data("oregon.tract")

# plot Census Tract map
  plot(oregon.tract)


# Variables to use in the correlation: white and black population in each census track
  x <- scale(oregon.tract$white)
  y <- scale(oregon.tract$black)


# create  Queen contiguity matrix and Spatial weights matrix
  nb <- poly2nb(oregon.tract)
  lw <- nb2listw(nb)


# Lee index
  Lxy <-lee(x, y, lw, length(x), zero.policy=TRUE)

  # Lee’s L statistic (Global)
    Lxy[1]
    #> -0.1865688811


# 10k permutations to estimate pseudo p-values
  LMCxy <- lee.mc(x, y, nsim=10000, lw, zero.policy=TRUE, alternative="less")

# quik plot of local L
  Lxy[[2]] %>% density() %>% plot() # Lee’s local L statistic  (Local)
  LMCxy[[7]] %>% density() %>% lines(col="red") # plot values simulated 10k times 


# get confidence interval of 95% ( mean +- 2 standard deviations)
  two_sd_above <- mean(LMCxy[[7]]) + 2 * sd(LMCxy[[7]])
  two_sd_below <- mean(LMCxy[[7]]) - 2 * sd(LMCxy[[7]])


# convert spatial object to sf class for easier/faster use
  oregon_sf <- st_as_sf(oregon.tract)


# add L index values to map object
  oregon_sf$Lindex <- Lxy[[2]]

# identify significant local results  
  oregon_sf$sig <- if_else( oregon_sf$Lindex < 2*two_sd_below, 1, if_else( oregon_sf$Lindex > 2*two_sd_above, 1, 0))


# Map of Local L index but only the significant results
  ggplot() + geom_sf(data=oregon_sf, aes(fill=ifelse( sig==T, Lindex, NA)), color=NA)

enter image description here


Solution

  • What about this?

    I'm using the regular Moran's I instead of that Lee Index you suggest. But I think the underlying reasoning is pretty much the same.

    As you may see bellow -- the results produced this way look very much alike those comming from GeoDA

    library(dplyr)
    library(ggplot2)
    library(sf)
    library(spdep)
    library(rgdal)
    library(stringr)
    library(UScensus2000tract)
    
    #======================================================
    # load data
    data("oregon.tract")
    
    # Variables to use in the correlation: white and black population in each census track
    x <- oregon.tract$white
    y <- oregon.tract$black
    
    #======================================================
    # Programming some functions
    
    # Bivariate Moran's I
    moran_I <- function(x, y = NULL, W){
            if(is.null(y)) y = x
    
            xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
            yp <- (y - mean(y, na.rm=T))/sd(y, na.rm=T)
            W[which(is.na(W))] <- 0
            n <- nrow(W)
    
            global <- (xp%*%W%*%yp)/(n - 1)
            local  <- (xp*W%*%yp)
    
            list(global = global, local  = as.numeric(local))
    }
    
    
    # Permutations for the Bivariate Moran's I
    simula_moran <- function(x, y = NULL, W, nsims = 1000){
    
            if(is.null(y)) y = x
    
            n   = nrow(W)
            IDs = 1:n
    
            xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
            W[which(is.na(W))] <- 0
    
            global_sims = NULL
            local_sims  = matrix(NA, nrow = n, ncol=nsims)
    
            ID_sample = sample(IDs, size = n*nsims, replace = T)
    
            y_s = y[ID_sample]
            y_s = matrix(y_s, nrow = n, ncol = nsims)
            y_s <- (y_s - apply(y_s, 1, mean))/apply(y_s, 1, sd)
    
            global_sims  <- as.numeric( (xp%*%W%*%y_s)/(n - 1) )
            local_sims  <- (xp*W%*%y_s)
    
            list(global_sims = global_sims,
                 local_sims  = local_sims)
    }
    
    
    #======================================================
    # Adjacency Matrix (Queen)
    
    nb <- poly2nb(oregon.tract)
    lw <- nb2listw(nb, style = "B", zero.policy = T)
    W  <- as(lw, "symmetricMatrix")
    W  <- as.matrix(W/rowSums(W))
    W[which(is.na(W))] <- 0
    
    #======================================================
    # Calculating the index and its simulated distribution
    # for global and local values
    
    m   <- moran_I(x, y, W)
    m[[1]] # global value
    
    m_i <- m[[2]]  # local values
    
    local_sims <- simula_moran(x, y, W)$local_sims
    
    # Identifying the significant values 
    alpha <- .05  # for a 95% confidence interval
    probs <- c(alpha/2, 1-alpha/2)
    intervals <- t( apply(local_sims, 1, function(x) quantile(x, probs=probs)))
    sig        <- ( m_i < intervals[,1] )  | ( m_i > intervals[,2] )
    
    #======================================================
    # Preparing for plotting
    
    oregon.tract     <- st_as_sf(oregon.tract)
    oregon.tract$sig <- sig
    
    
    # Identifying the LISA patterns
    xp <- (x-mean(x))/sd(x)
    yp <- (y-mean(y))/sd(y)
    
    patterns <- as.character( interaction(xp > 0, W%*%yp > 0) ) 
    patterns <- patterns %>% 
            str_replace_all("TRUE","High") %>% 
            str_replace_all("FALSE","Low")
    patterns[oregon.tract$sig==0] <- "Not significant"
    oregon.tract$patterns <- patterns
    
    
    # Plotting
    ggplot() + geom_sf(data=oregon.tract, aes(fill=patterns), color="NA") +
            scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) + 
            theme_minimal()
    

    You can get results closer (but not identical) to that of GeoDa by making changes in the confidence interval (e.g. using 90% instead of 95%).

    I suppose the remaining discrepancies come from a slightly different method of calculating the Moran's I. My version gives the same values of that function moran available in the package spdep. But GeoDa probably uses another approach.

    GeoDA enter image description here