Identify upstream reaches based on individual codes

I’m using the global river network delineation derived from HydroSHEDS. In this data set each river has it’s individual 8 digits code (HYRIV_ID). The dataset comprehends some additional fields, but for this task I will only rely on NEXT_DOWN, which gives us the code for the next downstream river.

To recreate the data set in the figure below, I have

a <- c(41397427, 41397438, 41397439, 41397440, 41397466, 41397467, 41397494, 41397495, 41397512, 41397513, 41397514, 41397566, 41397584)
b <- c(41397438, 41397494, 41397466, 41397439, 41397438, 41397440, 41397401, 41397440, 41397466, 41397439, 41397495, 41397495, 41397494)

# Data frame of individual codes for reach id (HYRIV_ID) and the next downstream reach (NEXT_DOWN)
df <- data.frame(HYRIV_ID = a, NEXT_DOWN = b)

# Point A and B HYRIV_ID
id <- c(41397494, 41397495)

Now, I need to identify all upstream rivers from points A and B using the HYRIV_ID in vector id. Because the real data has more points, I created a loop to go through all the codes stored in id.

Bellow you can find my attempt,

# NOTE: Although the loop will run, the output of i = 1 (point A) is not correct, as it's only identifying the first upstream reaches 

pt_up <- list() # create a list to store the upstream HYRIV_ID for each target code in id
for (i in 1:length(id)){
  n <- id[[i]] # creates vector with target HYRIV_ID, where all upstream reaches will also be stored
  tmp_id <- df[df$NEXT_DOWN == id[[i]],]$HYRIV_ID # identifies reaches which have n as NEXT_DOWN reach

  # if the target reach is the most upstream, then it can't be downstream of any other
  if(length(tmp_id) == 0){
    n <- n
    # if not, I need to find the next upstream reaches
    }else{
      # If tmp_id is higher than 1 I need to find the upstream reaches of each HYRIV_ID in tmp_id
      # I use a second loop go through codes stored in tmp_id 
     for (j in 1:length(tmp_id)){
       tmp_id2 <- df[df$NEXT_DOWN == tmp_id[[j]],]$HYRIV_ID
       if(length(tmp_id2) == 0){
         n <- append(n, tmp_id[[j]]) # if tmp_id2 as no upstream reaches, then I add the codes in tmp_id to n, as this will be the only upstream reaches
       }else{
         n <- append(n, tmp_id2) 
       }
     }
    }
  pt_up[[i]] <- n # add reaches HYRIV_ID upstream of target reach to the list 
  names(pt_up)[i] <- id[[i]] # name list element to keep track of target reach
  
  }

This works fine for point B (i = 2), where both upstream reaches represent end points (in red and purple in the figure bellow).

However, for point A, where I have multiple ramifications (colored examples in next figure are not exhaustive) I would need to repeat the process in the second loop for each new reach. I though this might be accomplished by placing the second loop inside a repeat function, but I'm not sure how.

EDIT

I ended up with a solution using a repeat loop, but I'm sure there's a cleaner way to do the trick!

pt_up <- list() # create a list to store the upstream HYRIV_ID for each target code in id
for (i in 1:length(id)){
  n <- id[[i]] # creates vector with target HYRIV_ID, where all upstream reaches will also be stored
  tmp_id <- df[df$NEXT_DOWN == id[[i]],]$HYRIV_ID # identifies reaches which have n as NEXT_DOWN reach
  
  # if the target reach is the most upstream, then it can't be downstream of any other
  if(length(tmp_id) == 0){
    n <- n
    # if not, I need to find the next upstream reaches
  }else{
    # If tmp_id is higher than 1 I need to find the upstream reaches of each HYRIV_ID in tmp_id
    # I use a second loop go through codes stored in tmp_id 
    n <- append(n, tmp_id) # adds the HYRIV_ID of the first upstream reaches
    tmp_id2 <- tmp_id # create a new vector that will be updated within the loop, without overwriting
    
    ####### NEW SECTION ADDED HERE ######
    repeat{ # this is where I added the repeat loop    
      for (j in 1:length(tmp_id2)){ # loops through all HYRIV_IDs stored in tmp_2 and repeats after tmp_id2 is updated
      if(length(df[df$NEXT_DOWN == tmp_id2[[j]],]$HYRIV_ID) == 0){
        n <- n # if tmp_id2 has no upstream reaches, then I add the codes in tmp_id to n, as this will be the only upstream reaches
      }else{
        id2 <- df[df$NEXT_DOWN == tmp_id2[[j]],]$HYRIV_ID # HYRIV_ID upstream of codes stored in tmp_id2
        n <- append(n, id2) # add codes to n
      }
      }
      tmp_id2 <- id2 # updated tmp_id2 with the new IDs so to repeat the loop
      if (length(df[df$NEXT_DOWN %in% tmp_id2,]$HYRIV_ID) == 0 & unique(tmp_id2 %in% n) == TRUE){
        break 
      } # this will stop the repeat loop when there are no upstream reaches left 
    }
    ####### END OF NEW SECTION ######

  pt_up[[i]] <- n # add reaches HYRIV_ID upstream of target reach to the list 
  names(pt_up)[i] <- id[[i]] # name list element to keep track of target reach
  }
}

Solution

Your data model of upstream and downstream river segments is a directed acyclic graph, where nodes (vertices) of that graph represent river segments and an edge (connection) between a node represents an upstream -> downstream relationship.

You can use the R package igraph to represent your data and compute on it. Given a directed acyclic graph, how do I determine all of the ancestors of a given node?

library(igraph)
a <- c(41397427, 41397438, 41397439, 41397440, 41397466, 41397467, 41397494, 41397495, 41397512, 41397513, 41397514, 41397566, 41397584)
b <- c(41397438, 41397494, 41397466, 41397439, 41397438, 41397440, 41397401, 41397440, 41397466, 41397439, 41397495, 41397495, 41397494)
df <- data.frame(HYRIV_ID = a, NEXT_DOWN = b)
xx <- graph_from_data_frame(df)

Now use the subcomponent function to find all nodes reachable from a specified vertex, that are going "into" that node (i.e., the ancestor chain):

# Point A
subcomponent(xx, "41397494", "in")
## + 13/14 vertices, named, from a2cc5ff:
## [1] 41397494 41397438 41397584 41397427 41397466 41397439 41397512 41397440
## [9] 41397513 41397467 41397495 41397514 41397566

# Point B
subcomponent(xx, "41397495", "in")
# + 3/14 vertices, named, from a2cc5ff:
# [1] 41397495 41397514 41397566

You can easily exclude the node of interest now if desired.