Getting from a data frame to a phylogenetic tree and back

I would like to use R to manipulate and display phylogenetic trees, using tabular input that stores parent/child relations between taxa.

Suppose I have a data frame of this form:

phylo_df <- data.frame(
  node   = c("t1", "t2", "t3", "t4", "t5", "A", "B", "C"),
  parent = c("A",  "A",  "B",  "C",  "C",  "B", "D", "D"),
  length = c(0.7, 0.08, 0.32, 0.24, 0.07, 0.91, 0.7, 0.95)
)

> phylo_df
  node parent length
1   t1      A   0.70
2   t2      A   0.08
3   t3      B   0.32
4   t4      C   0.24
5   t5      C   0.07
6    A      B   0.91
7    B      D   0.70
8    C      D   0.95

From this, I would like to create a phylogenetic tree, readable to R in Newick and phylo formats. The result is supposed to look like this:

library(ape)
library(ggtree)

newick_tree <- read.tree(text="((t5:0.07,t4:0.24)C:0.95,(t3:0.32,(t2:0.08,t1:0.7)A:0.91)B:0.7)D; ")
    
ggtree(newick_tree)+
      geom_tiplab(color='firebrick')+
      geom_nodelab(aes(x=branch, label=label, subset=nchar(label)>0), geom="label")+
      theme_tree2()

Having this phylogenetic tree (a phylo object of the ape package), I would like to manipulate it manually, e. g. adding an outgroup. Ideally, I would like to do this in the phylo object without having to manually change the Newick file (unlike in the example):

tree2 <- read.tree(text="(((t5:0.07,t4:0.24)C:0.95,(t3:0.32,(t2:0.08,t1:0.7)A:0.91)B:0.7)D:0.5, t6:0.5)E; ")

ggtree(tree2)+
  geom_tiplab(color='firebrick')+
  geom_nodelab(aes(x=branch, label=label, subset=nchar(label)>0), geom="label")+
  theme_tree2()

After the tree has been changed, I would like to convert it back to a tabular format:

> phylo_df
   node parent length
1    t1      A   0.70
2    t2      A   0.08
3    t3      B   0.32
4    t4      C   0.24
5    t5      C   0.07
6     A      B   0.91
7     B      D   0.70
8     C      D   0.95
9     D      E   0.50
10   t6      E   0.50

Basically, what I want is to translate data between a tabular format and common phylogenetic formats without having to manually change Newick files.

Solution

Here is a potential portable solution for transforming a tree into a table and the other way around. These two functions are based on the structure of the edge table that is internally used in ape. You can find more info on the structure of the table here.

## Create the table from any phylo object
phylo.to.table <- function(tree) {
    ## Create the table
    edge_table <- data.frame(tree$edge)
    edge_table <- edge_table[,c(2,1)]
    colnames(edge_table) <- c("node", "parent")
    ## Add the edge.length
    if(!is.null(tree$edge.length)) {
        edge_table <- cbind(edge_table, tree$edge.length)
        colnames(edge_table)[3] <- c("length")

    }
    ## Convert the names
    if(!is.null(tree$tip.label)) {
        tip_matches <- which(edge_table[, "node"] %in% 1:length(tree$tip.label))
        edge_table[tip_matches, "node"] <- tree$tip.label[edge_table[tip_matches, "node"]]
    }
    if(!is.null(tree$node.label)) {
        ## Node (i.e. descendants)
        nodes <- as.integer(edge_table[, "node"])
        nodes[is.na(nodes)] <- 0
        node_matches <- which(nodes %in% (1:length(tree$node.label) + Ntip(tree)) )
        edge_table[node_matches, "node"] <- tree$node.label[nodes[node_matches]-Ntip(tree)]
        ## Parents (i.e. ancestors)
        parents <- as.integer(edge_table[, "parent"])
        node_matches <- which(parents %in% (1:length(tree$node.label) + Ntip(tree)) )
        edge_table[node_matches, "parent"] <- tree$node.label[parents[node_matches]-Ntip(tree)]
    }
    return(edge_table)
}

## Creates a phylo object from a table
table.to.phylo <- function(table) {
    ## Create the edge table
    edge <- table[, c("parent", "node")]
    ## Get the edge lengths
    if(any("length" %in% colnames(table))) {
        edge.length <- table[, "length"]
    } else {
        edge.length <- NULL
    }

    ## Get the number of nodes (half the number of edges for a bifurcating tree)
    Nnode <- (nrow(edge)/2)

    ## Convert the edge table into integers
    ## Detecting each elements
    tips  <- edge[(!edge[,2] %in% edge[,1]), 2] # tips are in the node column only
    nodes <- edge[(edge[,2] %in% edge[,1]), 2] # nodes are in both columns
    root  <- unique(edge[!(edge[,1] %in% edge[, 2]), 1]) # root is only in the parent column
    ## Get the elements integer value
    root_int <- length(tips)+1
    nodes_int <- paste0("node_temp_", 1:Nnode + root_int)
    tips_int <- 1:length(tips)

    ## Update the root as integer
    edge[,1] <- gsub(root, root_int, edge[,1])
    ## Update the nodes as integers
    recursive.gsub <- function(x, y, vector) {
        while(length(x) > 0) {
            vector <- gsub(x[1], y[1], vector)
            x <- x[-1]
            y <- y[-1]
        }
        return(vector)
    }
    edge <- apply(edge, 2, function(x, nodes, nodes_int) recursive.gsub(nodes, nodes_int, x), nodes, nodes_int)
    ## Update the tips as integers
    edge[,2] <- recursive.gsub(tips, tips_int, edge[,2])
    ## Recursively reorder the nodes (root must connect to root+1 and root+2, etc...)
    first_node <- max_node <- root_int
    while(length(grep("node_temp_", edge)) > 0) {
        ## Name the new nodes
        new_node_id <- 1:2 + max_node
        ## Find the descendants of the first node
        to_replace <- edge[grep(first_node, edge[,1]),2]
        ## Check if needs replacement
        needs_replacement <- grep("node_temp_", to_replace)
        if(length(needs_replacement) > 0) {
            to_replace <- to_replace[needs_replacement]
            new_node_id <- new_node_id[1:length(to_replace)]
            ## Replace in the table
            edge <- apply(edge, 2, function(x, to_replace, new_node_id) recursive.gsub(to_replace, new_node_id, x), to_replace, new_node_id)
        }
        ## Update next node to check
        first_node <- new_node_id[1]
        max_node <- max(new_node_id)
    }
    
    ## Get the tip labels
    tip.label <- tips
    node.label <- c(root,nodes)

    ## Format it as an integer matrix
    edge <- apply(edge, 2, as.integer)
    colnames(edge) <- NULL
    ## Sort them by root for convenience
    edge <- edge[order(edge[, 1]),]

    ## Make the list
    tree <- list(edge = edge, tip.label = tip.label, Nnode = Nnode, edge.length = edge.length, node.label = node.label)
    class(tree) <- "phylo"
    return(tree)
}

And here's an example of using these functions to toggle between trees and tables

## Using your example data.frame
phylo_df <- data.frame(
  node   = c("t1", "t2", "t3", "t4", "t5", "A", "B", "C"),
  parent = c("A",  "A",  "B",  "C",  "C",  "B", "D", "D"),
  length = c(0.7, 0.08, 0.32, 0.24, 0.07, 0.91, 0.7, 0.95)
)

## Creating a phylo object
(my_tree <- table.to.phylo(phylo_df))
# 
# Phylogenetic tree with 5 tips and 4 internal nodes.
# 
# Tip labels:
#   t1, t2, t3, t4, t5
# Node labels:
#   D, A, B, C
# 
# Rooted; includes branch lengths.

## Creating the table (you can ignore the coercion warning)
(my_table <- phylo.to.table(my_tree))
#   node parent length
# 1    A      D   0.70
# 2    B      D   0.08
# 3   t3      A   0.32
# 4    C      A   0.24
# 5   t4      B   0.07
# 6   t5      B   0.91
# 7   t1      C   0.70
# 8   t2      C   0.95

Note that the order is a bit different as in the input table to follow the standardised order from the ape edge tables.

Then, for the part going from phylo to newick, you can just use these two wrapper functions for ape::write.tree and ape::read.tree.

## Two other functions for switching from newick to phylo
phylo.to.newick <- function(tree) {
    write.tree(tree)
}
newick.to.phylo <- function(string) {
    read.tree(test = string)
}