Search code examples
rphylogenyapeggtree

Getting from a data frame to a phylogenetic tree and back


I would like to use R to manipulate and display phylogenetic trees, using tabular input that stores parent/child relations between taxa.

Suppose I have a data frame of this form:

phylo_df <- data.frame(
  node   = c("t1", "t2", "t3", "t4", "t5", "A", "B", "C"),
  parent = c("A",  "A",  "B",  "C",  "C",  "B", "D", "D"),
  length = c(0.7, 0.08, 0.32, 0.24, 0.07, 0.91, 0.7, 0.95)
)

> phylo_df
  node parent length
1   t1      A   0.70
2   t2      A   0.08
3   t3      B   0.32
4   t4      C   0.24
5   t5      C   0.07
6    A      B   0.91
7    B      D   0.70
8    C      D   0.95

From this, I would like to create a phylogenetic tree, readable to R in Newick and phylo formats. The result is supposed to look like this:

library(ape)
library(ggtree)

newick_tree <- read.tree(text="((t5:0.07,t4:0.24)C:0.95,(t3:0.32,(t2:0.08,t1:0.7)A:0.91)B:0.7)D; ")
    
ggtree(newick_tree)+
      geom_tiplab(color='firebrick')+
      geom_nodelab(aes(x=branch, label=label, subset=nchar(label)>0), geom="label")+
      theme_tree2()

Phylogenetic tree

Having this phylogenetic tree (a phylo object of the ape package), I would like to manipulate it manually, e. g. adding an outgroup. Ideally, I would like to do this in the phylo object without having to manually change the Newick file (unlike in the example):

tree2 <- read.tree(text="(((t5:0.07,t4:0.24)C:0.95,(t3:0.32,(t2:0.08,t1:0.7)A:0.91)B:0.7)D:0.5, t6:0.5)E; ")

ggtree(tree2)+
  geom_tiplab(color='firebrick')+
  geom_nodelab(aes(x=branch, label=label, subset=nchar(label)>0), geom="label")+
  theme_tree2()

Extended phylogenetic tree

After the tree has been changed, I would like to convert it back to a tabular format:

> phylo_df
   node parent length
1    t1      A   0.70
2    t2      A   0.08
3    t3      B   0.32
4    t4      C   0.24
5    t5      C   0.07
6     A      B   0.91
7     B      D   0.70
8     C      D   0.95
9     D      E   0.50
10   t6      E   0.50

Basically, what I want is to translate data between a tabular format and common phylogenetic formats without having to manually change Newick files.


Solution

  • Here is a potential portable solution for transforming a tree into a table and the other way around. These two functions are based on the structure of the edge table that is internally used in ape. You can find more info on the structure of the table here.

    ## Create the table from any phylo object
    phylo.to.table <- function(tree) {
        ## Create the table
        edge_table <- data.frame(tree$edge)
        edge_table <- edge_table[,c(2,1)]
        colnames(edge_table) <- c("node", "parent")
        ## Add the edge.length
        if(!is.null(tree$edge.length)) {
            edge_table <- cbind(edge_table, tree$edge.length)
            colnames(edge_table)[3] <- c("length")
    
        }
        ## Convert the names
        if(!is.null(tree$tip.label)) {
            tip_matches <- which(edge_table[, "node"] %in% 1:length(tree$tip.label))
            edge_table[tip_matches, "node"] <- tree$tip.label[edge_table[tip_matches, "node"]]
        }
        if(!is.null(tree$node.label)) {
            ## Node (i.e. descendants)
            nodes <- as.integer(edge_table[, "node"])
            nodes[is.na(nodes)] <- 0
            node_matches <- which(nodes %in% (1:length(tree$node.label) + Ntip(tree)) )
            edge_table[node_matches, "node"] <- tree$node.label[nodes[node_matches]-Ntip(tree)]
            ## Parents (i.e. ancestors)
            parents <- as.integer(edge_table[, "parent"])
            node_matches <- which(parents %in% (1:length(tree$node.label) + Ntip(tree)) )
            edge_table[node_matches, "parent"] <- tree$node.label[parents[node_matches]-Ntip(tree)]
        }
        return(edge_table)
    }
    
    ## Creates a phylo object from a table
    table.to.phylo <- function(table) {
        ## Create the edge table
        edge <- table[, c("parent", "node")]
        ## Get the edge lengths
        if(any("length" %in% colnames(table))) {
            edge.length <- table[, "length"]
        } else {
            edge.length <- NULL
        }
    
        ## Get the number of nodes (half the number of edges for a bifurcating tree)
        Nnode <- (nrow(edge)/2)
    
        ## Convert the edge table into integers
        ## Detecting each elements
        tips  <- edge[(!edge[,2] %in% edge[,1]), 2] # tips are in the node column only
        nodes <- edge[(edge[,2] %in% edge[,1]), 2] # nodes are in both columns
        root  <- unique(edge[!(edge[,1] %in% edge[, 2]), 1]) # root is only in the parent column
        ## Get the elements integer value
        root_int <- length(tips)+1
        nodes_int <- paste0("node_temp_", 1:Nnode + root_int)
        tips_int <- 1:length(tips)
    
        ## Update the root as integer
        edge[,1] <- gsub(root, root_int, edge[,1])
        ## Update the nodes as integers
        recursive.gsub <- function(x, y, vector) {
            while(length(x) > 0) {
                vector <- gsub(x[1], y[1], vector)
                x <- x[-1]
                y <- y[-1]
            }
            return(vector)
        }
        edge <- apply(edge, 2, function(x, nodes, nodes_int) recursive.gsub(nodes, nodes_int, x), nodes, nodes_int)
        ## Update the tips as integers
        edge[,2] <- recursive.gsub(tips, tips_int, edge[,2])
        ## Recursively reorder the nodes (root must connect to root+1 and root+2, etc...)
        first_node <- max_node <- root_int
        while(length(grep("node_temp_", edge)) > 0) {
            ## Name the new nodes
            new_node_id <- 1:2 + max_node
            ## Find the descendants of the first node
            to_replace <- edge[grep(first_node, edge[,1]),2]
            ## Check if needs replacement
            needs_replacement <- grep("node_temp_", to_replace)
            if(length(needs_replacement) > 0) {
                to_replace <- to_replace[needs_replacement]
                new_node_id <- new_node_id[1:length(to_replace)]
                ## Replace in the table
                edge <- apply(edge, 2, function(x, to_replace, new_node_id) recursive.gsub(to_replace, new_node_id, x), to_replace, new_node_id)
            }
            ## Update next node to check
            first_node <- new_node_id[1]
            max_node <- max(new_node_id)
        }
        
        ## Get the tip labels
        tip.label <- tips
        node.label <- c(root,nodes)
    
        ## Format it as an integer matrix
        edge <- apply(edge, 2, as.integer)
        colnames(edge) <- NULL
        ## Sort them by root for convenience
        edge <- edge[order(edge[, 1]),]
    
        ## Make the list
        tree <- list(edge = edge, tip.label = tip.label, Nnode = Nnode, edge.length = edge.length, node.label = node.label)
        class(tree) <- "phylo"
        return(tree)
    }
    

    And here's an example of using these functions to toggle between trees and tables

    ## Using your example data.frame
    phylo_df <- data.frame(
      node   = c("t1", "t2", "t3", "t4", "t5", "A", "B", "C"),
      parent = c("A",  "A",  "B",  "C",  "C",  "B", "D", "D"),
      length = c(0.7, 0.08, 0.32, 0.24, 0.07, 0.91, 0.7, 0.95)
    )
    
    ## Creating a phylo object
    (my_tree <- table.to.phylo(phylo_df))
    # 
    # Phylogenetic tree with 5 tips and 4 internal nodes.
    # 
    # Tip labels:
    #   t1, t2, t3, t4, t5
    # Node labels:
    #   D, A, B, C
    # 
    # Rooted; includes branch lengths.
    
    ## Creating the table (you can ignore the coercion warning)
    (my_table <- phylo.to.table(my_tree))
    #   node parent length
    # 1    A      D   0.70
    # 2    B      D   0.08
    # 3   t3      A   0.32
    # 4    C      A   0.24
    # 5   t4      B   0.07
    # 6   t5      B   0.91
    # 7   t1      C   0.70
    # 8   t2      C   0.95
    

    Note that the order is a bit different as in the input table to follow the standardised order from the ape edge tables.

    Then, for the part going from phylo to newick, you can just use these two wrapper functions for ape::write.tree and ape::read.tree.

    ## Two other functions for switching from newick to phylo
    phylo.to.newick <- function(tree) {
        write.tree(tree)
    }
    newick.to.phylo <- function(string) {
        read.tree(test = string)
    }