I would like to use R to manipulate and display phylogenetic trees, using tabular input that stores parent/child relations between taxa.
Suppose I have a data frame of this form:
phylo_df <- data.frame(
node = c("t1", "t2", "t3", "t4", "t5", "A", "B", "C"),
parent = c("A", "A", "B", "C", "C", "B", "D", "D"),
length = c(0.7, 0.08, 0.32, 0.24, 0.07, 0.91, 0.7, 0.95)
)
> phylo_df
node parent length
1 t1 A 0.70
2 t2 A 0.08
3 t3 B 0.32
4 t4 C 0.24
5 t5 C 0.07
6 A B 0.91
7 B D 0.70
8 C D 0.95
From this, I would like to create a phylogenetic tree, readable to R in Newick and phylo formats. The result is supposed to look like this:
library(ape)
library(ggtree)
newick_tree <- read.tree(text="((t5:0.07,t4:0.24)C:0.95,(t3:0.32,(t2:0.08,t1:0.7)A:0.91)B:0.7)D; ")
ggtree(newick_tree)+
geom_tiplab(color='firebrick')+
geom_nodelab(aes(x=branch, label=label, subset=nchar(label)>0), geom="label")+
theme_tree2()
Having this phylogenetic tree (a phylo object of the ape
package), I would like to manipulate it manually, e. g. adding an outgroup. Ideally, I would like to do this in the phylo object without having to manually change the Newick file (unlike in the example):
tree2 <- read.tree(text="(((t5:0.07,t4:0.24)C:0.95,(t3:0.32,(t2:0.08,t1:0.7)A:0.91)B:0.7)D:0.5, t6:0.5)E; ")
ggtree(tree2)+
geom_tiplab(color='firebrick')+
geom_nodelab(aes(x=branch, label=label, subset=nchar(label)>0), geom="label")+
theme_tree2()
After the tree has been changed, I would like to convert it back to a tabular format:
> phylo_df
node parent length
1 t1 A 0.70
2 t2 A 0.08
3 t3 B 0.32
4 t4 C 0.24
5 t5 C 0.07
6 A B 0.91
7 B D 0.70
8 C D 0.95
9 D E 0.50
10 t6 E 0.50
Basically, what I want is to translate data between a tabular format and common phylogenetic formats without having to manually change Newick files.
Here is a potential portable solution for transforming a tree into a table and the other way around.
These two functions are based on the structure of the edge
table that is internally used in ape
.
You can find more info on the structure of the table here.
## Create the table from any phylo object
phylo.to.table <- function(tree) {
## Create the table
edge_table <- data.frame(tree$edge)
edge_table <- edge_table[,c(2,1)]
colnames(edge_table) <- c("node", "parent")
## Add the edge.length
if(!is.null(tree$edge.length)) {
edge_table <- cbind(edge_table, tree$edge.length)
colnames(edge_table)[3] <- c("length")
}
## Convert the names
if(!is.null(tree$tip.label)) {
tip_matches <- which(edge_table[, "node"] %in% 1:length(tree$tip.label))
edge_table[tip_matches, "node"] <- tree$tip.label[edge_table[tip_matches, "node"]]
}
if(!is.null(tree$node.label)) {
## Node (i.e. descendants)
nodes <- as.integer(edge_table[, "node"])
nodes[is.na(nodes)] <- 0
node_matches <- which(nodes %in% (1:length(tree$node.label) + Ntip(tree)) )
edge_table[node_matches, "node"] <- tree$node.label[nodes[node_matches]-Ntip(tree)]
## Parents (i.e. ancestors)
parents <- as.integer(edge_table[, "parent"])
node_matches <- which(parents %in% (1:length(tree$node.label) + Ntip(tree)) )
edge_table[node_matches, "parent"] <- tree$node.label[parents[node_matches]-Ntip(tree)]
}
return(edge_table)
}
## Creates a phylo object from a table
table.to.phylo <- function(table) {
## Create the edge table
edge <- table[, c("parent", "node")]
## Get the edge lengths
if(any("length" %in% colnames(table))) {
edge.length <- table[, "length"]
} else {
edge.length <- NULL
}
## Get the number of nodes (half the number of edges for a bifurcating tree)
Nnode <- (nrow(edge)/2)
## Convert the edge table into integers
## Detecting each elements
tips <- edge[(!edge[,2] %in% edge[,1]), 2] # tips are in the node column only
nodes <- edge[(edge[,2] %in% edge[,1]), 2] # nodes are in both columns
root <- unique(edge[!(edge[,1] %in% edge[, 2]), 1]) # root is only in the parent column
## Get the elements integer value
root_int <- length(tips)+1
nodes_int <- paste0("node_temp_", 1:Nnode + root_int)
tips_int <- 1:length(tips)
## Update the root as integer
edge[,1] <- gsub(root, root_int, edge[,1])
## Update the nodes as integers
recursive.gsub <- function(x, y, vector) {
while(length(x) > 0) {
vector <- gsub(x[1], y[1], vector)
x <- x[-1]
y <- y[-1]
}
return(vector)
}
edge <- apply(edge, 2, function(x, nodes, nodes_int) recursive.gsub(nodes, nodes_int, x), nodes, nodes_int)
## Update the tips as integers
edge[,2] <- recursive.gsub(tips, tips_int, edge[,2])
## Recursively reorder the nodes (root must connect to root+1 and root+2, etc...)
first_node <- max_node <- root_int
while(length(grep("node_temp_", edge)) > 0) {
## Name the new nodes
new_node_id <- 1:2 + max_node
## Find the descendants of the first node
to_replace <- edge[grep(first_node, edge[,1]),2]
## Check if needs replacement
needs_replacement <- grep("node_temp_", to_replace)
if(length(needs_replacement) > 0) {
to_replace <- to_replace[needs_replacement]
new_node_id <- new_node_id[1:length(to_replace)]
## Replace in the table
edge <- apply(edge, 2, function(x, to_replace, new_node_id) recursive.gsub(to_replace, new_node_id, x), to_replace, new_node_id)
}
## Update next node to check
first_node <- new_node_id[1]
max_node <- max(new_node_id)
}
## Get the tip labels
tip.label <- tips
node.label <- c(root,nodes)
## Format it as an integer matrix
edge <- apply(edge, 2, as.integer)
colnames(edge) <- NULL
## Sort them by root for convenience
edge <- edge[order(edge[, 1]),]
## Make the list
tree <- list(edge = edge, tip.label = tip.label, Nnode = Nnode, edge.length = edge.length, node.label = node.label)
class(tree) <- "phylo"
return(tree)
}
And here's an example of using these functions to toggle between trees and tables
## Using your example data.frame
phylo_df <- data.frame(
node = c("t1", "t2", "t3", "t4", "t5", "A", "B", "C"),
parent = c("A", "A", "B", "C", "C", "B", "D", "D"),
length = c(0.7, 0.08, 0.32, 0.24, 0.07, 0.91, 0.7, 0.95)
)
## Creating a phylo object
(my_tree <- table.to.phylo(phylo_df))
#
# Phylogenetic tree with 5 tips and 4 internal nodes.
#
# Tip labels:
# t1, t2, t3, t4, t5
# Node labels:
# D, A, B, C
#
# Rooted; includes branch lengths.
## Creating the table (you can ignore the coercion warning)
(my_table <- phylo.to.table(my_tree))
# node parent length
# 1 A D 0.70
# 2 B D 0.08
# 3 t3 A 0.32
# 4 C A 0.24
# 5 t4 B 0.07
# 6 t5 B 0.91
# 7 t1 C 0.70
# 8 t2 C 0.95
Note that the order is a bit different as in the input table to follow the standardised order from the ape edge
tables.
Then, for the part going from phylo to newick, you can just use these two wrapper functions for ape::write.tree
and ape::read.tree
.
## Two other functions for switching from newick to phylo
phylo.to.newick <- function(tree) {
write.tree(tree)
}
newick.to.phylo <- function(string) {
read.tree(test = string)
}