Search code examples
rtree

Renaming nwk tree tip labels based on character strings and a factors file


I'm looking to rename the tip labels of my nwk tree (colwell_tree) using a taxonomy file (colwell_taxonomy). I tried matching the old tip labels against the sample IDs in the taxonomy file and assigning new labels from the same file, as shown below.

# Attempted rename: select the tip labels that are exactly equal to one of the
# sample.id values (%in% compares whole strings) and overwrite the selected
# labels with the Genus column of the taxonomy table.
colwell_tree$tip.label[colwell_tree$tip.label %in%
                         colwell_taxonomy$sample.id] <- colwell_taxonomy$Genus

However, this does nothing, surprisingly. I suspect it's because the tip name is part of a longer string in the tree, but I don't know how to get around that.

> dput(colwell_taxonomy)
structure(list(sample.id = c("AB109878.1", "AB109879.1", "AB109880.1", 
"AB109881.1", "AB109882.1", "AB109883.1", "AB109884.1", "AB109885.1"
), Kingdom = c("d__Archaea", "d__Archaea", "d__Archaea", "d__Archaea", 
"d__Archaea", "d__Archaea", "d__Archaea", "d__Archaea"), Phylum = c("Crenarchaeota", 
"Crenarchaeota", "Crenarchaeota", "Crenarchaeota", NA, "Crenarchaeota", 
"Crenarchaeota", "Crenarchaeota"), Class = c("Bathyarchaeia", 
"Bathyarchaeia", "Bathyarchaeia", "Bathyarchaeia", NA, "Bathyarchaeia", 
"Bathyarchaeia", "Bathyarchaeia"), Order = c("Bathyarchaeia", 
"Bathyarchaeia", "Bathyarchaeia", "Bathyarchaeia", NA, "Bathyarchaeia", 
"Bathyarchaeia", "Bathyarchaeia"), Family = c("Bathyarchaeia", 
"Bathyarchaeia", "Bathyarchaeia", "Bathyarchaeia", NA, "Bathyarchaeia", 
"Bathyarchaeia", "Bathyarchaeia"), Genus = c("Bathyarchaeia", 
"Bathyarchaeia", "Bathyarchaeia", "Bathyarchaeia", NA, "Bathyarchaeia", 
"Bathyarchaeia", "Bathyarchaeia"), Species = c("uncultured_marine", 
"uncultured_archaeon", "uncultured_archaeon", "uncultured_archaeon", 
NA, "uncultured_archaeon", "uncultured_archaeon", NA)), row.names = c(NA, 
-8L), class = "data.frame")

> dput(colwell_tree)
structure(list(edge = structure(c(9L, 10L, 10L, 9L, 11L, 11L, 
12L, 13L, 13L, 12L, 14L, 14L, 15L, 15L, 10L, 1L, 2L, 11L, 3L, 
12L, 13L, 4L, 5L, 14L, 6L, 15L, 7L, 8L), dim = c(14L, 2L)), edge.length = c(0.0341921975, 
5e-09, 0.12821348, 0.000367458500000008, 0.027617765, 0.037677039, 
0.028633124, 0.014468092, 5e-09, 0.009763081, 0.078168769, 0.021640684, 
0.341568464, 0.092957415), Nnode = 7L, node.label = c("root", 
"0.917", "", "0.929", "0.921", "0.302", "0.692"), tip.label = c("'AB109881.1 Uncultured archaeon gene for 16S rRNA, partial sequence, clone:pMLA-4'", 
"'AB109880.1 Uncultured archaeon gene for 16S rRNA, partial sequence, clone:pMLA-3'", 
"'AB109883.1 Uncultured archaeon gene for 16S rRNA, partial sequence, clone:pMLA-6'", 
"'AB109879.1 Uncultured archaeon gene for 16S rRNA, partial sequence, clone:pMLA-2'", 
"'AB109884.1 Uncultured archaeon gene for 16S rRNA, partial sequence, clone:pMLA-7'", 
"'AB109878.1 Uncultured archaeon gene for 16S rRNA, partial sequence, clone:pMLA-1'", 
"'AB109882.1 Uncultured archaeon gene for 16S rRNA, partial sequence, clone:pMLA-5'", 
"'AB109885.1 Uncultured archaeon gene for 16S rRNA, partial sequence, clone:pMLA-8'"
)), class = "phylo", order = "cladewise")

Solution

  • I'm not sure exactly what you are trying to do, but it seems like you want to join these two datasets. In that case, you need a key in colwell_tree (derived from the tip labels) that matches colwell_taxonomy$sample.id exactly, so that it can be used for the join.

    Some ideas for how you can isolate colwell_taxonomy$sample.id from colwell_tree$tip.label:

    library(tidyverse, quietly = TRUE)

    # Example tip label copied from colwell_tree$tip.label. Note that the label
    # itself begins and ends with a literal single quote, so the accession is
    # not the very first character of the string.
    text <- "'AB109880.1 Uncultured archaeon gene for 16S rRNA, partial sequence, clone:pMLA-3'"

    # Option 1: extract the accession (capital letters, digits, ".", version)
    # as text with a regular expression. This avoids hard-coding the "AB"
    # prefix and avoids converting the accession to a number and back, which
    # would drop a trailing zero in the version (e.g. ".10" -> ".1").
    # str_extract() is vectorised, so it can also be applied to the whole
    # colwell_tree$tip.label vector at once.
    option_1 <- str_extract(text, "[A-Z]+[0-9]+\\.[0-9]+")
    option_1

    # Option 2: take the first space-separated word. The leading single quote
    # must be removed first; otherwise the result is "'AB109880.1", which never
    # equals any value in colwell_taxonomy$sample.id.
    option_2 <- str_split(str_remove(text, "^'"), " ")[[1]][1]
    option_2