Search code examples
r igraph ggraph

igraph node annotation with external annotation information


In a previous question (now solved), my node colors came from modules, and I wanted to label them. Now I would like to know how to add annotations when I have a separate node file. I would like my graph to look something like this (or a similar figure), where the network is annotated based on the functional enrichment of the genes.

This is my basic code to generate the base graph from my files:

 # Load the STRING interaction table and give its first column a stable name.
net_table <- read.csv(
  "TGCA_subtype_figure3/string_interactions.tsv",
  sep = "\t",
  check.names = FALSE
)
head(net_table)
names(net_table)[1] <- "node1"

# Keep only the two endpoints and the coexpression score of each interaction.
net_table_filter <- select(net_table, node1, node2, coexpression)

# Load the functional annotation table; its first column holds the gene name.
net_func <- read.csv(
  "TGCA_subtype_figure3/string_functional_annotations.tsv",
  sep = "\t",
  check.names = FALSE
)
head(net_func)
names(net_func)[1] <- "name"

# Edge list in the source / target / weight layout.
edges <- setNames(net_table_filter, c("source", "target", "weight"))

# Node table: gene name plus its term description (stored as `id`),
# keeping only the first annotation row found for each gene.
nodes <- select(net_func, name, id = `term description`)
nodes <- nodes[!duplicated(nodes$name), ]

# Undirected graph built from the edge list and the node table.
g <- graph_from_data_frame(d = edges, vertices = nodes, directed = FALSE)
g

Plot the data

# Inspect the graph. Each statement sits on its own line so that it is
# actually evaluated; when collapsed onto one line, everything after the
# first `#` is treated as a comment and only `V(g)` runs.
V(g)            # nodes
V(g)$name       # names of each node
vertex_attr(g)  # all attributes of the nodes
E(g)            # edges
E(g)$weight     # weights for each edge
edge_attr(g)    # all attributes of the edges
g[]             # adjacency matrix

# Draw the graph with grey nodes, black labels and curved dark-grey edges.
plot(
  g,
  edge.color = "grey20",        # dark grey edges
  edge.curved = .25,            # bend each edge by 25%
  vertex.color = "grey",        # fill colour of the nodes
  vertex.label.cex = .75,       # labels at 75% of the default size
  vertex.label.color = "black"  # colour of the label text
)

The output I get is this, which does not look very good. My questions are:

How do I use the string_functional_annotations.tsv information to annotate the network?

I see many nodes that are not connected. Is there a way to remove nodes that have no interactions, or to not render them in the final network?

How I got here: I have a list of genes that I used as input for STRING-db, and I then downloaded two files: one is the network and the other is the functional enrichment. I tried to format my data based on this tutorial.

Any help or suggestion would be really appreciated

DATA UPDATE Nodes

structure(list(name = c("A2ML1", "A4GNT", "AARD", "ABCC6", "ABCG1", 
"ABHD6", "ACACB", "ACCS", "ACOT12", "ACTL8", "ACTN1", "ACTN2", 
"ACTR3C", "ACVR2A", "ADAD2", "ADAM28", "ADAM33", "ADAMTS12", 
"ADAMTS4", "ADAMTS6", "ADAMTSL4", "ADD3", "ADGB", "ADRB2", "ADRB3", 
"AGBL4", "AJUBA", "AKAP7", "AKR1B1", "ALDH3B1", "ALDH4A1", "AMDHD1", 
"AMHR2", "AMOT", "ANKEF1", "ANKRD1", "ANKRD20A1", "ANO4", "ANO9", 
"ANXA10", "AP1M2", "AP1S2", "APMAP", "APOL3", "AQP12B", "AQP3", 
"AQP4", "ARAP2", "ARHGAP10", "ARHGAP20", "ARHGAP28", "ARHGAP4", 
"ARHGEF15", "ARID5B", "ARL9", "ARMC5", "ARSH", "ASAP3", "ASIC2", 
"ASNS", "ASTL", "ATP1A3", "ATP2B2", "ATP8A1", "AVPR2", "B4GALT2", 
"B4GALT6", "B9D1", "BAGE5", "BCAR1", "BCAS4", "BCL9", "BCO1", 
"BDH1", "BMI1", "BPIFB3", "BSG", "BTBD17", "BTN1A1", "C10orf55", 
"C11orf96", "C15orf56", "C15orf65", "C17orf64", "C19orf33", "C1QTNF6", 
"C1orf53", "C2orf72", "C2orf80", "C3orf84", "C4B", "C4BPB", "C4orf45", 
"C4orf48", "C4orf50", "C4orf51", "C5orf46", "CA13", "CA7", "CACNA2D1"
), id = c("Negative regulation of metabolic process", "Carbohydrate metabolic process", 
"Multicellular organism development", "System process", "Regulation of peptide secretion", 
"Lipid metabolic process", "Organic acid metabolic process", 
"Organic acid metabolic process", "Organic acid metabolic process", 
"Organelle organization", "Cell morphogenesis", "MAPK cascade", 
"Organelle organization", "Reproduction", "Nucleobase-containing compound metabolic process", 
"Reproduction", "Proteolysis", "Proteoglycan metabolic process", 
"Skeletal system development", "Blood vessel development", "Epithelial cell development", 
"Transport", "Proteolysis", "Temperature homeostasis", "Regulation of protein phosphorylation", 
"Immune effector process", "G2/M transition of mitotic cell cycle", 
"Action potential", "Reproduction", "Cell activation", "Cellular aldehyde metabolic process", 
"Organic acid metabolic process", "Reproduction", "Angiogenesis", 
"Binding", "Negative regulation of transcription by rna polymerase ii", 
"Plasma membrane", "Transport", "Transport", "Binding", "Immune system process", 
"Immune system process", "Metabolic process", "Nitrogen compound metabolic process", 
"Transport", "Response to hypoxia", "Immune system process", 
"Cell communication", "Organelle organization", "Cell communication", 
"Cell communication", "Regulation of cell growth", "Eye development", 
"Reproduction", "Nucleotide binding", "Intracellular", "Catalytic activity", 
"Movement of cell or subcellular component", "Nervous system process involved in regulation of systemic arterial blood pressure", 
"Response to acid chemical", "Reproduction", "Transport", "System process", 
"Cell activation", "Cytokine production", "System process", "Cell morphogenesis", 
"Eye development", "Extracellular region", "Regulation of cell growth", 
"Intracellular", "Regulation of transcription, dna-templated", 
"Retinoid metabolic process", "Generation of precursor metabolites and energy", 
"Negative regulation of transcription by rna polymerase ii", 
"Immune system process", "Reproduction", "Response to external stimulus", 
"Regulation of cytokine production", "Mixed, incl. zinc finger, c2h2 type, and prespliceosome", 
"Mixed, incl. olfactory receptor, and krueppel-associated box", 
"Anthropometric measurement", "Mixed, incl. williams-beuren syndrome, and cell cycle regulatory protein", 
"Mixed, incl. rab-gtpase-tbc domain, and keratin, high sulfur b2 protein", 
"Intracellular", "Binding", "Mostly uncharacterized, incl. akirin, and split hand-foot malformation 1", 
"Mostly uncharacterized, incl. ly-6 antigen/upa receptor-like, and acetylcholine receptor regulator activity", 
"Mostly uncharacterized, incl. magnesium ion transmembrane transport, and putative golgin subfamily a member 2-like protein 5", 
"Mostly uncharacterized, incl. phospholipid translocation, and domain of unknown function duf4210", 
"Response to molecule of bacterial origin", "Adaptive immune response", 
"Mixed, incl. b-box-type zinc finger, and zinc finger, ring-type", 
"Extracellular region", "Pulmonary function measurement", "Mixed, incl. mfs transporter superfamily, and tlv/env coat polyprotein", 
"Extracellular region", "One-carbon metabolic process", "One-carbon metabolic process", 
"Action potential")), row.names = c(1L, 288L, 389L, 406L, 900L, 
1242L, 1453L, 1828L, 1875L, 1988L, 2046L, 2438L, 3075L, 3175L, 
3557L, 3626L, 3743L, 3839L, 4010L, 4168L, 4297L, 4465L, 4698L, 
4754L, 5278L, 5465L, 5708L, 6049L, 6216L, 6572L, 6749L, 6942L, 
7041L, 7256L, 7540L, 7568L, 7960L, 7999L, 8114L, 8265L, 8305L, 
8510L, 8824L, 8890L, 8990L, 9021L, 9206L, 9436L, 9554L, 9676L, 
9741L, 9846L, 10034L, 10186L, 10492L, 10524L, 10669L, 10740L, 
10893L, 11141L, 11446L, 11574L, 12118L, 12370L, 12676L, 13056L, 
13244L, 13455L, 13852L, 13870L, 14131L, 14155L, 14326L, 14457L, 
14599L, 14853L, 14891L, 15236L, 15274L, 15407L, 15420L, 15429L, 
15438L, 15444L, 15455L, 15489L, 15536L, 15542L, 15546L, 15553L, 
15559L, 15844L, 15983L, 16008L, 16018L, 16050L, 16062L, 16084L, 
16139L, 16245L), class = "data.frame")

Edges

structure(list(source = c("A2ML1", "A2ML1", "ABCG1", "ABCG1", 
"ABCG1", "ABCG1", "ABCG1", "ABHD6", "ACACB", "ACACB", "ACACB", 
"ACACB", "ACACB", "ACACB", "ACOT12", "ACOT12", "ACTL8", "ACTL8", 
"ACTN1", "ACTN1", "ACTN1", "ACTN1", "ACTN1", "ACTN1", "ACTN1", 
"ACTN1", "ACTN1", "ACTN1", "ACTN1", "ACTN1", "ACTN1", "ACTN1", 
"ACTN1", "ACTN1", "ACTN2", "ACTN2", "ACTN2", "ACTN2", "ACTN2", 
"ACTN2", "ACTN2", "ACTN2", "ACTN2", "ACTN2", "ACTN2", "ACTN2", 
"ACTN2", "ACTN2", "ACTR3C", "ACVR2A", "ACVR2A", "ACVR2A", "ACVR2A", 
"ACVR2A", "ADAD2", "ADAD2", "ADAD2", "ADAD2", "ADAM33", "ADAM33", 
"ADAMTS12", "ADAMTS12", "ADAMTS12", "ADAMTS4", "ADAMTS4", "ADAMTS4", 
"ADAMTS4", "ADAMTS4", "ADAMTS4", "ADAMTS6", "ADAMTS6", "ADAMTS6", 
"ADAMTS6", "ADAMTS6", "ADAMTS6", "ADAMTSL4", "ADAMTSL4", "ADAMTSL4", 
"ADAMTSL4", "ADAMTSL4", "ADGB", "ADGB", "ADGB", "ADGB", "ADRB2", 
"ADRB2", "ADRB2", "ADRB2", "ADRB2", "ADRB2", "ADRB2", "ADRB2", 
"ADRB2", "ADRB3", "ADRB3", "ADRB3", "AGBL4", "AJUBA", "AJUBA", 
"AKAP7"), target = c("C4orf51", "EPPK1", "LRP2", "NPC1L1", "DHCR7", 
"JAM2", "PLTP", "CNR1", "G6PC", "ACOT12", "NXNL2", "LPIN1", "ME3", 
"ELOVL6", "SLCO1A2", "ACACB", "C5orf46", "C4orf51", "ITGB4", 
"DAPK2", "KIF1B", "MYH15", "GRIA4", "ITGB6", "MICALL2", "MYOZ2", 
"CSRP2", "ITGA11", "FERMT2", "MYLK", "FSCN1", "OAS1", "BCAR1", 
"ACTN2", "MYOM2", "GRIA4", "MICALL2", "MYOZ2", "CSRP2", "MEF2C", 
"FERMT2", "MYLK", "KLHL31", "ANKRD1", "AQP4", "ACTN1", "BCAR1", 
"SCN5A", "LRRC61", "DUSP2", "SMAD6", "ENG", "INHBB", "MSTN", 
"ENO4", "GABRA3", "M1AP", "ADGB", "LTC4S", "ADRB2", "ADAMTS4", 
"ADAMTSL4", "ADAMTS6", "EFEMP2", "MATN3", "ERMN", "ADAMTS6", 
"ADAMTS12", "ADAMTSL4", "ARHGAP20", "ASAP3", "ADAMTS4", "ADAMTSL4", 
"NRSN2", "ADAMTS12", "LTBP3", "ADAMTS4", "FBN3", "ADAMTS12", 
"ADAMTS6", "ADAD2", "M1AP", "HIVEP3", "IQCA1", "OPRD1", "GABBR2", 
"ADAM33", "F2R", "AVPR2", "CNR1", "EDN2", "ADRB3", "BSG", "MYOM2", 
"ADRB2", "CEBPA", "SYN3", "WWTR1", "TNFSF4", "CDCP2"), weight = c(0, 
0, 0, 0.123, 0, 0, 0.062, 0, 0.063, 0.065, 0, 0.064, 0.062, 0.107, 
0.218, 0.065, 0, 0, 0.062, 0.062, 0, 0.095, 0, 0.111, 0.065, 
0.265, 0.362, 0, 0.085, 0.183, 0.073, 0, 0.069, 0, 0.406, 0.098, 
0, 0.49, 0.362, 0.077, 0.062, 0.183, 0.421, 0.146, 0.332, 0, 
0.06, 0.081, 0.052, 0, 0.062, 0, 0, 0.062, 0, 0, 0, 0, 0, 0, 
0.062, 0.062, 0.076, 0, 0, 0.083, 0, 0.062, 0, 0, 0, 0, 0.058, 
0, 0.076, 0.064, 0, 0, 0.062, 0.058, 0, 0, 0, 0.053, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.052, 0.104, 0, 0)), row.names = c(NA, 
100L), class = "data.frame")

my image


Solution

  • PesKchan, in the absence of reproducible data (I suggest using dput() as you did previously), I am going to respond using the data set from your earlier question and build on that answer. I draw heavily on the response to "Repel text from edges in network", so if this works for you, please consider upvoting that original idea. The approach that @JBGruber came up with is to create new nodes from the labels and "let the network layout algorithm do the work".

    library(igraph)
    library(ggraph)
    library(dplyr)
    
    dd <- 
    structure(list(gene1 = c("GBA3", "GBA3", "GBA3", "GBA3", "GBA3", 
                             "GBA3", "GBA3", "GBA3", "GBA3", "GBA3", "GBA3", "GBA3", "GBA3", 
                             "GBA3", "GBA3", "IGHV3-52", "IGHV3-52", "IGHV3-52", "IGHV3-52", 
                             "IGHV3-52", "IGHV3-52", "IGHV3-52", "IGHV3-52", "IGHV3-52", "IGHV3-52", 
                             "IGHV3-52", "IGHV3-52", "IGHV3-52", "IGHV3-52", "IGHV3-52", "GGNBP1", 
                             "GGNBP1", "GGNBP1", "GGNBP1", "GGNBP1", "GGNBP1", "GGNBP1", "GGNBP1", 
                             "GGNBP1", "GGNBP1", "GGNBP1", "GGNBP1", "GGNBP1", "GGNBP1", "GGNBP1", 
                             "OR52B6", "OR52B6", "OR52B6", "OR52B6", "OR52B6", "OR52B6", "OR52B6", 
                             "OR52B6", "OR52B6", "OR52B6", "OR52B6", "OR52B6", "OR52B6", "OR52B6", 
                             "OR52B6"), gene2 = c("LRP2BP", "ADGB", "ASNSP3", "HSD17B2", "HSP90B1", 
                                                  "IFT22", "P4HB", "TTC22", "XKR9", "IQSEC2", "NECAB2", "ANO1", 
                                                  "CPPED1", "MAGEE1", "MAPRE3", "COTL1P1", "OR13G1", "FTH1P11", 
                                                  "KRT8P44", "LINC00243", "MYOZ1", "PARD6G", "PDLIM5", "RN7SL67P", 
                                                  "PARP3", "SH3BGRL3", "KIF1B", "CDK6", "CYP24A1", "TFEB", "LRP2BP", 
                                                  "ADGB", "ASNSP3", "HSD17B2", "HSP90B1", "IFT22", "P4HB", "TTC22", 
                                                  "XKR9", "IQSEC2", "NECAB2", "ANO1", "CPPED1", "MAGEE1", "MAPRE3", 
                                                  "COTL1P1", "OR13G1", "FTH1P11", "KRT8P44", "LINC00243", "MYOZ1", 
                                                  "PARD6G", "PDLIM5", "RN7SL67P", "PARP3", "SH3BGRL3", "KIF1B", 
                                                  "CDK6", "CYP24A1", "TFEB"), correlation = c(1.19842058210312e-07, 
                                                                                              3.95592260312023e-09, 1.18879994893077e-09, 3.67331679745971e-10, 
                                                                                              5.48302012245219e-09, 7.97197389702251e-06, 9.7387584019434e-08, 
                                                                                              5.77878345171157e-08, 1.01118703571283e-08, 1.81543845754574e-07, 
                                                                                              3.7673420265534e-08, 1.02575704450652e-08, 4.82487451740043e-08, 
                                                                                              1.65401803325697e-07, 2.95827225165244e-09, 1.35635056964288e-07, 
                                                                                              1.16813988688191e-09, 1.34340296981193e-07, 5.26153755948588e-08, 
                                                                                              5.06031471203736e-05, 1.63465042896832e-09, 2.10400523574347e-09, 
                                                                                              1.08460550923374e-08, 1.09938266167239e-06, 3.31572488037795e-08, 
                                                                                              3.97957891649769e-07, 2.0833042793021e-08, 4.16797585733493e-06, 
                                                                                              1.02162139939232e-07, 3.74962089757379e-06, 5.10285758466629e-07, 
                                                                                              0.000165189152741692, 0.000572780674091671, 2.43056928465514e-07, 
                                                                                              0.00166978419035755, 2.3826397075692e-07, 0.000204964046470693, 
                                                                                              1.32648351252772e-06, 2.79759921075308e-06, 1.11117833192239e-06, 
                                                                                              6.87171744654038e-09, 2.33022551088771e-09, 2.7732284839245e-06, 
                                                                                              1.74867497254059e-06, 1.16457488078883e-08, 2.58493584273799e-05, 
                                                                                              0.000117632422231583, 0.000115191350816912, 3.45926695804785e-05, 
                                                                                              6.60444623946169e-07, 8.48280303856373e-09, 9.3470012463335e-07, 
                                                                                              2.33358874243648e-05, 9.13982092399789e-05, 6.16545562787355e-06, 
                                                                                              0.0014007113940871, 1.549339320847e-06, 0.000373320941277797, 
                                                                                              2.87750585085082e-08, 0.00105876974504533), module1 = structure(c(9L, 
                                                                                                                                                                9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
                                                                                                                                                                9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 14L, 14L, 
                                                                                                                                                                14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 
                                                                                                                                                                14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 
                                                                                                                                                                14L, 14L), .Label = c("black", "blue", "brown", "cyan", "green", 
                                                                                                                                                                                      "greenyellow", "grey", "magenta", "midnightblue", "pink", "purple", 
                                                                                                                                                                                      "red", "salmon", "tan", "turquoise", "yellow"), class = "factor"), 
                   module2 = structure(c(3L, 7L, 2L, 7L, 1L, 4L, 1L, 3L, 5L, 
                                         15L, 15L, 7L, 15L, 3L, 15L, 15L, 2L, 15L, 2L, 3L, 7L, 16L, 
                                         15L, 11L, 15L, 15L, 15L, 2L, 3L, 15L, 3L, 7L, 2L, 7L, 1L, 
                                         4L, 1L, 3L, 5L, 15L, 15L, 7L, 15L, 3L, 15L, 15L, 2L, 15L, 
                                         2L, 3L, 7L, 16L, 15L, 11L, 15L, 15L, 15L, 2L, 3L, 15L), .Label = c("black", 
                                                                                                            "blue", "brown", "cyan", "green", "greenyellow", "grey", 
                                                                                                            "magenta", "midnightblue", "pink", "purple", "red", "salmon", 
                                                                                                            "tan", "turquoise", "yellow"), class = "factor")), row.names = c(NA, 
                                                                                                                                                                             -60L), class = c("tbl_df", "tbl", "data.frame"))
    
    library(igraph)
    library(ggraph)
    library(dplyr)
    
    
    # Build one vertex table: every gene that appears on either side of an
    # edge in `dd`, together with the module colour recorded next to it.
    df1 <- dd %>%
      select(gene = gene1, color = module1)
    df2 <- dd %>%
      select(gene = gene2, color = module2)
    df_verts <- unique(rbind(df1, df2))

    # Choose the genes to label. A random draw is used just to create this
    # example, so the labelled genes differ between runs. The result is kept
    # in `labelled_genes` so that base::sample() is not shadowed.
    labelled_genes <- sample(df_verts$gene, 8)
    label_ids <- paste0("Label", seq_along(labelled_genes))

    # Add the labels as separate vertices: each label vertex has no colour
    # and carries the gene name it annotates in `label`. Ordinary gene
    # vertices get `label = NA`, which the plot uses to tell the two apart.
    df_verts_lab <- tibble(gene = label_ids, color = NA, label = labelled_genes)
    df_verts$label <- NA
    df_verts <- rbind(df_verts, df_verts_lab)

    # Flag the existing relationships as non-label edges (`lab = 0`) and
    # give them an edge colour.
    rel_base <- dd[, 1:2] %>%
      mutate(lab = 0, color = "steelblue")

    # One extra edge per label, connecting the gene to its label vertex, so
    # the layout algorithm places each label next to its gene.
    rel_lab <- tibble(
      gene1 = labelled_genes,
      gene2 = label_ids,
      lab = 1,
      color = "grey"
    )

    rel <- rbind(rel_base, rel_lab)

    g2 <- graph_from_data_frame(rel, vertices = df_verts)

    # Points are drawn only for gene vertices and text only for label
    # vertices. The edge `color` column holds literal colour names, so an
    # identity scale is needed for them to be drawn as steelblue / grey
    # rather than being remapped to the default palette.
    ggraph(g2, layout = "igraph", algorithm = "fr") +
      geom_node_point(
        aes(color = color, filter = is.na(label)),
        shape = 20,
        size = 1
      ) +
      geom_edge_link(aes(edge_color = color), width = 0.1) +
      scale_edge_color_identity() +
      geom_node_text(aes(label = label, color = color, filter = !is.na(label))) +
      theme_void() +
      theme(legend.position = "none")
    

    enter image description here