I am trying to investigate the differences between Blood and SF (these are the two levels of a variable called "Tissue"). I have 4 samples from Blood and 4 samples from SF, and the samples are paired. In the plot I am therefore envisioning 4 dots for each tissue type and 4 lines connecting the paired samples across the two tissue types. The y-variable to be plotted is called "freqcd8bytissue". I then want to facet the plot by the levels of the variable "clusters_names".
Please see code below describing the dataset and resulting issues with the plot I obtain:
# data
# Example dataset (dput output): a 16-row tibble with one row per
# sample_id x Tissue x clusters_names combination.
#   sample_id       - four sample identifiers ("PB73-4", "PB74", "PB76", "PB81")
#   Tissue          - "Blood" or "SF"; the x variable in the plots
#   Group           - factor with 7 levels; only levels 2 and 3 occur here
#   clusters_names  - factor with 12 levels; only the first two occur here;
#                     used as the facet variable
#   freqcd8bytissue - numeric value plotted on the y axis
df <- structure(list(sample_id = c("PB73-4", "PB73-4", "PB74", "PB74",
"PB76", "PB76", "PB81", "PB81", "PB73-4", "PB73-4", "PB74", "PB74",
"PB76", "PB76", "PB81", "PB81"), Tissue = c("Blood", "Blood",
"Blood", "Blood", "Blood", "Blood", "Blood", "Blood", "SF", "SF",
"SF", "SF", "SF", "SF", "SF", "SF"), Group = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), levels = c("HC PBMC",
"axSpA PBMC", "axSpA SFMC", "InEx", "PD-1+ TIGIT+", "ReA PBMC",
"ReA SFMC"), class = "factor"), clusters_names = structure(c(2L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L), levels = c("GZMK+CXCR3-high",
"CD69+ITGAE+ TRM.1", "ISG-high", "CD69+ITGAE+ TRM.2", "GZMK+CXCR3-low",
"CD69+ITGAE+ TRM", "GNLY+GZMB+", "Mito-high", "CD69+ITGAE- TRM",
"GZMK+GZMB+", "Proliferating", "TCM-high"), class = "factor"),
CD8TcellCountbytissue = c(6457L, 6457L, 9349L, 9349L, 9349L,
9349L, 9349L, 9349L, 6457L, 6457L, 6457L, 6457L, 6457L, 6457L,
9349L, 9349L), patientcd8clustersizebytissue = c(2076L, 1354L,
1354L, 2076L, 2076L, 1354L, 2076L, 1354L, 1621L, 484L, 484L,
1621L, 1621L, 484L, 484L, 1621L), freqcd8bytissue = c(0.321511537865882,
0.20969490475453, 0.144828323884907, 0.222055834848647, 0.222055834848647,
0.144828323884907, 0.222055834848647, 0.144828323884907,
0.251045377110113, 0.0749574105621806, 0.0749574105621806,
0.251045377110113, 0.251045377110113, 0.0749574105621806,
0.0517702428067173, 0.173387528077869)), row.names = c(NA,
-16L), class = c("tbl_df", "tbl", "data.frame"))
# stats
# Blood vs SF comparison of freqcd8bytissue, run separately for each cluster.
# The Blood and SF measurements come from the same samples, so the test is
# paired. A paired test matches observations by their position within each
# Tissue level, so the rows are sorted by sample_id inside every
# cluster/tissue combination first to guarantee that pairs line up.
stat.test_df <- df %>%
  arrange(clusters_names, Tissue, sample_id) %>%
  group_by(clusters_names) %>%
  pairwise_t_test(
    freqcd8bytissue ~ Tissue,
    paired = TRUE,
    p.adjust.method = "bonferroni"
  ) %>%
  add_significance() %>%
  # Bracket coordinates consumed later by stat_pvalue_manual().
  add_xy_position(x = "Tissue")
# plot
# Tissue on the x axis, per-patient cluster frequency on the y axis,
# one facet panel per cluster.
ggplot(data = df, mapping = aes(x = Tissue, y = freqcd8bytissue)) +
  # Points drawn at their exact (unjittered) positions.
  geom_point(mapping = aes(color = Tissue), alpha = 0.4) +
  # One line per sample_id, joining its values across the tissues.
  geom_line(mapping = aes(group = sample_id)) +
  # A second point layer over the same rows, with small random offsets.
  geom_jitter(mapping = aes(color = Tissue), width = 0.05, height = 0.01) +
  labs(y = "Frequency of CD8 subcluster in all CD8 cells, per patient") +
  # Adjusted p-values from the precomputed test table.
  stat_pvalue_manual(stat.test_df, label = "p.adj") +
  scale_color_manual(values = c("#f2a21e", "#88a0c4")) +
  facet_wrap(~ clusters_names) +
  theme_bw()
This gives me the following plot:
However, in this plot I am seeing more than 4 points for each Tissue type and I see only 3 lines connecting each pair when it should be 4 lines.
I welcome any suggestions to amend the code.
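Three issues:
1. You have to sort the data first, because `geom_line` sorts its data behind the scenes (so that the lines "flow" in the right direction); the points have to be in that same order for the jitter to match.
2. You need to set the same seed for the jittering in both `geom_point` and `geom_line`, so that the points and the line ends coincide.
3. Don't add `geom_jitter` on top of `geom_point`: it draws every point a second time, which is the reason why you see more than 4 points.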
# Paired plot: one jittered point per sample and tissue, a line joining each
# sample's Blood and SF values, one facet panel per cluster.
#
# The point and line layers only receive identical random offsets when both
# use the same seed AND see their rows in the same order. geom_line() orders
# its rows by panel, group and x before the jitter is applied, so the input is
# sorted the same way here: facet variable, grouping variable, x variable.
df %>%
  arrange(clusters_names, sample_id, Tissue) %>%
  ggplot(aes(x = Tissue, y = freqcd8bytissue)) +
  geom_point(
    aes(color = Tissue),
    size = 3,
    # Horizontal jitter only; height = 0 keeps the y values exact.
    position = position_jitter(width = 0.2, height = 0, seed = 1)
  ) +
  geom_line(
    aes(group = sample_id),
    # Same width, height and seed as the point layer so line ends hit the points.
    position = position_jitter(width = 0.2, height = 0, seed = 1)
  ) +
  # No separate geom_jitter() layer: it would draw every observation a second
  # time at a different position.
  ylab("Frequency of CD8 subcluster in all CD8 cells, per patient") +
  stat_pvalue_manual(stat.test_df, label = "p.adj") +
  scale_color_manual(values = c("#f2a21e", "#88a0c4")) +
  facet_wrap(~ clusters_names) +
  theme_bw()