Hi there, I just managed to get this plot done in Bokeh, so I imagine there are many things that could be improved. Nonetheless, what bothers me the most is that I cannot figure out how to show a legend entry for each of my eight populations in the UMAP plot.
Right now the legend shows only one entry, which I set through `legend_label`, and I don't know whether it is associated with the correct population.
What I actually want to show is a legend with all eight populations (EUR, SIB, AFR, SAS, CEA, OCE, MENA and AME) and their associated colors. See below for the code I used and an example of the plot. Any help is appreciated!
import numpy as np
import pandas as pd
import plotly.express as px
import bokeh.plotting as bp
from bokeh.plotting import ColumnDataSource, figure, show
from umap import UMAP
# --- Load the input tables -------------------------------------------------
# Eigenvector table (tab-separated), per-sample population labels, per-sample
# colours, and the eigenvalues (no header row in that file).
umap = pd.read_csv("SGDP_download/SGDP_bi_snps_norm-2.eigenvec", sep="\t")
loc = pd.read_csv("SGDP_download/pca_loc_fix_python-order.txt")
colors = pd.read_csv("SGDP_download/bokeh_colors.txt")
eigenval = pd.read_csv("SGDP_download/SGDP_bi_snps_norm-2.eigenval", header=None)

# Share of the eigenvalue total carried by each row, as a percentage rounded
# to two decimals.
pve = (eigenval.div(eigenval.sum(axis=0)) * 100).round(2)
pve.head()

# --- Tidy the eigenvector table --------------------------------------------
# Normalise the ID header, order the samples by ID, then slot the population
# labels in as the second column and strip the leading '#' from both headers.
# NOTE(review): `loc` is attached after the sort -- confirm that its rows line
# up with the sorted sample IDs.
umap = umap.rename(columns={"#IID": "#ID"})
umap = umap.sort_values('#ID')
umap.insert(1, '#LOC', loc)
umap = umap.rename(columns={'#ID': 'ID', '#LOC': 'LOC'})

# --- 2-D UMAP embedding ----------------------------------------------------
# Columns at positions 2..11 are fed to UMAP (presumably the first ten
# principal components -- verify against the .eigenvec layout).
regions_umap = umap.iloc[:, 2:12]
umap_plot = UMAP(n_components=2, init="random", random_state=15)
umap_proj = umap_plot.fit_transform(regions_umap)

# --- Plotting table --------------------------------------------------------
# Column order: population, color, UMAP1, UMAP2; rows are indexed by sample ID.
df = pd.DataFrame(umap_proj, columns=['UMAP1', 'UMAP2'])
df.insert(0, 'population', loc)
df.insert(1, 'color', colors)
df.index = umap["ID"]
source = ColumnDataSource(df)
df
# Comma-separated Bokeh tool names to expose in the plot toolbar.
TOOLS = "hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,examine,help"
fig = figure(tools=TOOLS, x_axis_label='UMAP1', y_axis_label='UMAP2')

# Draw from `source` and refer to its columns by name.  `legend_label` takes a
# literal string, so legend_label='population' yields exactly one legend entry
# reading "population".  `legend_group` instead names a column of the data
# source and creates one legend entry per distinct value in that column, each
# shown with the colour of the points that carry that value.
fig.scatter(
    x='UMAP1',
    y='UMAP2',
    color='color',
    size=5,
    legend_group='population',
    fill_alpha=0.6,
    line_color=None,
    source=source,
)
fig.legend.location = "top_left"
fig.legend.title = "metapopulations"
show(fig)
P.S. As a side note, is it possible to have the legend at the bottom of the plot with the legend title centered?
EDIT this is what the df looks like @droumis
If you want a legend entry for each unique key in your "population" column, the best approach is to use `groupby()` from pandas, loop over the groups, and draw a separate scatter for each one.
See the minimal example below.
import pandas as pd
from bokeh.plotting import show, figure, output_notebook
from bokeh.models import Legend
# Render Bokeh output inline in the notebook.
output_notebook()

# Six toy points: three populations, two points each, with one colour per
# population.
df = pd.DataFrame(
    {
        'UMAP1': [1, 2, 3, 4, 5, 6],
        'UMAP2': [1, 2, 3, 4, 5, 6],
        'population': ['EUR', 'SIB', 'AME', 'EUR', 'SIB', 'AME'],
        'color': ['#1e90ff', '#bdb76b', '#eeaeee', '#1e90ff', '#bdb76b', '#eeaeee'],
    }
)

p = figure()

# Attach a horizontal legend underneath the plot area before any glyphs are
# drawn, so the scatter calls below add their entries to it.
legend = Legend(orientation='horizontal')
p.add_layout(legend, 'below')

# One scatter call per population, so that each group gets its own legend
# entry labelled with the population name.
grouper = df.groupby('population')
for label, g in grouper:
    p.scatter(
        x=g['UMAP1'],
        y=g['UMAP2'],
        color=g['color'],
        legend_label=label,
    )

show(p)