Tags: python, image, matplotlib, plot, graphics

How can I draw a line along my graph to show me where the data density is the highest?


import heapq

from astropy.io import fits
import matplotlib.pyplot as plt

datos = fits.open('/home/citlali/Documentos/Servicio/Lista.fits')
data = datos[1].data


#[SIII] 9532 line
Mask_1 = data['flux_[SIII]9531.1_Re_fit'] / data['e_flux_[SIII]9531.1_Re_fit'] > 5
newdata1 = data[Mask_1]
dat_flux = newdata1['flux_[SIII]9069.0_Re_fit']
dat_eflux = newdata1['e_flux_[SIII]9069.0_Re_fit']
Mask_2 = dat_flux / dat_eflux > 5
newdata2 = newdata1[Mask_2]


H1_alpha = newdata1['log_NII_Ha_Re']
H1_beta = newdata1['log_OIII_Hb_Re']
H2_alpha = newdata2['log_NII_Ha_Re']
H2_beta = newdata2['log_OIII_Hb_Re']


M = H1_alpha < -0.9
newx = H1_alpha[M]
newy = H1_beta[M]  
ex = newx 
ey = newy 
#print("Elements of SIII [9532]: ", len(newx))
m = H2_alpha < -0.9
newxm = H2_alpha[m]
newym = H2_beta[m] 
#print("Elements of SIII [9069]: ", len(newxm))

sm = heapq.nsmallest(3000, zip(newx, newy)) # zip them to sort together
newx, newy = zip(*sm) # unzip them

plt.figure()
plt.plot(H1_alpha, H1_beta, '*', color='darkred', markersize=7, label="SIII [9532]")
plt.plot(H2_alpha, H2_beta, '.', color='rosybrown', markersize=3, label="SIII [9069]")
plt.xlim(-1.5, 0.75)
plt.ylim(-1, 1) 
plt.title('Diagnostic diagram')
plt.ylabel('OIII/Hbeta')
plt.xlabel('NII/Halpha')
plt.grid()
plt.legend()
fig = plt.gcf()
fig.set_size_inches(8, 6)
plt.show()

In the figure I show my plot, and the black line is what I want to obtain.

The code reads downloaded data that I have plotted; it shows galaxies with a signal-to-noise ratio greater than 5. The data to consider for fitting the line should be H1_alpha, H1_beta and/or H2_alpha, H2_beta.


Solution

  • The example below, using a test dataset, shows how you can derive sample weights based on density and then supply those weights to a curve-fitting algorithm.

    The example dataset is a quadratic-like curve/density embedded in noise:

    [figure: the test dataset, a quadratic-like dense curve embedded in noise]

    We can model the density using sklearn's GaussianMixture. It provides a score for each sample, where higher scores correspond to the more probable (i.e. denser) areas.

    [figure: density contours of the fitted GaussianMixture overlaid on the data]
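    As a minimal standalone sketch (toy data, not the answer's dataset) of what score_samples() gives you: it returns the log-density at each sample, so exponentiating it yields values that are largest in the densest regions, which is exactly what we will use as weights.

    import numpy as np
    from sklearn.mixture import GaussianMixture

    rng = np.random.default_rng(0)
    dense_blob = rng.normal(0.0, 0.1, size=(200, 2))    # tight, dense cluster
    sparse_noise = rng.uniform(-3, 3, size=(50, 2))     # scattered background
    pts = np.concatenate([dense_blob, sparse_noise])

    gmm = GaussianMixture(n_components=5, random_state=0).fit(pts)
    weights = np.exp(gmm.score_samples(pts))            # high where the data is dense

    print(weights[:200].mean() > weights[200:].mean())  # True: blob points score higher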

    Next, we use scipy's curve_fit() to fit a quadratic curve using the previously-computed sample weights. curve_fit needs the inverse of the weights (error bars), so we supply it with sigma=1/weights:

    [figure: the density-weighted quadratic curve fitted with curve_fit()]
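    To see concretely how sigma acts as a per-point weight: curve_fit minimizes the sum of (residual / sigma)^2, so points with small sigma (i.e. large weight) dominate the fit. A tiny toy sketch, separate from the dataset above:

    import numpy as np
    from scipy.optimize import curve_fit

    def line(x, a, b):
        return a * x + b

    x = np.array([0., 1., 2., 3., 4.])
    y = np.array([0., 1., 2., 10., 11.])          # last two points are off the trend
    weights = np.array([1., 1., 1., 0.01, 0.01])  # down-weight the off-trend points

    (a_w, b_w), _ = curve_fit(line, x, y, sigma=1 / weights)
    (a_u, b_u), _ = curve_fit(line, x, y)

    print(a_w, b_w)  # close to the trend of the first three points (slope ~1, intercept ~0)
    print(a_u, b_u)  # the unweighted fit is pulled toward the off-trend points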

    Overlaying the individual steps:

    [figure: data, density contours, and fitted curve overlaid]

    import numpy as np
    import matplotlib.pyplot as plt
    
    #
    #Make some test data - a quadratic curve embedded in noise
    #
    from sklearn.datasets import make_moons
    X1, y1 = make_moons(500, noise=0.7, random_state=0)
    X2, y2 = make_moons(500, noise=0.4, random_state=1)
    X3, y3 = make_moons(600, noise=0.08, random_state=2)
    X = np.concatenate([X1, X2, X3[y3==0]], axis=0)
    
    data_xy = X
    data_x, data_y = X[:, 0], X[:, 1]
    
    #View the data
    plt.scatter(data_x, data_y, marker='s', s=1)
    plt.gca().set(xlabel='x', ylabel='y')
    plt.gcf().set_size_inches(8, 3)
    
    #
    #Model the density and view the results.
    #
    
    # Increasing "n_components" gives you a more granular/patchy-looking fit; tune as needed.
    from sklearn.mixture import GaussianMixture
    gmm = GaussianMixture(n_components=20, random_state=0).fit(data_xy)
    
    #Define a regular XY grid
    x_axis_fine = np.linspace(data_x.min(), data_x.max())
    y_axis_fine = np.linspace(data_y.min(), data_y.max())
    xx, yy = np.meshgrid(x_axis_fine, y_axis_fine)
    
    log_scores = gmm.score_samples(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    scores = np.exp(log_scores)
    
    plt.contour(xx, yy, scores, levels=6, cmap='plasma_r', alpha=1)
    
    #Get the density score for each datapoint
    # we can optionally scale them to interpret them as weights summing to 1
    data_scores = np.exp(gmm.score_samples(data_xy))
    data_scores /= data_scores.sum()
    
    
    #
    #Define and fit a curve using scipy's curve_fit
    #
    from scipy.optimize import curve_fit
    
    def quadratic_func(x, A, B, C):
        #Given x and coefficients (A, B, C): y = Ax^2 + Bx + C
        return A*x**2 + B*x + C
    
    #Find optimal parameters
    (A_opt, B_opt, C_opt), _ = curve_fit(quadratic_func, data_x, data_y, sigma=1 / data_scores)
    
    plt.plot(
        x_axis_fine, quadratic_func(x_axis_fine, A_opt, B_opt, C_opt),
        color='tab:brown', lw=10, alpha=0.3, label='curve_fit()'
    )
    plt.gca().legend()
    
    #Optional formatting
    ax = plt.gca()
    ax.spines['left'].set_bounds([-3, 2])
    ax.spines['bottom'].set_bounds([-2, 3])
    ax.spines[['top', 'right']].set_visible(False)
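
    The same recipe should carry over to the question's own data. A rough sketch (untested against the FITS file, and assuming a quadratic is a reasonable shape for the dense ridge) using the H2_alpha and H2_beta arrays from the question's code:

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.mixture import GaussianMixture
    from scipy.optimize import curve_fit

    #H2_alpha and H2_beta come from the question's code above
    xy = np.column_stack([H2_alpha, H2_beta])
    xy = xy[np.isfinite(xy).all(axis=1)]            # drop NaN/inf rows before fitting

    gmm = GaussianMixture(n_components=20, random_state=0).fit(xy)
    w = np.exp(gmm.score_samples(xy))
    w /= w.sum()

    def quad(x, A, B, C):
        return A*x**2 + B*x + C

    (A_opt, B_opt, C_opt), _ = curve_fit(quad, xy[:, 0], xy[:, 1], sigma=1 / w)

    x_line = np.linspace(xy[:, 0].min(), xy[:, 0].max(), 200)
    plt.plot(x_line, quad(x_line, A_opt, B_opt, C_opt), 'k-', lw=2, label='density ridge')
    plt.legend()
    plt.show()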