Tags: python, gaussian-process, gpytorch

Problem reproducing the predicted covariance of a Gaussian process using GPyTorch with the same hyperparameters


I need to build a function that computes the a posteriori covariance of a Gaussian process. The idea is to train a GP using GPyTorch, then take the learned hyperparameters and pass them into my own kernel function (for several reasons I can't use GPyTorch directly).

Now the problem is that I can't reproduce GPyTorch's prediction. Here is the code I wrote. I have been working on it the whole day but I can't find the mistake. Do you know what I am doing wrong?
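For reference, the quantity I am trying to compute is the standard GP predictive variance of the latent function, evaluated via a Cholesky factorization: $$\operatorname{var}(f_*) = K_{**} - K_*^\top (K + \sigma_n^2 I)^{-1} K_*$$, where $$K$$ is the train-train kernel matrix, $$K_*$$ the train-test cross-covariance, $$K_{**}$$ the test-test covariance, and $$\sigma_n^2$$ the learned noise variance.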

        from gpytorch.mlls import ExactMarginalLogLikelihood 
        import numpy as np
        import gpytorch
        import torch
        train_x1 = torch.linspace(0, 0.95, 50) + 0.05 * torch.rand(50)
        train_y1 = torch.sin(train_x1 * (2 * np.pi)) + 0.2 * torch.randn_like(train_x1)

        n_datapoints = train_x1.shape[0]

        def kernel_rbf(x1, x2, c, l):
            # my RBF kernel: c * exp(-||x1 - x2||^2 / (2 * l^2))
            if x1.shape == ():  # promote scalars to 2-D arrays
                x1 = np.atleast_2d(x1)
            if x2.shape == ():
                x2 = np.atleast_2d(x2)
            return c * np.exp(- np.matmul((x1 - x2).T, (x1 - x2)) / (2 * l ** 2))

        class ExactGPModel(gpytorch.models.ExactGP):
            def __init__(self, train_x, train_y, likelihood):
                super().__init__(train_x, train_y, likelihood)

                lengthscale_prior = gpytorch.priors.GammaPrior(3.0, 6.0)
                outputscale_prior = gpytorch.priors.GammaPrior(2.0, 0.15)

                self.mean_module = gpytorch.means.ConstantMean()
                self.covar_module = gpytorch.kernels.ScaleKernel(
                    gpytorch.kernels.RBFKernel(lengthscale_prior=lengthscale_prior),
                    outputscale_prior=outputscale_prior)

            def forward(self, x):
                mean_x = self.mean_module(x)
                covar_x = self.covar_module(x)
                return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

        likelihood = gpytorch.likelihoods.GaussianLikelihood()
        model = ExactGPModel(train_x1, train_y1, likelihood)

        # Find optimal model hyperparameters
        model.train()
        likelihood.train()

        mll = ExactMarginalLogLikelihood(likelihood, model)

        # Use the Adam optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # Includes GaussianLikelihood parameters
        training_iterations = 50
        for i in range(training_iterations):
            optimizer.zero_grad()
            output = model(*model.train_inputs)
            loss = -mll(output, model.train_targets)
            loss.backward()
            print('Iter %d/%d - Loss: %.3f' % (i + 1, training_iterations, loss.item()))
            optimizer.step()

        # Get the learned hyperparameters 
        outputscale = model.covar_module.outputscale.item()
        lengthscale = model.covar_module.base_kernel.lengthscale.item()
        noise = likelihood.noise_covar.noise.item()

        train_x1 = train_x1.numpy()
        train_y1 = train_y1.numpy()

        # Get covariance train points
        K = np.zeros((n_datapoints, n_datapoints))
        for i in range(n_datapoints):
            for j in range(n_datapoints):
                K[i, j] = kernel_rbf(train_x1[i], train_x1[j], outputscale, lengthscale)

        # Add noise
        K += noise ** 2 * np.eye(n_datapoints)

        # Get covariance train-test points
        x_test = torch.rand(1, 1)
        Ks = np.zeros((n_datapoints, 1))
        for i in range(n_datapoints):
            Ks[i] = kernel_rbf(train_x1[i], x_test.numpy(), outputscale, lengthscale)

        # Get variance test points
        Kss = kernel_rbf(x_test.numpy(), x_test.numpy(), outputscale, lengthscale)

        L = np.linalg.cholesky(K)
        v = np.linalg.solve(L, Ks)
        var = Kss - np.matmul(v.T, v)

        model.eval()
        likelihood.eval()
        with gpytorch.settings.fast_pred_var():
            y_preds = likelihood(model(x_test))

        print(f"Predicted variance with gpytorch:{y_preds.variance.item()}")
        print(f"Predicted variance with my kernel:{var}")


Solution

  • I found the errors:

    1. The noise must not be squared: the correct line is K += noise * np.eye(n_datapoints), not K += noise ** 2 * np.eye(n_datapoints), because GPyTorch's GaussianLikelihood already parameterizes the noise as a variance.
    2. I forgot to add the noise term to $$K_{**}$$, i.e. Kss += noise. Since likelihood(model(x_test)) predicts the noisy observation rather than the latent function value, the noise variance must also appear in the test-point variance. The corrected lines are shown below.
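    With both fixes applied, the relevant lines become (a minimal sketch reusing the variables from the question above; all other code is unchanged):

        # GPyTorch's GaussianLikelihood learns the noise *variance*,
        # so it is added to K as-is, not squared:
        K += noise * np.eye(n_datapoints)

        # likelihood(model(x_test)) returns the distribution of the noisy
        # observation, so the noise variance also enters the test-test term:
        Kss = kernel_rbf(x_test.numpy(), x_test.numpy(), outputscale, lengthscale) + noise

        # Predictive variance via the Cholesky factor, exactly as before:
        L = np.linalg.cholesky(K)
        v = np.linalg.solve(L, Ks)
        var = Kss - np.matmul(v.T, v)  # should now match y_preds.variance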