CUDA program using exponential amount of system ram

My particle simulation is using up too much ram. What's concerning to me isn't the amount in and of itself, but the fact that it grows exponentially when I have reason to believe that it should not. I am using CUDA, which is the component I added most recently and thus what I suspect to be causing the issue. I have determined that it is not (only) a problem within the kernel, as the ram usage grows even when the kernel is not being run. I suspect it has to do with the way I am allocating memory, but I do not understand where I went wrong. I apologize for such a trifling question, I am new to CUDA (if that wasn't obvious). Here's the spaghetti in question, thank you for your time.


int main() {
    std::srand(time(0));
    window.setFramerateLimit(limit);
    window.setVerticalSyncEnabled(true);
    sf::Clock clock;
    
    while (window.isOpen()) {
        sf::Event evnt;
        while (window.pollEvent(evnt)) {
            switch (evnt.type) {
            case sf::Event::Closed:
                window.close();
                break;
            case sf::Event::TextEntered:
                if (evnt.text.unicode < 128) {
                    //printf("%c", evnt.text.unicode);
                }
            }
            
        }


        if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::Space)) {
            spawnParticle();
            
        }
        if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::R)) {
            for (auto particle : particleList) {
                delete particle;
            }
            particleList.clear();
        }

        window.clear(sf::Color::Color::Black);
        background.setFillColor(sf::Color::Color(25, 25, 25, 255));
        background.setPosition(-8, -8);
        window.draw(background);

        for (int i = 0; i < particleList.size(); i++) {
            particleList[i]->write(i);

        }


        int Num = particleList.size();

        // Vectors for holding the host-side (CPU-side) data
        float* h_big_algo, * h_big_relationships, * h_location,  * h_destinations, * h_energies, * h_frequencies;
        int* h_N;
        cudaMallocHost(&h_big_algo, Num * Num * 8 * sizeof(float));
        cudaMallocHost(&h_big_relationships, Num * Num * 3 * sizeof(float));
        cudaMallocHost(&h_location, Num * 2 * sizeof(float));
        cudaMallocHost(&h_N, sizeof(int));
        cudaMallocHost(&h_destinations, Num * 2 * sizeof(float));
        cudaMallocHost(&h_energies, Num * sizeof(float));
        cudaMallocHost(&h_frequencies, Num * sizeof(float));

        h_big_algo = big_algo.data();
        h_big_relationships = big_relationships.data();
        h_location = location_list.data();
        h_N = &Num;
        h_frequencies = frequencies.data();

        // Allocate device memory
        float* d_big_algo, * d_big_relationships, * d_location, *d_destinations,  *d_energies, *d_frequencies;
        int* d_N, * d_influence_N;
        cudaMalloc(&d_big_algo, Num * Num *8*sizeof(float));
        cudaMalloc(&d_big_relationships, Num * Num *3*sizeof(float));
        cudaMalloc(&d_location, Num *2*sizeof(float));
        cudaMalloc(&d_N, sizeof(int));
        cudaMalloc(&d_destinations, Num * 2 * sizeof(float));
        cudaMalloc(&d_influence_N, Num *sizeof(int));
        cudaMalloc(&d_energies, Num * sizeof(float));
        cudaMalloc(&d_frequencies, Num * sizeof(float));

        // Copy data to the device
        cudaMemcpy(d_big_algo, h_big_algo, Num * 8 * Num * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_big_relationships, h_big_relationships, Num * Num * 3 * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_location, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_N, h_N, sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_energies, energies.data(), Num * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_frequencies, h_frequencies, Num * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_destinations, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);

        cudaMemset(d_influence_N, 0, Num * sizeof(int));

        int NUM_THREADS = 1024;

        int NUM_BLOCKS = (pow(Num,2) + NUM_THREADS - 1) / NUM_THREADS;

        move <<<NUM_BLOCKS, NUM_THREADS>>> (d_big_algo, d_big_relationships, d_location, d_N, 
                                                d_destinations, d_influence_N, d_energies, d_frequencies);
         
        // Copy back to the host
        cudaMemcpy(h_destinations, d_destinations, Num * 2 * sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_energies, d_energies, Num * sizeof(float), cudaMemcpyDeviceToHost);

        // Free memory on device
        cudaFree(d_big_algo);
        cudaFree(d_big_relationships);
        cudaFree(d_location);
        cudaFree(d_N);
        cudaFree(d_destinations);
        cudaFree(d_influence_N);
        cudaFree(d_energies);
        cudaFree(d_frequencies);

        big_algo.clear();
        big_relationships.clear();

        location_list.clear();
        energies.clear();
        frequencies.clear();

        //read from h_locations and h_energies

        cudaFreeHost(h_big_algo);
        cudaFreeHost(h_big_relationships);
        cudaFreeHost(h_N);
        cudaFreeHost(h_frequencies);
        cudaFreeHost(h_location);
        apply_all(h_destinations, h_energies);
        cudaFreeHost(h_energies);
        cudaFreeHost(h_destinations);

        for (int i = 0; i < particleList.size(); i++) {
            particleList[i]->draw_self();

            /*if (particleList[i]->energy < 0) {
                cout << "particle died" << endl;
                particleList[i]->seppuku();
                //doomed_particles.push_back({ i, particleList[i] });
            }
            if (particleList[i]->energy > 10) {
                particleList[i]->reproduce();
                particleList[i]->energy -= reproduction_cost;
            }*/
            
        }
        

        window.display();
        }
        
    return 0;
}

and the kernel for good measure:

__global__ void move(float* d_big_algo, float* d_big_relationships, float* d_location, 
                int* N,  float* d_destinations, int * d_influence_N, float *d_energies, float*d_frequencies) {
    
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (id < (*N)*(*N)) {
        //printf("%i ", *N);

        int subject = (id-((id+*N)%*N))/ *N;
        int object = (id + *N) % *N;
        
        float distance = sqrt(powf((d_location[object*2] - d_location[subject*2]), 2.0f) 
                    + powf((d_location[object * 2 +1] - d_location[subject * 2+ 1]), 2.0f));

        float relative_maximum = d_big_relationships[(object * 3) + (subject * *N * 3) +2];
        
        if ((distance < relative_maximum)&&(distance > 0)) {
            float relative_minimum = d_big_relationships[(object * 3) + (subject * *N * 3)];
            float relative_medium = d_big_relationships[(object * 3) + (subject * *N * 3) + 1];

            /*if (distance < 12) {
                if (abs(d_frequencies[subject] - d_frequencies[object]) > 0.1) {
                    if (d_energies[subject] > d_energies[object]) {
                        d_energies[subject]+=0.1;
                        d_energies[object]-=0.1;
                    }
                    if (d_energies[subject] < d_energies[object]) {
                        d_energies[subject]-=0.1;
                        d_energies[object]+=0.1;
                    }
                }
                //else {
                //  d_energies[subject]+= (d_energies[subject] - d_energies[object])/100;
                //}
            }*/

            if ((distance < 8) && (distance > 0)) {
                //printf("%i moving\n", id);
                float force = 2.0f * (-distance / powf(distance, 2));
                d_influence_N[subject] += 1;
                //printf("%i ready to sync\n", id);
                //__syncthreads();
                //printf("%i INFLUENCE ", d_influence_N[subject]);
                d_destinations[subject*2] += force * (d_location[object * 2] - d_location[subject * 2]);
                d_destinations[subject * 2+1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]);
                //delete& force;

            }
            else if ((distance < relative_medium) && (distance > relative_minimum)) {
                //printf("%i moving\n", id);
                float force = d_big_algo[(object * 8) + (subject * *N * 8) + 4] * abs((d_big_algo[(object * 8) + (subject * *N * 8) + 5] * distance)
                                                - d_big_algo[(object * 8) + (subject * *N * 8) + 6]) + d_big_algo[(object * 8) + (subject * *N * 8) + 7];
                d_influence_N[subject] += 1;
                //printf("%i ready to sync\n", id);
                //__syncthreads();
                //printf("%i INFLUENCE ", d_influence_N[subject]);
                float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
                d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
                d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
                //delete& force;
                //delete& destination_mod;
            }

            else if (distance > relative_medium) {
                //printf("%i moving\n", id);
                float force = d_big_algo[(object * 8) + (subject * *N * 8)] * abs((d_big_algo[(object * 8) + (subject * *N * 8) +1] * distance)
                                                - d_big_algo[(object * 8) + (subject * *N * 8) + 2]) + d_big_algo[(object * 8) + (subject * *N * 8) + 3];
                d_influence_N[subject] += 1;
                //printf("%i ready to sync\n", id);
                //__syncthreads();
                //printf("%i INFLUENCE ", d_influence_N[subject]);
                float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
                d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
                d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
                //delete& force;
                //delete& destination_mod;
                
            }
            //delete& relative_minimum;
            //delete& relative_medium;
        }
        //delete& subject;
        //delete& object;
        //delete& distance;
        //delete& relative_maximum;
    }

    //__syncthreads();
    if (id < *N) {
        if (d_location[id * 2+1] < 2 || d_location[id * 2 + 1] > HEIGHT - 2) {
            d_destinations[id * 2+1] = HEIGHT / 2;
        }
        if (d_location[id * 2]<2 || d_location[id * 2] > WIDTH - 2) {
            d_destinations[id * 2] = WIDTH / 2;
        }

        if (d_location[id * 2] >= WIDTH - 10) {
            d_destinations[id * 2] = abs(d_location[id * 2]) - ((d_location[id * 2] - (WIDTH - 10)) / 2) / (WIDTH / abs(d_location[id * 2]));
        }
        if (d_location[id * 2] < 10) {
            d_destinations[id * 2] = abs(d_location[id * 2]) + ((d_location[id * 2] + 10) / 2) / (abs(d_location[id * 2]) + 0.1);
        }
        if (d_location[id * 2 + 1] >= HEIGHT - 10) {
            d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) - ((d_location[id * 2 + 1] - (HEIGHT - 10)) / 2) / (HEIGHT / abs(d_location[id * 2 + 1]));
        }
        if (d_location[id * 2 + 1] < 10) {
            d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) + ((d_location[id * 2 + 1] + 10) / 2) / (abs(d_location[id * 2 + 1]) + 0.1);
        }

        if (2.0f * (sqrt(powf(d_location[id * 2] - WIDTH / 2.0f, 2) + powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) > WIDTH) {
            if (d_location[id * 2 + 1] >= HEIGHT / 2.0f) {
                d_destinations[id * 2 + 1] = sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
            }
            if (d_location[id * 2 + 1] <= HEIGHT / 2.0f) {
                d_destinations[id * 2 + 1] = -sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
            }
            if (d_location[id * 2] <= WIDTH / 2.0f) {
                d_destinations[id * 2] = -sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;
            }
            if (d_location[id * 2] >= WIDTH / 2.0f) {
                d_destinations[id * 2] = sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;

                //send out
            }
        }
    }
    delete &id;
    
    
}

There is more to the code but these are the parts I believe to be causing the problem (perhaps worth noting I am also using SFML).

Solution

You call cudaMallocHost multiple times to allocate space and store it in local variables (h_big_algo, h_N, etc.) then immediately overwrite the returned pointers with other data (h_big_algo = big_algo.data();, h_N = &Num;, etc.).

This will leak the memory allocated by cudaMallocHost.

You later call cudaFreeHost which will free the memory returned by the function calls, not the memory allocated by cudaMallocHost. Worse, the call to cudaFreeHost(h_N); will pass a pointer to a local stack-based variable.

You should not call cudaMallocHost for those pointers you assign values to.