C++ 使用指数级系统ram的CUDA程序_C++_Cuda_Sfml

C++ 使用指数级系统ram的CUDA程序

c++ cuda

C++ 使用指数级系统ram的CUDA程序,c++,cuda,sfml,C++,Cuda,Sfml,我的粒子模拟占用了太多内存。我关心的不是数量本身，而是事实，当我有理由相信它不应该增长时，它会呈指数增长。我正在使用CUDA，这是我最近添加的组件，因此我怀疑是导致问题的原因。我已经确定这不仅仅是内核内部的问题，因为即使内核没有运行，ram的使用也会增加。我怀疑这与我分配内存的方式有关，但我不明白我哪里出错了。我为这样一个微不足道的问题道歉，我是CUDA的新手（如果这不是显而易见的话）。这是有问题的意大利面，谢谢你抽出时间 int main() { std::srand(time(0)

我的粒子模拟占用了太多内存。我关心的不是数量本身，而是事实，当我有理由相信它不应该增长时，它会呈指数增长。我正在使用CUDA，这是我最近添加的组件，因此我怀疑是导致问题的原因。我已经确定这不仅仅是内核内部的问题，因为即使内核没有运行，ram的使用也会增加。我怀疑这与我分配内存的方式有关，但我不明白我哪里出错了。我为这样一个微不足道的问题道歉，我是CUDA的新手（如果这不是显而易见的话）。这是有问题的意大利面，谢谢你抽出时间


int main() {
    std::srand(time(0));
    window.setFramerateLimit(limit);
    window.setVerticalSyncEnabled(true);
    sf::Clock clock;
    
    while (window.isOpen()) {
        sf::Event evnt;
        while (window.pollEvent(evnt)) {
            switch (evnt.type) {
            case sf::Event::Closed:
                window.close();
                break;
            case sf::Event::TextEntered:
                if (evnt.text.unicode < 128) {
                    //printf("%c", evnt.text.unicode);
                }
            }
            
        }


        if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::Space)) {
            spawnParticle();
            
        }
        if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::R)) {
            for (auto particle : particleList) {
                delete particle;
            }
            particleList.clear();
        }

        window.clear(sf::Color::Color::Black);
        background.setFillColor(sf::Color::Color(25, 25, 25, 255));
        background.setPosition(-8, -8);
        window.draw(background);

        for (int i = 0; i < particleList.size(); i++) {
            particleList[i]->write(i);

        }


        int Num = particleList.size();

        // Vectors for holding the host-side (CPU-side) data
        float* h_big_algo, * h_big_relationships, * h_location,  * h_destinations, * h_energies, * h_frequencies;
        int* h_N;
        cudaMallocHost(&h_big_algo, Num * Num * 8 * sizeof(float));
        cudaMallocHost(&h_big_relationships, Num * Num * 3 * sizeof(float));
        cudaMallocHost(&h_location, Num * 2 * sizeof(float));
        cudaMallocHost(&h_N, sizeof(int));
        cudaMallocHost(&h_destinations, Num * 2 * sizeof(float));
        cudaMallocHost(&h_energies, Num * sizeof(float));
        cudaMallocHost(&h_frequencies, Num * sizeof(float));

        h_big_algo = big_algo.data();
        h_big_relationships = big_relationships.data();
        h_location = location_list.data();
        h_N = &Num;
        h_frequencies = frequencies.data();

        // Allocate device memory
        float* d_big_algo, * d_big_relationships, * d_location, *d_destinations,  *d_energies, *d_frequencies;
        int* d_N, * d_influence_N;
        cudaMalloc(&d_big_algo, Num * Num *8*sizeof(float));
        cudaMalloc(&d_big_relationships, Num * Num *3*sizeof(float));
        cudaMalloc(&d_location, Num *2*sizeof(float));
        cudaMalloc(&d_N, sizeof(int));
        cudaMalloc(&d_destinations, Num * 2 * sizeof(float));
        cudaMalloc(&d_influence_N, Num *sizeof(int));
        cudaMalloc(&d_energies, Num * sizeof(float));
        cudaMalloc(&d_frequencies, Num * sizeof(float));

        // Copy data to the device
        cudaMemcpy(d_big_algo, h_big_algo, Num * 8 * Num * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_big_relationships, h_big_relationships, Num * Num * 3 * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_location, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_N, h_N, sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_energies, energies.data(), Num * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_frequencies, h_frequencies, Num * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_destinations, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);

        cudaMemset(d_influence_N, 0, Num * sizeof(int));

        int NUM_THREADS = 1024;

        int NUM_BLOCKS = (pow(Num,2) + NUM_THREADS - 1) / NUM_THREADS;

        move <<<NUM_BLOCKS, NUM_THREADS>>> (d_big_algo, d_big_relationships, d_location, d_N, 
                                                d_destinations, d_influence_N, d_energies, d_frequencies);
         
        // Copy back to the host
        cudaMemcpy(h_destinations, d_destinations, Num * 2 * sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_energies, d_energies, Num * sizeof(float), cudaMemcpyDeviceToHost);

        // Free memory on device
        cudaFree(d_big_algo);
        cudaFree(d_big_relationships);
        cudaFree(d_location);
        cudaFree(d_N);
        cudaFree(d_destinations);
        cudaFree(d_influence_N);
        cudaFree(d_energies);
        cudaFree(d_frequencies);

        big_algo.clear();
        big_relationships.clear();

        location_list.clear();
        energies.clear();
        frequencies.clear();

        //read from h_locations and h_energies

        cudaFreeHost(h_big_algo);
        cudaFreeHost(h_big_relationships);
        cudaFreeHost(h_N);
        cudaFreeHost(h_frequencies);
        cudaFreeHost(h_location);
        apply_all(h_destinations, h_energies);
        cudaFreeHost(h_energies);
        cudaFreeHost(h_destinations);

        for (int i = 0; i < particleList.size(); i++) {
            particleList[i]->draw_self();

            /*if (particleList[i]->energy < 0) {
                cout << "particle died" << endl;
                particleList[i]->seppuku();
                //doomed_particles.push_back({ i, particleList[i] });
            }
            if (particleList[i]->energy > 10) {
                particleList[i]->reproduce();
                particleList[i]->energy -= reproduction_cost;
            }*/
            
        }
        

        window.display();
        }
        
    return 0;
}


int main（）{
标准：srand（时间（0））；
设置帧率限制（限制）；
window.setVerticalSyncEnabled（真）；
sf：时钟；
while（window.isOpen（））{
sf：：事件evnt；
while（window.pollEvent（evnt））{
开关（evnt.type）{
案例sf:：事件：：已结束：
window.close（）；
打破
案例sf:：事件：：文本输入：
if（evnt.text.unicode<128）{
//printf（“%c”，evnt.text.unicode）；
}
}
}
如果（sf:：Keyboard:：isKeyPressed（sf:：Keyboard:：Key:：Space））{
粒子（）；
}
如果（sf:：Keyboard:：isKeyPressed（sf:：Keyboard:：Key:：R））{
用于（自动粒子：particleList）{
删除粒子；
}
particleList.clear（）；
}
窗口。清除（sf:：颜色：：颜色：：黑色）；
setFillColor（sf:：Color:：Color（25,25,25255））；
背景设置位置（-8，-8）；
窗口。绘制（背景）；
对于（int i=0；i写（i）；
}
int Num=particleList.size（）；
//用于保存主机端（CPU端）数据的向量
浮动*h_大算法，*h_大关系，*h_位置，*h_目的地，*h_能量，*h_频率；
int*hn；
cudamalochost（&h_big_algo，Num*Num*8*sizeof（float））；
cudaMallocHost（&h_big_relationships，Num*Num*3*sizeof（float））；
cudaMallocHost（&h_位置，Num*2*sizeof（float））；
库达马洛霍斯特（h_N，sizeof（int））；
cudaMallocHost（和h_目的地，数量*2*sizeof（浮动））；
Cudamalochost（&h_能量，Num*sizeof（float））；
cudaMallocHost（&h_频率，Num*sizeof（float））；
h_big_algo=big_algo.data（）；
h_big_relationships=big_relationships.data（）；
h_location=location_list.data（）；
h_N=&Num；
h_频率=频率。数据（）；
//分配设备内存
浮动*d_大算法、*d_大关系、*d_位置、*d_目的地、*d_能量、*d_频率；
int*d_N，*d_影响；
cudaMalloc（&d_big_algo，Num*Num*8*sizeof（float））；
cudamaloc（和d_big_关系，Num*Num*3*sizeof（float））；
cudaMalloc（&d_位置，数量*2*sizeof（浮动））；
库达马洛克(d_N,sizeof(int)),；
Cudamaloc（和d_目的地，数量*2*sizeof（浮动））；
cudamaloc（&d_influence_N，Num*sizeof（int））；
Cudamaloc（&d_能量，Num*sizeof（float））；
cudaMalloc（&d_频率，Num*sizeof（float））；
//将数据复制到设备
cudaMemcpy（d_big_algo，h_big_algo，Num*8*Num*sizeof（float），cudaMemcpyHostToDevice）；
cudaMemcpy（d_big_关系、h_big_关系、Num*Num*3*sizeof（float）、cudaMemcpyHostToDevice）；
cudaMemcpy（d_位置、h_位置、Num*2*sizeof（float）、cudaMemcpyHostToDevice）；
cudaMemcpy（d_N，h_N，sizeof（int），cudamemcpyhostodevice）；
cudaMemcpy（d_energies，energies.data（），Num*sizeof（float），cudaMemcpyHostToDevice）；
cudaMemcpy（d_频率、h_频率、Num*sizeof（float）、cudaMemcpyHostToDevice）；
cudaMemcpy（d_目的地、h_位置、Num*2*sizeof（float）、cudaMemcpyHostToDevice）；
cudaMemset（d_influence_N，0，Num*sizeof（int））；
int NUM_线程=1024；
int NUM_块=（pow（NUM，2）+NUM_线程-1）/NUM_线程；
移动（d_big_algo，d_big_关系，d_位置，d_N，
d_目的地、d_影响、d_能量、d_频率）；
//复制回主机
cudaMemcpy（h_目的地、d_目的地、Num*2*sizeof（float）、cudamemcpydevicetoost）；
cudaMemcpy（h_能量，d_能量，Num*sizeof（float），cudamemcpydevicetoost）；
//设备上的可用内存
cudaFree（d_big_algo）；
cudaFree（d_big_关系）；
cudaFree（d_位置）；
库达弗里（d_N）；
cudaFree（迪乌目的地）；
cudaFree（d_影响N）；
cudaFree（d_能量）；
cudaFree（d_频率）；
大算法清除（）；
大关系。清除（）；
位置_list.clear（）；
能量；
频率。清除（）；
//从h_位置和h_能量读取
cudaFreeHost（h_big_algo）；
cudaFreeHost（h_big_关系）；
cudaFreeHost（h_N）；
cudaFreeHost（h_频率）；
cudaFreeHost（h_位置）；
全部应用（目的地、能量）；
cudaFreeHost（h_能量）；
cudaFreeHost（h_目的地）；
对于（int i=0；i绘制自身（）；
/*if（粒子列表[i]->能量<0）{
库特能量>10）{
分词列表[i]->repearch（）；
粒子列表[i]>能量-=复制成本；
}*/
}
window.display（）；
}
返回0；
}

以及内核的良好措施：

__global__ void move(float* d_big_algo, float* d_big_relationships, float* d_location, 
                int* N,  float* d_destinations, int * d_influence_N, float *d_energies, float*d_frequencies) {
    
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (id < (*N)*(*N)) {
        //printf("%i ", *N);

        int subject = (id-((id+*N)%*N))/ *N;
        int object = (id + *N) % *N;
        
        float distance = sqrt(powf((d_location[object*2] - d_location[subject*2]), 2.0f) 
                    + powf((d_location[object * 2 +1] - d_location[subject * 2+ 1]), 2.0f));

        float relative_maximum = d_big_relationships[(object * 3) + (subject * *N * 3) +2];
        
        if ((distance < relative_maximum)&&(distance > 0)) {
            float relative_minimum = d_big_relationships[(object * 3) + (subject * *N * 3)];
            float relative_medium = d_big_relationships[(object * 3) + (subject * *N * 3) + 1];

            /*if (distance < 12) {
                if (abs(d_frequencies[subject] - d_frequencies[object]) > 0.1) {
                    if (d_energies[subject] > d_energies[object]) {
                        d_energies[subject]+=0.1;
                        d_energies[object]-=0.1;
                    }
                    if (d_energies[subject] < d_energies[object]) {
                        d_energies[subject]-=0.1;
                        d_energies[object]+=0.1;
                    }
                }
                //else {
                //  d_energies[subject]+= (d_energies[subject] - d_energies[object])/100;
                //}
            }*/

            if ((distance < 8) && (distance > 0)) {
                //printf("%i moving\n", id);
                float force = 2.0f * (-distance / powf(distance, 2));
                d_influence_N[subject] += 1;
                //printf("%i ready to sync\n", id);
                //__syncthreads();
                //printf("%i INFLUENCE ", d_influence_N[subject]);
                d_destinations[subject*2] += force * (d_location[object * 2] - d_location[subject * 2]);
                d_destinations[subject * 2+1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]);
                //delete& force;

            }
            else if ((distance < relative_medium) && (distance > relative_minimum)) {
                //printf("%i moving\n", id);
                float force = d_big_algo[(object * 8) + (subject * *N * 8) + 4] * abs((d_big_algo[(object * 8) + (subject * *N * 8) + 5] * distance)
                                                - d_big_algo[(object * 8) + (subject * *N * 8) + 6]) + d_big_algo[(object * 8) + (subject * *N * 8) + 7];
                d_influence_N[subject] += 1;
                //printf("%i ready to sync\n", id);
                //__syncthreads();
                //printf("%i INFLUENCE ", d_influence_N[subject]);
                float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
                d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
                d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
                //delete& force;
                //delete& destination_mod;
            }

            else if (distance > relative_medium) {
                //printf("%i moving\n", id);
                float force = d_big_algo[(object * 8) + (subject * *N * 8)] * abs((d_big_algo[(object * 8) + (subject * *N * 8) +1] * distance)
                                                - d_big_algo[(object * 8) + (subject * *N * 8) + 2]) + d_big_algo[(object * 8) + (subject * *N * 8) + 3];
                d_influence_N[subject] += 1;
                //printf("%i ready to sync\n", id);
                //__syncthreads();
                //printf("%i INFLUENCE ", d_influence_N[subject]);
                float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
                d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
                d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
                //delete& force;
                //delete& destination_mod;
                
            }
            //delete& relative_minimum;
            //delete& relative_medium;
        }
        //delete& subject;
        //delete& object;
        //delete& distance;
        //delete& relative_maximum;
    }

    //__syncthreads();
    if (id < *N) {
        if (d_location[id * 2+1] < 2 || d_location[id * 2 + 1] > HEIGHT - 2) {
            d_destinations[id * 2+1] = HEIGHT / 2;
        }
        if (d_location[id * 2]<2 || d_location[id * 2] > WIDTH - 2) {
            d_destinations[id * 2] = WIDTH / 2;
        }

        if (d_location[id * 2] >= WIDTH - 10) {
            d_destinations[id * 2] = abs(d_location[id * 2]) - ((d_location[id * 2] - (WIDTH - 10)) / 2) / (WIDTH / abs(d_location[id * 2]));
        }
        if (d_location[id * 2] < 10) {
            d_destinations[id * 2] = abs(d_location[id * 2]) + ((d_location[id * 2] + 10) / 2) / (abs(d_location[id * 2]) + 0.1);
        }
        if (d_location[id * 2 + 1] >= HEIGHT - 10) {
            d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) - ((d_location[id * 2 + 1] - (HEIGHT - 10)) / 2) / (HEIGHT / abs(d_location[id * 2 + 1]));
        }
        if (d_location[id * 2 + 1] < 10) {
            d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) + ((d_location[id * 2 + 1] + 10) / 2) / (abs(d_location[id * 2 + 1]) + 0.1);
        }

        if (2.0f * (sqrt(powf(d_location[id * 2] - WIDTH / 2.0f, 2) + powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) > WIDTH) {
            if (d_location[id * 2 + 1] >= HEIGHT / 2.0f) {
                d_destinations[id * 2 + 1] = sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
            }
            if (d_location[id * 2 + 1] <= HEIGHT / 2.0f) {
                d_destinations[id * 2 + 1] = -sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
            }
            if (d_location[id * 2] <= WIDTH / 2.0f) {
                d_destinations[id * 2] = -sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;
            }
            if (d_location[id * 2] >= WIDTH / 2.0f) {
                d_destinations[id * 2] = sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;

                //send out
            }
        }
    }
    delete &id;
    
    
}

\uuuu全局\uuuu无效移动（浮点*d\u大\u算法，浮点*d\u大\u关系，浮点*d\u低）