C++ 使用指数级系统ram的CUDA程序
我的粒子模拟占用了太多内存。我关心的不是数量本身,而是事实,当我有理由相信它不应该增长时,它会呈指数增长。我正在使用CUDA,这是我最近添加的组件,因此我怀疑是导致问题的原因。我已经确定这不仅仅是内核内部的问题,因为即使内核没有运行,ram的使用也会增加。我怀疑这与我分配内存的方式有关,但我不明白我哪里出错了。我为这样一个微不足道的问题道歉,我是CUDA的新手(如果这不是显而易见的话)。这是有问题的意大利面,谢谢你抽出时间C++ 使用指数级系统ram的CUDA程序,c++,cuda,sfml,C++,Cuda,Sfml,我的粒子模拟占用了太多内存。我关心的不是数量本身,而是事实,当我有理由相信它不应该增长时,它会呈指数增长。我正在使用CUDA,这是我最近添加的组件,因此我怀疑是导致问题的原因。我已经确定这不仅仅是内核内部的问题,因为即使内核没有运行,ram的使用也会增加。我怀疑这与我分配内存的方式有关,但我不明白我哪里出错了。我为这样一个微不足道的问题道歉,我是CUDA的新手(如果这不是显而易见的话)。这是有问题的意大利面,谢谢你抽出时间 int main() { std::srand(time(0)
int main() {
std::srand(time(0));
window.setFramerateLimit(limit);
window.setVerticalSyncEnabled(true);
sf::Clock clock;
while (window.isOpen()) {
sf::Event evnt;
while (window.pollEvent(evnt)) {
switch (evnt.type) {
case sf::Event::Closed:
window.close();
break;
case sf::Event::TextEntered:
if (evnt.text.unicode < 128) {
//printf("%c", evnt.text.unicode);
}
}
}
if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::Space)) {
spawnParticle();
}
if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::R)) {
for (auto particle : particleList) {
delete particle;
}
particleList.clear();
}
window.clear(sf::Color::Color::Black);
background.setFillColor(sf::Color::Color(25, 25, 25, 255));
background.setPosition(-8, -8);
window.draw(background);
for (int i = 0; i < particleList.size(); i++) {
particleList[i]->write(i);
}
int Num = particleList.size();
// Vectors for holding the host-side (CPU-side) data
float* h_big_algo, * h_big_relationships, * h_location, * h_destinations, * h_energies, * h_frequencies;
int* h_N;
cudaMallocHost(&h_big_algo, Num * Num * 8 * sizeof(float));
cudaMallocHost(&h_big_relationships, Num * Num * 3 * sizeof(float));
cudaMallocHost(&h_location, Num * 2 * sizeof(float));
cudaMallocHost(&h_N, sizeof(int));
cudaMallocHost(&h_destinations, Num * 2 * sizeof(float));
cudaMallocHost(&h_energies, Num * sizeof(float));
cudaMallocHost(&h_frequencies, Num * sizeof(float));
h_big_algo = big_algo.data();
h_big_relationships = big_relationships.data();
h_location = location_list.data();
h_N = &Num;
h_frequencies = frequencies.data();
// Allocate device memory
float* d_big_algo, * d_big_relationships, * d_location, *d_destinations, *d_energies, *d_frequencies;
int* d_N, * d_influence_N;
cudaMalloc(&d_big_algo, Num * Num *8*sizeof(float));
cudaMalloc(&d_big_relationships, Num * Num *3*sizeof(float));
cudaMalloc(&d_location, Num *2*sizeof(float));
cudaMalloc(&d_N, sizeof(int));
cudaMalloc(&d_destinations, Num * 2 * sizeof(float));
cudaMalloc(&d_influence_N, Num *sizeof(int));
cudaMalloc(&d_energies, Num * sizeof(float));
cudaMalloc(&d_frequencies, Num * sizeof(float));
// Copy data to the device
cudaMemcpy(d_big_algo, h_big_algo, Num * 8 * Num * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_big_relationships, h_big_relationships, Num * Num * 3 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_location, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_N, h_N, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_energies, energies.data(), Num * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_frequencies, h_frequencies, Num * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_destinations, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemset(d_influence_N, 0, Num * sizeof(int));
int NUM_THREADS = 1024;
int NUM_BLOCKS = (pow(Num,2) + NUM_THREADS - 1) / NUM_THREADS;
move <<<NUM_BLOCKS, NUM_THREADS>>> (d_big_algo, d_big_relationships, d_location, d_N,
d_destinations, d_influence_N, d_energies, d_frequencies);
// Copy back to the host
cudaMemcpy(h_destinations, d_destinations, Num * 2 * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_energies, d_energies, Num * sizeof(float), cudaMemcpyDeviceToHost);
// Free memory on device
cudaFree(d_big_algo);
cudaFree(d_big_relationships);
cudaFree(d_location);
cudaFree(d_N);
cudaFree(d_destinations);
cudaFree(d_influence_N);
cudaFree(d_energies);
cudaFree(d_frequencies);
big_algo.clear();
big_relationships.clear();
location_list.clear();
energies.clear();
frequencies.clear();
//read from h_locations and h_energies
cudaFreeHost(h_big_algo);
cudaFreeHost(h_big_relationships);
cudaFreeHost(h_N);
cudaFreeHost(h_frequencies);
cudaFreeHost(h_location);
apply_all(h_destinations, h_energies);
cudaFreeHost(h_energies);
cudaFreeHost(h_destinations);
for (int i = 0; i < particleList.size(); i++) {
particleList[i]->draw_self();
/*if (particleList[i]->energy < 0) {
cout << "particle died" << endl;
particleList[i]->seppuku();
//doomed_particles.push_back({ i, particleList[i] });
}
if (particleList[i]->energy > 10) {
particleList[i]->reproduce();
particleList[i]->energy -= reproduction_cost;
}*/
}
window.display();
}
return 0;
}
int main(){
标准:srand(时间(0));
设置帧率限制(限制);
window.setVerticalSyncEnabled(真);
sf:时钟;
while(window.isOpen()){
sf::事件evnt;
while(window.pollEvent(evnt)){
开关(evnt.type){
案例sf::事件::已结束:
window.close();
打破
案例sf::事件::文本输入:
if(evnt.text.unicode<128){
//printf(“%c”,evnt.text.unicode);
}
}
}
如果(sf::Keyboard::isKeyPressed(sf::Keyboard::Key::Space)){
粒子();
}
如果(sf::Keyboard::isKeyPressed(sf::Keyboard::Key::R)){
用于(自动粒子:particleList){
删除粒子;
}
particleList.clear();
}
窗口。清除(sf::颜色::颜色::黑色);
setFillColor(sf::Color::Color(25,25,25255));
背景设置位置(-8,-8);
窗口。绘制(背景);
对于(int i=0;i写(i);
}
int Num=particleList.size();
//用于保存主机端(CPU端)数据的向量
浮动*h_大算法,*h_大关系,*h_位置,*h_目的地,*h_能量,*h_频率;
int*hn;
cudamalochost(&h_big_algo,Num*Num*8*sizeof(float));
cudaMallocHost(&h_big_relationships,Num*Num*3*sizeof(float));
cudaMallocHost(&h_位置,Num*2*sizeof(float));
库达马洛霍斯特(h_N,sizeof(int));
cudaMallocHost(和h_目的地,数量*2*sizeof(浮动));
Cudamalochost(&h_能量,Num*sizeof(float));
cudaMallocHost(&h_频率,Num*sizeof(float));
h_big_algo=big_algo.data();
h_big_relationships=big_relationships.data();
h_location=location_list.data();
h_N=&Num;
h_频率=频率。数据();
//分配设备内存
浮动*d_大算法、*d_大关系、*d_位置、*d_目的地、*d_能量、*d_频率;
int*d_N,*d_影响;
cudaMalloc(&d_big_algo,Num*Num*8*sizeof(float));
cudamaloc(和d_big_关系,Num*Num*3*sizeof(float));
cudaMalloc(&d_位置,数量*2*sizeof(浮动));
库达马洛克(d_N,sizeof(int)),;
Cudamaloc(和d_目的地,数量*2*sizeof(浮动));
cudamaloc(&d_influence_N,Num*sizeof(int));
Cudamaloc(&d_能量,Num*sizeof(float));
cudaMalloc(&d_频率,Num*sizeof(float));
//将数据复制到设备
cudaMemcpy(d_big_algo,h_big_algo,Num*8*Num*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_big_关系、h_big_关系、Num*Num*3*sizeof(float)、cudaMemcpyHostToDevice);
cudaMemcpy(d_位置、h_位置、Num*2*sizeof(float)、cudaMemcpyHostToDevice);
cudaMemcpy(d_N,h_N,sizeof(int),cudamemcpyhostodevice);
cudaMemcpy(d_energies,energies.data(),Num*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_频率、h_频率、Num*sizeof(float)、cudaMemcpyHostToDevice);
cudaMemcpy(d_目的地、h_位置、Num*2*sizeof(float)、cudaMemcpyHostToDevice);
cudaMemset(d_influence_N,0,Num*sizeof(int));
int NUM_线程=1024;
int NUM_块=(pow(NUM,2)+NUM_线程-1)/NUM_线程;
移动(d_big_algo,d_big_关系,d_位置,d_N,
d_目的地、d_影响、d_能量、d_频率);
//复制回主机
cudaMemcpy(h_目的地、d_目的地、Num*2*sizeof(float)、cudamemcpydevicetoost);
cudaMemcpy(h_能量,d_能量,Num*sizeof(float),cudamemcpydevicetoost);
//设备上的可用内存
cudaFree(d_big_algo);
cudaFree(d_big_关系);
cudaFree(d_位置);
库达弗里(d_N);
cudaFree(迪乌目的地);
cudaFree(d_影响N);
cudaFree(d_能量);
cudaFree(d_频率);
大算法清除();
大关系。清除();
位置_list.clear();
能量;
频率。清除();
//从h_位置和h_能量读取
cudaFreeHost(h_big_algo);
cudaFreeHost(h_big_关系);
cudaFreeHost(h_N);
cudaFreeHost(h_频率);
cudaFreeHost(h_位置);
全部应用(目的地、能量);
cudaFreeHost(h_能量);
cudaFreeHost(h_目的地);
对于(int i=0;i绘制自身();
/*if(粒子列表[i]->能量<0){
库特能量>10){
分词列表[i]->repearch();
粒子列表[i]>能量-=复制成本;
}*/
}
window.display();
}
返回0;
}
以及内核的良好措施:
__global__ void move(float* d_big_algo, float* d_big_relationships, float* d_location,
int* N, float* d_destinations, int * d_influence_N, float *d_energies, float*d_frequencies) {
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
if (id < (*N)*(*N)) {
//printf("%i ", *N);
int subject = (id-((id+*N)%*N))/ *N;
int object = (id + *N) % *N;
float distance = sqrt(powf((d_location[object*2] - d_location[subject*2]), 2.0f)
+ powf((d_location[object * 2 +1] - d_location[subject * 2+ 1]), 2.0f));
float relative_maximum = d_big_relationships[(object * 3) + (subject * *N * 3) +2];
if ((distance < relative_maximum)&&(distance > 0)) {
float relative_minimum = d_big_relationships[(object * 3) + (subject * *N * 3)];
float relative_medium = d_big_relationships[(object * 3) + (subject * *N * 3) + 1];
/*if (distance < 12) {
if (abs(d_frequencies[subject] - d_frequencies[object]) > 0.1) {
if (d_energies[subject] > d_energies[object]) {
d_energies[subject]+=0.1;
d_energies[object]-=0.1;
}
if (d_energies[subject] < d_energies[object]) {
d_energies[subject]-=0.1;
d_energies[object]+=0.1;
}
}
//else {
// d_energies[subject]+= (d_energies[subject] - d_energies[object])/100;
//}
}*/
if ((distance < 8) && (distance > 0)) {
//printf("%i moving\n", id);
float force = 2.0f * (-distance / powf(distance, 2));
d_influence_N[subject] += 1;
//printf("%i ready to sync\n", id);
//__syncthreads();
//printf("%i INFLUENCE ", d_influence_N[subject]);
d_destinations[subject*2] += force * (d_location[object * 2] - d_location[subject * 2]);
d_destinations[subject * 2+1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]);
//delete& force;
}
else if ((distance < relative_medium) && (distance > relative_minimum)) {
//printf("%i moving\n", id);
float force = d_big_algo[(object * 8) + (subject * *N * 8) + 4] * abs((d_big_algo[(object * 8) + (subject * *N * 8) + 5] * distance)
- d_big_algo[(object * 8) + (subject * *N * 8) + 6]) + d_big_algo[(object * 8) + (subject * *N * 8) + 7];
d_influence_N[subject] += 1;
//printf("%i ready to sync\n", id);
//__syncthreads();
//printf("%i INFLUENCE ", d_influence_N[subject]);
float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
//delete& force;
//delete& destination_mod;
}
else if (distance > relative_medium) {
//printf("%i moving\n", id);
float force = d_big_algo[(object * 8) + (subject * *N * 8)] * abs((d_big_algo[(object * 8) + (subject * *N * 8) +1] * distance)
- d_big_algo[(object * 8) + (subject * *N * 8) + 2]) + d_big_algo[(object * 8) + (subject * *N * 8) + 3];
d_influence_N[subject] += 1;
//printf("%i ready to sync\n", id);
//__syncthreads();
//printf("%i INFLUENCE ", d_influence_N[subject]);
float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
//delete& force;
//delete& destination_mod;
}
//delete& relative_minimum;
//delete& relative_medium;
}
//delete& subject;
//delete& object;
//delete& distance;
//delete& relative_maximum;
}
//__syncthreads();
if (id < *N) {
if (d_location[id * 2+1] < 2 || d_location[id * 2 + 1] > HEIGHT - 2) {
d_destinations[id * 2+1] = HEIGHT / 2;
}
if (d_location[id * 2]<2 || d_location[id * 2] > WIDTH - 2) {
d_destinations[id * 2] = WIDTH / 2;
}
if (d_location[id * 2] >= WIDTH - 10) {
d_destinations[id * 2] = abs(d_location[id * 2]) - ((d_location[id * 2] - (WIDTH - 10)) / 2) / (WIDTH / abs(d_location[id * 2]));
}
if (d_location[id * 2] < 10) {
d_destinations[id * 2] = abs(d_location[id * 2]) + ((d_location[id * 2] + 10) / 2) / (abs(d_location[id * 2]) + 0.1);
}
if (d_location[id * 2 + 1] >= HEIGHT - 10) {
d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) - ((d_location[id * 2 + 1] - (HEIGHT - 10)) / 2) / (HEIGHT / abs(d_location[id * 2 + 1]));
}
if (d_location[id * 2 + 1] < 10) {
d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) + ((d_location[id * 2 + 1] + 10) / 2) / (abs(d_location[id * 2 + 1]) + 0.1);
}
if (2.0f * (sqrt(powf(d_location[id * 2] - WIDTH / 2.0f, 2) + powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) > WIDTH) {
if (d_location[id * 2 + 1] >= HEIGHT / 2.0f) {
d_destinations[id * 2 + 1] = sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
}
if (d_location[id * 2 + 1] <= HEIGHT / 2.0f) {
d_destinations[id * 2 + 1] = -sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
}
if (d_location[id * 2] <= WIDTH / 2.0f) {
d_destinations[id * 2] = -sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;
}
if (d_location[id * 2] >= WIDTH / 2.0f) {
d_destinations[id * 2] = sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;
//send out
}
}
}
delete &id;
}
\uuuu全局\uuuu无效移动(浮点*d\u大\u算法,浮点*d\u大\u关系,浮点*d\u低)