C++ 如何处理修改原子值
标签:c++, multithreading, visual-studio, gpgpu, c++-amp
我想写一段简单的代码,根据输入的数据向量进行一些计算,并且只返回一个值,但我不知道该怎么做。我写了一个简单的测试来检查它是如何工作的,结果得到了编译错误。代码如下:
// Test routine from the question: launches 121 restrict(amp) invocations that
// each add 0.2f to a single host-side float, then returns that float.
// NOTE(review): this is the snippet the compiler errors quoted below refer to;
// it is kept exactly as posted.
Float Subset::parallel_tests()
{
// Host-side local; the lambda below captures it by reference.
float sum = 0.0f;
// NOTE(review): `&sum` is a by-reference capture in a restrict(amp) lambda,
// which is what error C3590 (quoted after this snippet) rejects.
concurrency::parallel_for_each(concurrency::extent<1>(121), [=, &sum] (concurrency::index<1> idx) restrict(amp)
{
// NOTE(review): every invocation updates the same variable and no
// synchronization is visible here -- looks like a data race even if the
// capture were legal; confirm before reusing this pattern.
sum += 0.2f;
});
return sum;
}
Float子集::并行_测试()
{
浮动总和=0.0f;
并发性::并行性(concurrency::extent(121),[=,&sum](concurrency::index idx)限制(amp)
{
总和+=0.2f;
});
回报金额;
}
当我试图编译此代码时,会出现以下错误:
错误C3590:“sum”:如果lambda受amp限制,则不支持按引用捕获或“this”捕获
错误C3581:'cci::Subset::parallel_tests::':amp受限代码中不支持的类型

代码无法编译的原因是 sum 在类中声明,而没有包装在 array_view 中。实际上,您是在尝试从受 AMP 限制的代码中访问 this->sum。您需要使用下面的代码把 sum 包装起来,然后把它传给 parallel_for_each,并且该 parallel_for_each 应使用 avSum:
// Host-side variable that backs the view declared on the next line.
int sum = 0;
// One-dimensional array_view of extent 1 over `sum`. Per the surrounding
// answer, the parallel_for_each should use avSum instead of touching `sum`
// directly.
array_view<int, 1> avSum(1, &sum);
零元素现在包含总数
/// Sums a vector of ints with C++ AMP by repeatedly folding the upper half of
/// the working array onto the lower half until one element remains.
class SimpleReduction
{
public:
/// @param view        accelerator_view the data is copied to and the kernels run on.
/// @param source      host values to sum.
/// @param computeTime not written by this implementation; the caller's value is
///                    left untouched.
/// @return the sum of all elements of `source` (0 for an empty vector).
int Reduce(accelerator_view& view, const std::vector<int>& source,
double& computeTime) const
{
// The count is stored in an int, so the bound has to be INT_MAX.
assert(source.size() <= INT_MAX);
const int elementCount = static_cast<int>(source.size());
// Nothing to upload for an empty input; the sum of no elements is 0.
if (elementCount == 0)
return 0;
// A single element would otherwise be counted twice: once as a[0] and once
// as the odd-length tail below.
if (elementCount == 1)
return source[0];
// Copy data
array<int, 1> a(elementCount, source.cbegin(), source.cend(), view);
std::vector<int> result(1);
// With an odd element count the last element is never reached by the
// halving loop, so it seeds the tail accumulator.
int tailResult = (elementCount % 2) ? source[elementCount - 1] : 0;
array_view<int, 1> tailResultView(1, &tailResult);
for (int stride = (elementCount / 2); stride > 0; stride /= 2)
{
parallel_for_each(view, extent<1>(stride), [=, &a] (index<1> idx)
restrict(amp)
{
a[idx] += a[idx + stride];
// If this pass leaves an odd number of partial sums, the next pass
// will not reach a[stride - 1]. The thread that has just finished
// writing that element folds it into the tail; doing this from any
// other thread would race with that write.
if ((idx[0] == stride - 1) && (stride & 0x1) && (stride != 1))
tailResultView[0] += a[stride - 1];
});
}
// Only copy out the first element in the array as this
// contains the final answer.
copy(a.section(0, 1), result.begin());
tailResultView.synchronize();
return result[0] + tailResult;
}
};
class SimpleReduce
{
公众:
int Reduce(加速器视图和视图,常量标准::向量和源,
双精度(计算时间)常数
{
断言(source.size()0;步长/=2)
{
每个(视图、范围(步幅)、[=、&a](索引idx)的平行度
限制(安培)
{
a[idx]+=a[idx+步幅];
//如果有奇数个元素,则
//第一个线程添加最后一个元素。
如果((idx[0]==0)&&&(步幅&0x1)&&&(步幅!=1))
tailResultView[idx]+=a[stride-1];
});
}
//仅复制数组中的第一个元素,如下所示
//包含最终答案。
复制(a.section(0,1),result.begin());
tailResultView.synchronize();
返回结果[0]+tailResult;
}
};
也可以使用平铺(tiling):平铺中的每个线程负责为其元素生成结果,然后将所有平铺的结果相加。
/// Sums a vector of ints with C++ AMP using tiles of TileSize threads: each
/// pass collapses every tile to one partial sum, and the last few partial
/// sums are added on the CPU.
template <int TileSize>
class TiledReduction
{
// With TileSize == 1 a pass never shrinks the data and the loop below would
// not terminate.
static_assert(TileSize > 1, "TileSize must be greater than 1");
public:
/// @param view        accelerator_view the data is copied to and the kernels run on.
/// @param source      host values to sum.
/// @param computeTime receives the value returned by TimeFunc for the
///                    reduction (0 for an empty input, where no work is done).
/// @return the sum of all elements of `source` (0 for an empty vector).
int Reduce(accelerator_view& view, const std::vector<int>& source,
double& computeTime) const
{
int elementCount = static_cast<int>(source.size());
// Nothing to upload or reduce for an empty input.
if (elementCount == 0)
{
computeTime = 0.0;
return 0;
}
// Copy data
array<int, 1> arr(elementCount, source.cbegin(), source.cend(), view);
int result = 0;
// Sum of the elements that do not fill a whole tile in some pass.
int tailSum = 0;
computeTime = TimeFunc(view, [&]()
{
while (elementCount >= TileSize)
{
// A tiled extent must be a multiple of the tile size. Fold the
// elements past the last full tile on the CPU instead of losing
// them in the integer division below.
const int leftover = elementCount % TileSize;
if (leftover != 0)
{
std::vector<int> tail(leftover);
copy(arr.section(elementCount - leftover, leftover), tail.begin());
tailSum += std::accumulate(tail.cbegin(), tail.cend(), 0);
elementCount -= leftover;
}
extent<1> e(elementCount);
// Allocate the scratch array on the same accelerator_view the kernel
// runs on.
array<int, 1> tmpArr(elementCount / TileSize, view);
parallel_for_each(view, e.tile<TileSize>(),
[=, &arr, &tmpArr] (tiled_index<TileSize> tidx) restrict(amp)
{
// For each tile do the reduction on the first thread of the tile.
// This isn't expected to be very efficient as all the other
// threads in the tile are idle.
if (tidx.local[0] == 0)
{
int tid = tidx.global[0];
int tempResult = arr[tid];
for (int i = 1; i < TileSize; ++i)
tempResult += arr[tid + i];
// Take the result from each tile and create a new array.
// This will be used in the next iteration. Use temporary
// array to avoid race condition.
tmpArr[tidx.tile[0]] = tempResult;
}
});
elementCount /= TileSize;
std::swap(tmpArr, arr);
}
// Copy the final results from each tile to the CPU and accumulate them
std::vector<int> partialResult(elementCount);
copy(arr.section(0, elementCount), partialResult.begin());
result = std::accumulate(partialResult.cbegin(), partialResult.cend(), 0) + tailSum;
});
return result;
}
};
模板
课堂教学
{
公众:
int Reduce(加速器视图和视图,常量标准::向量和源,
双精度(计算时间)常数
{
int elementCount=static_cast(source.size());
//复制数据
数组arr(elementCount,source.cbegin(),source.cend(),view);
int结果;
computeTime=TimeFunc(视图,[&]()
{
while(elementCount>=TileSize)
{
范围e(元素计数);
数组tmpArr(elementCount/TileSize);
每个视图的平行视图(视图,例如平铺(),
[=,&arr,&tmpArr](平铺索引tidx)限制(amp)
{
//对于每个瓷砖,在瓷砖的第一个螺纹上进行减少。
//这并不像所有其他的方法那样高效
//磁贴中的线程处于空闲状态。
if(tidx.local[0]==0)
{
int tid=tidx.global[0];
int tempResult=arr[tid];
对于(int i=1;i
这仍然不是最有效的解决方案,因为它没有良好的内存访问模式。你可以在该书的 CodePlex 网站上看到对此的进一步改进。

好的,我开始实现归约了。我从简单归约入手,但遇到了一个问题:我不想把 std::vector 传给函数,而是想传入一到两个 concurrency::array。我需要从 source 中读取数据,并行地把所有内容求和,然后返回结果值。我应该如何实现?朴素版本的代码大致如下:
// Question's draft of a pairwise-halving sum over a concurrency::array<float, 1>.
// NOTE(review): kept exactly as posted. `accumulator` is used below but is not
// declared anywhere in this block, and the lambda captures `a`, whose only
// declaration is the commented-out line further down.
float Subset::reduction_simple_1(const concurrency::array<float, 1>& source)
{
assert(source.size() <= UINT_MAX);
// The size-derived count is commented out; the count is hard-coded instead.
//unsigned element_count = static_cast<unsigned>(source.size());
unsigned element_count = 121;
assert(element_count != 0); // Cannot reduce an empty sequence.
// A single element is its own sum.
if (element_count == 1)
{
return source[0];
}
// Using array, as we mostly need just temporary memory to store
// the algorithm state between iterations and in the end we have to copy
// back only the first element.
//concurrency::array<float, 1> a(element_count, source.begin());
// Takes care of odd input elements – we could completely avoid tail sum
// if we would require source to have even number of elements.
float tail_sum = (element_count % 2) ? source[element_count - 1] : 0;
// One-element view over tail_sum so the kernel below can add to it.
concurrency::array_view<float, 1> av_tail_sum(1, &tail_sum);
// Each thread reduces two elements.
for (unsigned s = element_count / 2; s > 0; s /= 2)
{
concurrency::parallel_for_each(concurrency::extent<1>(s), [=, &a] (concurrency::index<1> idx) restrict(amp)
{
//get information from source, do some computations and store it in accumulator
accumulator[idx] = accumulator[idx] + accumulator[idx + s];
// Reduce the tail in cases where the number of elements is odd.
// The check picks thread s - 1, i.e. the one that just wrote
// accumulator[s - 1] on the line above.
if ((idx[0] == s - 1) && (s & 0x1) && (s != 1))
{
av_tail_sum[0] += accumulator[s - 1];
}
});
}
// Copy the results back to CPU.
std::vector<float> result(1);
copy(accumulator.section(0, 1), result.begin());
// Bring the kernel's updates to the view back into tail_sum before reading it.
av_tail_sum.synchronize();
return result[0] + tail_sum;
}
float子集::reduce\u simple\u 1(常量并发::数组和源代码)
{
断言(source.size()0;s/=2)
{
并发性::每个(并发性::区段,[=,&a](并发性::索引idx)限制(amp)
{
//从源代码中获取信息,进行一些计算并将其存储在累加器中
累加器[idx]=累加器[idx]+累加器[idx+s];
//在元素数为奇数的情况下减少尾部。
如果((idx[0]==s-1)&&(s&0x1)&&(s!=1))
{
av_tail_sum[0]+=累加器[s-1];
}
});
}
//将结果复制回CPU。
std::向量结果(1);
复制(累加器.section(0,1),result.begin());
av_tail_sum.synchronize();
返回结果[0]+尾和;
}
我需要以某种方式实现“累加器”(accumulator),但我不知道该如何实现。
//该方法应计算两个图像(已复制到GPU内存)的相关值
//The method should compute a correlation value of two images (which had already been copied to GPU memory)
float Subset::compute_correlation(const concurrency::array<float, 1>& source1, const concurrency::array<float, 1>& source2)
{
float result;
float parameter_1;
float parameter_2;
.
.
.
float parameter_n;
parallel_for_each(...)
{
//here do some computations using source1 and source2
parameter_1 = source1[idx] o source2[idx];
.
.
.
//I am computing every parameter in different way
parameter_n = source1[idx] o source2[idx];
}
//compute the result based on the parameters
result = parameter_1 o parameter_2 o ... o parameter_n;
return result;
}
浮点子集::计算相关性(常量并发::数组和源1,常量并发::数组和源2)
{
浮动结果;
浮点参数_1;
浮点参数_2;
.
.
.
浮动参数;
//The method should compute a correlation value of two images (which had already been copied to GPU memory)
float Subset::compute_correlation(const concurrency::array<float, 1>& source1, const concurrency::array<float, 1>& source2)
{
float result;
float parameter_1;
float parameter_2;
.
.
.
float parameter_n;
parallel_for_each(...)
{
//here do some computations using source1 and source2
parameter_1 = source1[idx] o source2[idx];
.
.
.
//I am computing every parameter in different way
parameter_n = source1[idx] o source2[idx];
}
//compute the result based on the parameters
result = parameter_1 o parameter_2 o ... o parameter_n;
return result;
}