How to handle modifying an atomic value in C++


I want to write a simple piece of code that performs some computations on an input vector of data. It should return just one value. I don't know how to do this, so I wrote a simple test to check how it works, and I got a compile error. The code is below:

float Subset::parallel_tests()
{
    float sum = 0.0f;

    concurrency::parallel_for_each(concurrency::extent<1>(121), [=, &sum] (concurrency::index<1> idx) restrict(amp)
    {
        sum += 0.2f;
    });

    return sum;
}
When I try to compile this code, I get the following errors:

error C3590: 'sum': by-reference capture or 'this' capture is unsupported if the lambda is amp restricted
error C3581: 'cci::Subset::parallel_tests::': unsupported type in amp restricted code

The reason the code does not compile is that sum is declared in the class and is not wrapped in an array_view. You are effectively trying to access this->sum from AMP-restricted code. You need to wrap sum as shown below and pass it into the parallel_for_each, which should then use avSum:
int sum = 0;
array_view<int, 1> avSum(1, &sum);
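If all you need is a single running total, one option (a minimal sketch under my own assumptions, not taken from the original answer: the accumulator is switched from float to int, because C++ AMP only provides atomic_fetch_add overloads for int and unsigned int) is to update avSum[0] atomically inside the kernel:

#include <amp.h>
using namespace concurrency;

int run_parallel_test()
{
    int sum = 0;
    array_view<int, 1> avSum(1, &sum);   // wraps 'sum' so the kernel can see it

    parallel_for_each(extent<1>(121), [=] (index<1> idx) restrict(amp)
    {
        // All 121 threads contend on the single element; correct but slow.
        atomic_fetch_add(&avSum[0], 1);
    });

    avSum.synchronize();                 // copy the accumulated value back into 'sum'
    return sum;                          // 121
}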
Element zero now contains the total. Here is a simple reduction:

class SimpleReduction
{
public:
    int Reduce(accelerator_view& view, const std::vector<int>& source, 
        double& computeTime) const
    {
        assert(source.size() <= UINT_MAX);
        int elementCount = static_cast<int>(source.size());

        // Copy data
        array<int, 1> a(elementCount, source.cbegin(), source.cend(), view);
        std::vector<int> result(1);
        int tailResult = (elementCount % 2) ? source[elementCount - 1] : 0;
        array_view<int, 1> tailResultView(1, &tailResult);

        for (int stride = (elementCount / 2); stride > 0; stride /= 2)
        {
            parallel_for_each(view, extent<1>(stride), [=, &a] (index<1> idx)
                restrict(amp)
            {
                a[idx] += a[idx + stride];

                // If there are an odd number of elements then the 
                // first thread adds the last element.
                if ((idx[0] == 0) && (stride & 0x1) && (stride != 1))
                    tailResultView[idx] += a[stride - 1];
            });
        }

        // Only copy out the first element in the array as this 
        // contains the final answer.
        copy(a.section(0, 1), result.begin());

        tailResultView.synchronize();
        return result[0] + tailResult;
    }
};
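For completeness, a hypothetical call site might look like the following (my own sketch; it assumes the class above is in scope together with <amp.h>, <vector>, <cassert> and <climits>, and note that the Reduce shown above never actually writes to computeTime):

#include <iostream>

int main()
{
    std::vector<int> data(1024, 1);                       // 1024 ones, expected sum 1024
    concurrency::accelerator_view view =
        concurrency::accelerator().default_view;          // default accelerator

    double computeTime = 0.0;                             // left untouched by this Reduce
    int total = SimpleReduction().Reduce(view, data, computeTime);
    std::cout << "sum = " << total << std::endl;          // prints sum = 1024
}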
You can also tile the computation, with each thread in a tile responsible for producing the result for its elements, and then add up the results of all the tiles:

template <int TileSize>
class TiledReduction 
{
public:
    int Reduce(accelerator_view& view, const std::vector<int>& source, 
        double& computeTime) const
    {
        int elementCount = static_cast<int>(source.size());

        // Copy data
        array<int, 1> arr(elementCount, source.cbegin(), source.cend(), view);

        int result;
        computeTime = TimeFunc(view, [&]() 
        {
            while (elementCount >= TileSize)
            {
                extent<1> e(elementCount);
                array<int, 1> tmpArr(elementCount / TileSize);

                parallel_for_each(view, e.tile<TileSize>(), 
                    [=, &arr, &tmpArr] (tiled_index<TileSize> tidx) restrict(amp)
                {
                    //  For each tile do the reduction on the first thread of the tile.
                    //  This isn't expected to be very efficient as all the other
                    //  threads in the tile are idle.
                    if (tidx.local[0] == 0)
                    {
                        int tid = tidx.global[0];
                        int tempResult = arr[tid];
                        for (int i = 1; i < TileSize; ++i)
                            tempResult += arr[tid + i];

                        //  Take the result from each tile and create a new array. 
                        //  This will be used in the next iteration. Use temporary 
                        // array to avoid race condition.
                        tmpArr[tidx.tile[0]] = tempResult;
                    }
                });

                elementCount /= TileSize;
                std::swap(tmpArr, arr);
            }

            //  Copy the final results from each tile to the CPU and accumulate them 
            std::vector<int> partialResult(elementCount);
            copy(arr.section(0, elementCount), partialResult.begin());
            result = std::accumulate(partialResult.cbegin(), partialResult.cend(), 0);
        });
        return result;
    }
};
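TiledReduction calls a TimeFunc helper that is not shown in the snippet; a minimal stand-in, purely my assumption about its shape (run the functor on the given accelerator_view and return the elapsed wall-clock time in milliseconds), could be:

#include <amp.h>
#include <chrono>

// Hypothetical replacement for the TimeFunc helper used by TiledReduction:
// drains pending work, runs the functor, waits for it to finish, and returns
// the elapsed CPU-side wall-clock time in milliseconds.
template <typename Func>
double TimeFunc(concurrency::accelerator_view& view, Func f)
{
    view.wait();
    auto start = std::chrono::high_resolution_clock::now();
    f();
    view.wait();
    auto stop = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::milli>(stop - start).count();
}

(The class itself also needs <numeric> for std::accumulate and <utility> for std::swap.)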
This is still not the most efficient solution, because it does not have a good memory access pattern. You can see further improvements on this on the book's CodePlex site.

OK, I started implementing the reduction. I began with the simple reduction and ran into a problem: I don't want to pass a std::vector to the function, but one or two concurrency::array objects instead.

I need to read the data from source and sum everything in parallel so that I can return the value. How should I implement this?

In the naive version, the code should look something like this:

float Subset::reduction_simple_1(const concurrency::array<float, 1>& source)
{
    assert(source.size() <= UINT_MAX);
    //unsigned element_count = static_cast<unsigned>(source.size());

    unsigned element_count = 121;

    assert(element_count != 0); // Cannot reduce an empty sequence.
    if (element_count == 1)
    {
        return source[0];
    }

    // Using array, as we mostly need just temporary memory to store
    // the algorithm state between iterations and in the end we have to copy
    // back only the first element.
    //concurrency::array<float, 1> a(element_count, source.begin());

    // Takes care of odd input elements – we could completely avoid tail sum
    // if we would require source to have even number of elements.
    float tail_sum = (element_count % 2) ? source[element_count - 1] : 0;
    concurrency::array_view<float, 1> av_tail_sum(1, &tail_sum);

    // Each thread reduces two elements.
    for (unsigned s = element_count / 2; s > 0; s /= 2)
    {
        concurrency::parallel_for_each(concurrency::extent<1>(s), [=, &a] (concurrency::index<1> idx) restrict(amp)
        {
            // Get information from source, do some computations and store it in accumulator.
            accumulator[idx] = accumulator[idx] + accumulator[idx + s];

            // Reduce the tail in cases where the number of elements is odd.
            if ((idx[0] == s - 1) && (s & 0x1) && (s != 1))
            {
                av_tail_sum[0] += accumulator[s - 1];
            }
        });
    }

    // Copy the results back to CPU.
    std::vector<float> result(1);
    copy(accumulator.section(0, 1), result.begin());
    av_tail_sum.synchronize();

    return result[0] + tail_sum;
}
I need to implement the "accumulator" somehow, but I don't know how.
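One possible way to get an accumulator, offered only as a sketch under my own assumptions (allocate a temporary, writable concurrency::array of the same shape and copy source into it, in the spirit of the commented-out line above):

#include <amp.h>

// Hypothetical helper: builds a writable accumulator initialised from the
// read-only input, so the reduction never has to modify 'source' itself.
concurrency::array<float, 1> make_accumulator(const concurrency::array<float, 1>& source)
{
    concurrency::array<float, 1> accumulator(source.extent);
    concurrency::copy(source, accumulator);
    return accumulator;   // concurrency::array is movable, so this is cheap
}

Inside reduction_simple_1 this would be used as concurrency::array<float, 1> accumulator = make_accumulator(source);, and the lambda would then capture it by reference, i.e. [=, &accumulator] instead of [=, &a].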

//The method should compute a correlation value of two images (which had already been copied to GPU memory) 

float Subset::compute_correlation(const concurrency::array<float, 1>& source1, const concurrency::array<float, 1>& source2) 
{
    float result; 
    float parameter_1; 
    float parameter_2; 
    . 
    . 
    . 
    float parameter_n; 
    parallel_for_each(...) 
    { 
         //here do some computations using source1 and source2 
         parameter_1 = source1[idx] o source2[idx]; 
         .
         . 
         . 
         //I am computing every parameter in a different way 
         parameter_n = source1[idx] o source2[idx]; 
    } 
    //compute the result based on the parameters 
    result = parameter_1 o parameter_2 o ... o parameter_n; 

    return result; 
} 
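As a final illustration, here is a heavily hedged sketch of one way a single parameter of that kind could be computed. It is my own assumption: it uses plain multiplication where the pseudocode has "o" and does the last accumulation on the CPU to keep it short (any of the GPU reductions above could be substituted for that step):

#include <amp.h>
#include <numeric>
#include <vector>
using namespace concurrency;

// Hypothetical helper: sums source1[i] * source2[i] over all elements.
float sum_of_products(const array<float, 1>& source1,
                      const array<float, 1>& source2)
{
    const int element_count = source1.extent[0];
    array<float, 1> products(element_count);

    // Element-wise products on the accelerator; no shared writes, so no
    // atomics are needed here.
    parallel_for_each(products.extent,
        [&source1, &source2, &products] (index<1> idx) restrict(amp)
    {
        products[idx] = source1[idx] * source2[idx];
    });

    // Copy the products back and accumulate them on the host.
    std::vector<float> host(element_count);
    copy(products, host.begin());
    return std::accumulate(host.begin(), host.end(), 0.0f);
}

Each parameter_k could be produced by a call of this shape (or the per-element products for all parameters could be written by one fused kernel into several temporary arrays), and the final result then combined on the CPU from the n partial values.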