Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/326.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/6/cplusplus/162.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
CUDA共享内存问题(以及将CUDA与python/ctypes一起使用)_Python_C++_Cuda_Ctypes - Fatal编程技术网

CUDA共享内存问题(以及将CUDA与python/ctypes一起使用)

CUDA共享内存问题(以及将CUDA与python/ctypes一起使用),python,c++,cuda,ctypes,Python,C++,Cuda,Ctypes,不知何故,当我在下面的代码中修改d\u更新的水\u流量\u地图时,d\u地形\u高度\u地图也会被修改 更改两个阵列的分配顺序可以解决问题,但我认为这只是掩盖了问题的根本原因 cudaCheck(cudaMalloc((void **)&d_water_flow_map, SIZE * 4)); cudaCheck(cudaMalloc((void **)&d_updated_water_flow_map, SIZE * 4)); // changing this array

不知何故,当我在下面的代码中修改
d\u更新的水\u流量\u地图
时,
d\u地形\u高度\u地图
也会被修改

更改两个阵列的分配顺序可以解决问题,但我认为这只是掩盖了问题的根本原因

cudaCheck(cudaMalloc((void **)&d_water_flow_map, SIZE * 4)); 
cudaCheck(cudaMalloc((void **)&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
cudaCheck(cudaMalloc((void **)&d_terrain_height_map, SIZE));  
我正在将内核编译成DLL,并从Blender 3D python解释器中的python文件下面调用它。所有值都是32位浮点值

cu_包括.h

#pragma once  

#ifdef MATHLIBRARY_EXPORTS  
#define MATHLIBRARY_API __declspec(dllexport)   
#else  
#define MATHLIBRARY_API __declspec(dllimport)   
#endif  


extern "C" __declspec(dllexport)
void init(float *t_height_map,
float *w_height_map,
float *s_height_map,
int SIZE_X,
int SIZE_Y);

extern "C" __declspec(dllexport)
void run_hydro_erosion(int cycles,
float t_step,
float min_tilt_angle,
float SEDIMENT_CAP,
float DISSOLVE_CONST,
float DEPOSIT_CONST,
int SIZE_X,
int SIZE_Y,
float PIPE_LENGTH,
float ADJACENT_LENGTH,
float TIME_STEP,
float MIN_TILT_ANGLE);

extern "C" __declspec(dllexport)
void free_mem();

extern "C" __declspec(dllexport)
void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);
内核.dll

#include "cu_include.h"

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <algorithm>
#include <random>

// includes CUDA
#include <cuda_runtime.h>

using namespace std;

#define FLOW_RIGHT 0
#define FLOW_UP 1
#define FLOW_LEFT 2
#define FLOW_DOWN 3
#define X_VEL 0
#define Y_VEL 1
#define LEFT_CELL row, col - 1
#define RIGHT_CELL row, col + 1
#define ABOVE_CELL row - 1, col
#define BELOW_CELL row + 1, col

// CUDA API error checking macro
#define T 1024
#define M 1536
#define blockSize 1024
#define cudaCheck(error) \
  if (error != cudaSuccess) { \
    printf("Fatal error: %s at %s:%d\n", \
      cudaGetErrorString(error), \
      __FILE__, __LINE__); \
    exit(1); \
              }


__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int col = index % SIZE_X;
    int row = index / SIZE_X; 

    index = row * (SIZE_X * 4) + col * 4;   // 3D index
    d_updated_water_flow_map[index + FLOW_RIGHT] = 0;
    d_updated_water_flow_map[index + FLOW_UP] = 0;
    d_updated_water_flow_map[index + FLOW_LEFT] = 0;
    d_updated_water_flow_map[index + FLOW_DOWN] = 0;

}

static float *terrain_height_map;
static float *water_height_map;
static float *sediment_height_map;

void init(float *t_height_map,
    float *w_height_map,
    float *s_height_map,
    int SIZE_X,
    int SIZE_Y)
{
    /* set vars HOST*/
    terrain_height_map = t_height_map;
    water_height_map = w_height_map;
    sediment_height_map = s_height_map;
}

void run_hydro_erosion(int cycles,
    float t_step,
    float min_tilt_angle,
    float SEDIMENT_CAP,
    float DISSOLVE_CONST,
    float DEPOSIT_CONST,
    int SIZE_X,
    int SIZE_Y,
    float PIPE_LENGTH,
    float ADJACENT_LENGTH,
    float TIME_STEP,
    float MIN_TILT_ANGLE)
{ 
    int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
    int SIZE = SIZE_X * SIZE_Y * sizeof(float);

    float *d_terrain_height_map, *d_updated_terrain_height_map;
    float *d_water_height_map, *d_updated_water_height_map;
    float *d_sediment_height_map, *d_updated_sediment_height_map;

    float *d_suspended_sediment_level;
    float *d_updated_suspended_sediment_level;
    float *d_water_flow_map;
    float *d_updated_water_flow_map;
    float *d_prev_water_height_map;
    float *d_water_velocity_vec;
    float *d_rain_map;

    cudaCheck(cudaMalloc(&d_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));
    cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
    cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_rain_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));

    cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));

    cout << "init terrain_height_map" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }

    /* launch the kernel on the GPU */
    float *temp;
    while (cycles--) {
        update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y); 
        temp = d_water_flow_map;
        d_water_flow_map = d_updated_water_flow_map;
        d_updated_water_flow_map = temp;        
    }
    cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost)); 


    cout << "updated terrain" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    } 
} 
错误输出:

预期输出(在我注释掉更新水流量后):

//更新水流量>>(d水高度图、d水流量图、d水流量图、d水流量图、尺寸X、尺寸Y);

图形卡:GTX460M

(请注意,此答案中的代码还提供了一个完整的配方/示例,说明如何在与使用python ctypes的python应用程序共享的库中使用CUDA代码(例如CUDA设备内核)。如果您希望使用CUDA库功能,则答案提供了一个使用python ctypes的示例。)

这里的问题是内核写入超出了边界,显然编译器/运行时在设备内存中找到了足够接近的分配,超过第一次分配的边界会导致代码写入第二次分配:

cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
出现越界访问是因为内核启动涉及足够多的线程(在本例中是启动1024个线程),而我们实际上只“需要”
SIZE\u X*SIZE\u Y
线程(即本例中的16个):

下面是一个基于所提供代码的完整示例(尽管是在linux上,并且在python代码中删除了blender依赖项),显示了前后效果。请注意,我们甚至可以使用
cuda memcheck
运行这样的代码,它会指出这种情况下的越界访问(为了清楚起见,从下面的第一个示例中省略):


如果没有答案,这个问题是在浪费每个人的时间,包括你的时间。@Talonmes我添加了一个有效的、最简单的例子!很抱歉……这在CUDA中是非法的:
\uuuuu设备\uuuuuuuuuuuu浮动*d\u更新的\u水\u流量\u地图。。。cudaCheck(Cudamaloc(&d_更新水流量图,尺寸*4))和是您现在发布的代码版本与以前发布的版本之间的显著差异(该版本将
d\u更新的\u水\u流量\u映射作为普通主机堆栈变量)。您不能在主机代码中获取
\uuuu device\uuuu
变量的地址(即使在
cudamaloc
的调用/参数中也是如此),这不是创建
\uuu device\uuuu
指针变量的正确方法。@RobertCrovella我将代码更改回原始设计。我尝试了一些不同的方法,但忘记了将变量更改回原来的状态。我进行了测试,但问题仍然存在。修改d_更新的水流量地图仍然会修改d_地形高度地图使用
cuda memcheck
在搅拌机外运行您的代码。我只需从python脚本中删除
import bpy
,然后运行
cuda memcheck python test.py
就可以做到这一点。当我这样做时,
cuda memcheck
会报告内核中无效的内存访问错误。再次感谢!顺便说一句,我试着运行了
cuda memcheck
我得到了以下两个错误:
===================内部memcheck错误:设备不受支持
=========================内部memcheck错误:初始化失败
哪个cuda版本,哪个驱动程序版本,哪个GPU?您可能正在运行的
cuda memcheck
版本对于您的GPU来说太旧,或者与安装的cuda版本不匹配。GeForce GTX 460M,驱动程序:384.76,不确定是哪个cuda版本,这是否有帮助:cuda版本很重要。如果键入
cuda-memcheck--version
cuda-memcheck version 8.0.61 ID:(41)
//update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y); 
cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
#define blockSize 1024
...
int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
...
update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y); 
if ((row >= SIZE_Y) || (col >= SIZE_X)) return;
$ cat t383.cu
extern "C"
void init(float *t_height_map,
float *w_height_map,
float *s_height_map,
int SIZE_X,
int SIZE_Y);

extern "C"
void run_hydro_erosion(int cycles,
float t_step,
float min_tilt_angle,
float SEDIMENT_CAP,
float DISSOLVE_CONST,
float DEPOSIT_CONST,
int SIZE_X,
int SIZE_Y,
float PIPE_LENGTH,
float ADJACENT_LENGTH,
float TIME_STEP,
float MIN_TILT_ANGLE);

extern "C"
void free_mem();

extern "C"
void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <algorithm>
#include <random>

// includes CUDA
#include <cuda_runtime.h>

using namespace std;

#define FLOW_RIGHT 0
#define FLOW_UP 1
#define FLOW_LEFT 2
#define FLOW_DOWN 3
#define X_VEL 0
#define Y_VEL 1
#define LEFT_CELL row, col - 1
#define RIGHT_CELL row, col + 1
#define ABOVE_CELL row - 1, col
#define BELOW_CELL row + 1, col

// CUDA API error checking macro
#define T 1024
#define M 1536
#define blockSize 1024
#define cudaCheck(error) \
  if (error != cudaSuccess) { \
    printf("Fatal error: %s at %s:%d\n", \
      cudaGetErrorString(error), \
      __FILE__, __LINE__); \
    exit(1); \
              }


__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int col = index % SIZE_X;
    int row = index / SIZE_X;

    index = row * (SIZE_X * 4) + col * 4;   // 3D index
#ifdef FIX
    if ((row >= SIZE_Y) || (col >= SIZE_X)) return;
#endif
    d_updated_water_flow_map[index + FLOW_RIGHT] = 0;
    d_updated_water_flow_map[index + FLOW_UP] = 0;
    d_updated_water_flow_map[index + FLOW_LEFT] = 0;
    d_updated_water_flow_map[index + FLOW_DOWN] = 0;

}

static float *terrain_height_map;
static float *water_height_map;
static float *sediment_height_map;

void init(float *t_height_map,
    float *w_height_map,
    float *s_height_map,
    int SIZE_X,
    int SIZE_Y)
{
    /* set vars HOST*/
    terrain_height_map = t_height_map;
    water_height_map = w_height_map;
    sediment_height_map = s_height_map;
}

void run_hydro_erosion(int cycles,
    float t_step,
    float min_tilt_angle,
    float SEDIMENT_CAP,
    float DISSOLVE_CONST,
    float DEPOSIT_CONST,
    int SIZE_X,
    int SIZE_Y,
    float PIPE_LENGTH,
    float ADJACENT_LENGTH,
    float TIME_STEP,
    float MIN_TILT_ANGLE)
{
    int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
    int SIZE = SIZE_X * SIZE_Y * sizeof(float);

    float *d_terrain_height_map, *d_updated_terrain_height_map;
    float *d_water_height_map, *d_updated_water_height_map;
    float *d_sediment_height_map, *d_updated_sediment_height_map;

    float *d_suspended_sediment_level;
    float *d_updated_suspended_sediment_level;
    float *d_water_flow_map;
    float *d_updated_water_flow_map;
    float *d_prev_water_height_map;
    float *d_water_velocity_vec;
    float *d_rain_map;

    cudaCheck(cudaMalloc(&d_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));
    cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
    cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_rain_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));

    cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));

    cout << "init terrain_height_map" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }

    /* launch the kernel on the GPU */
    float *temp;
    while (cycles--) {
        update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);
        temp = d_water_flow_map;
        d_water_flow_map = d_updated_water_flow_map;
        d_updated_water_flow_map = temp;
    }
    cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost));


    cout << "updated terrain" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }
}
$ cat t383.py
import numpy
import ctypes
import random

width = 4
height = 4

size_x = width
size_y = height
N = size_x * size_y

scrpt_cycles = 1
kernel_cycles = 1
time_step = 0.005
pipe_length = 1.0
adjacent_length = 1.0
min_tilt_angle = 10
sediment_cap = 0.01
dissolve_const = 0.01
deposit_const = 0.01

# initialize arrays
ter_height_map = numpy.ones((N), dtype=numpy.float32)
water_height_map = numpy.zeros((N), dtype=numpy.float32)
sed_height_map = numpy.zeros((N), dtype=numpy.float32)
rain_map = numpy.ones((N), dtype=numpy.float32)


# load terrain height from image
for i in range(0, len(ter_height_map)):
    ter_height_map[i] = 1


# import DLL
E = ctypes.cdll.LoadLibrary("./t383.so")

# initialize device memory
E.init( ctypes.c_void_p(ter_height_map.ctypes.data),
        ctypes.c_void_p(water_height_map.ctypes.data),
        ctypes.c_void_p(sed_height_map.ctypes.data),
        ctypes.c_int(size_x),
        ctypes.c_int(size_y))


# run erosion
while(scrpt_cycles):
    scrpt_cycles = scrpt_cycles - 1
    E.run_hydro_erosion(ctypes.c_int(kernel_cycles),
                        ctypes.c_float(time_step),
                        ctypes.c_float(min_tilt_angle),
                        ctypes.c_float(sediment_cap),
                        ctypes.c_float(dissolve_const),
                        ctypes.c_float(deposit_const),
                        ctypes.c_int(size_x),
                        ctypes.c_int(size_y),
                        ctypes.c_float(pipe_length),
                        ctypes.c_float(adjacent_length),
                        ctypes.c_float(time_step),
                        ctypes.c_float(min_tilt_angle))
$ nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu
$ python t383.py
init terrain_height_map
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, updated terrain
0, 0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 
$ nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu -DFIX
$ cuda-memcheck python t383.py
========= CUDA-MEMCHECK
init terrain_height_map
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, updated terrain
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 
========= ERROR SUMMARY: 0 errors
$
$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu
$ cuda-memcheck python t383.py
========= CUDA-MEMCHECK
init terrain_height_map
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
========= Invalid __global__ write of size 4
=========     at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)
=========     by thread (31,0,0) in block (0,0,0)
=========     Address 0x1050d6009f0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]
=========     Host Frame:./t383.so [0x1c291]
=========     Host Frame:./t383.so [0x39e33]
=========     Host Frame:./t383.so [0x6879]
=========     Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]
=========     Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]
=========     Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]
=========     Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]
=========     Host Frame:python [0x167d14]
=========     Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]
=========     Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]
=========     Host Frame:python (Py_Main + 0xb5e) [0x66d92]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:python [0x177c2e]
=========
========= Invalid __global__ write of size 4
=========     at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)
=========     by thread (30,0,0) in block (0,0,0)
=========     Address 0x1050d6009e0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]
=========     Host Frame:./t383.so [0x1c291]
=========     Host Frame:./t383.so [0x39e33]
=========     Host Frame:./t383.so [0x6879]
=========     Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]
=========     Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]
=========     Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]
=========     Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]
=========     Host Frame:python [0x167d14]
=========     Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]
=========     Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]
=========     Host Frame:python (Py_Main + 0xb5e) [0x66d92]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:python [0x177c2e]
=========
... (output truncated for brevity of presentation)
========= ERROR SUMMARY: 18 errors
$