Warning: file_get_contents(/data/phpspider/zhask/data//catemap/5/excel/23.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
PyCUDA在同一平台上的结果不一致_Cuda_Pycuda - Fatal编程技术网

PyCUDA在同一平台上的结果不一致

PyCUDA在同一平台上的结果不一致,cuda,pycuda,Cuda,Pycuda,我正在使用PyCUDA为大学工作实现密码破解。除了在CUDA上实现NTLM算法外,其他一切似乎都正常工作 为了测试它,我创建了一个小模块,它启动一个只有1个线程的内核,对一个值进行散列,并返回该值与在CPU上获得的散列值进行比较。代码如下: import pycuda.autoinit 将pycuda.driver导入为cuda 从pycuda.compiler导入SourceModule 进口numpy 从passlib.hash导入nthash mod=源模块( """ #包括 #包括 #定

我正在使用
PyCUDA
为大学作业实现密码破解。除了在CUDA上实现的NTLM算法之外,其他一切似乎都正常工作。

为了测试它,我创建了一个小模块:它启动一个只有1个线程的内核,对一个值进行散列并把结果返回,以便与在CPU上得到的散列值进行比较。代码如下:

import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy
from passlib.hash import nthash
mod = SourceModule(
"""
#包括
#包括
#定义初始化A 0x67452301
#定义INIT_B 0xefcdab89
#定义INIT_C 0x98badcfe
#定义初始值0x10325476
#定义SQRT\u 2 0x5a827999
#定义SQRT_3 0x6ed9eba1
__设备无效NTLM(字符*,整数,字符*);
//__设备字符十六进制格式[33];
__设备常数字符itoa16[17]=“0123456789ABCDEF”;
__全局无效NTBruteforce(字符*十六进制格式){
int i;
字符测试[4]={'t','h','e','n'};
NTLM(测试,4,十六进制格式);
}
__设备无效NTLM(字符*键,整数键长度,字符*十六进制格式){
无符号整数nt_缓冲区[16];
无符号整数输出[4];
//轮换球
无符号整数a=INIT_a;
无符号整数b=INIT_b;
无符号int c=INIT_c;
无符号整数d=INIT_d;
//为哈希计算准备字符串
int i;
int length=键长度;
//memset(nt_缓冲区,0,4);
对于(i=0;i21);
b+=(a^(c&(d^a))+nt_缓冲区[3];
b=(b>13);
a+=(d^(b&(c^d))+nt_缓冲区[4];
a=(a>29);
d+=(c^(a&(b^c))+nt_缓冲区[5];
d=(d>25);
c+=(b^(d&(a^b))+nt_缓冲区[6];
c=(c>21);
b+=(a^(c&(d^a))+nt_缓冲区[7];
b=(b>13);
a+=(d^(b&(c^d))+nt_缓冲区[8];
a=(a>29);
d+=(c^(a&(b^c))+nt_缓冲区[9];
d=(d>25);
c+=(b^(d&(a^b))+nt_缓冲区[10];
c=(c>21);
b+=(a^(c&(d^a))+nt_缓冲区[11];
b=(b>13);
a+=(d^(b&(c^d))+nt_缓冲区[12];
a=(a>29);
d+=(c^(a&(b^c))+nt_缓冲区[13];
d=(d>25);
c+=(b^(d&(a^b))+nt_缓冲区[14];
c=(c>21);
b+=(a^(c&(d^a))+nt_缓冲区[15];
b=(b>13);
/*第二轮*/
a+=((b&(c|d))|(c&d))+nt_缓冲区[0]+SQRT_2;
a=(a>29);
d+=((a和(b|c))|(b和c))+nt_缓冲区[4]+SQRT_2;
d=(d>27);
c+=((d和(a | b))|(a和b))+nt_缓冲区[8]+SQRT_2;
c=(c>23);
b+=((c&(d|a))|(d&a))+nt_缓冲区[12]+SQRT_2;
b=(b>19);
a+=((b&(c|d))|(c&d))+nt_缓冲区[1]+SQRT_2;
a=(a>29);
d+=((a和(b|c))|(b和c))+nt_缓冲区[5]+SQRT_2;
d=(d>27);
c+=((d和(a | b))|(a和b))+nt_缓冲区[9]+SQRT_2;
c=(c>23);
b+=((c&(d|a))|(d&a))+nt_缓冲区[13]+SQRT_2;
b=(b>19);
a+=((b&(c|d))|(c&d))+nt_缓冲区[2]+SQRT_2;
a=(a>29);
d+=((a和(b|c))|(b和c))+nt_缓冲区[6]+SQRT_2;
d=(d>27);
c+=((d和(a | b))|(a和b))+nt_缓冲区[10]+SQRT_2;
c=(c>23);
b+=((c&(d|a))|(d&a))+nt_缓冲区[14]+SQRT_2;
b=(b>19);
a+=((b&(c|d))|(c&d))+nt_缓冲区[3]+SQRT_2;
a=(a>29);
d+=((a和(b|c))|(b&c))+nt_缓冲区[7]+SQRT_2;
d=(d>27);
c+=((d和(a | b))|(a和b))+nt_缓冲区[11]+SQRT_2;
c=(c>23);
b+=((c&(d|a))|(d&a))+nt_缓冲区[15]+SQRT_2;
b=(b>19);
/*第三轮*/
a+=(d^c^b)+nt_缓冲区[0]+SQRT_3;
a=(a>29);
d+=(c^b^a)+nt_缓冲区[8]+SQRT_3;
d=(d>23);
c+=(b^a^d)+nt_缓冲区[4]+SQRT_3;
c=(c>21);
b+=(a^d^c)+nt_缓冲区[12]+SQRT_3;
b=(b>17);
a+=(d^c^b)+nt_缓冲区[2]+SQRT_3;
a=(a>29);
d+=(c^b^a)+nt_缓冲区[10]+SQRT_3;
d=(d>23);
c+=(b^a^d)+nt_缓冲区[6]+SQRT_3;
c=(c>21);
b+=(a^d^c)+nt_缓冲区[14]+SQRT_3;
b=(b>17);
a+=(d^c^b)+nt_缓冲区[1]+SQRT_3;
a=(a>29);
d+=(c^b^a)+nt_缓冲区[9]+SQRT_3;
d=(d>23);
c+=(b^a^d)+nt_缓冲区[5]+SQRT_3;
c=(c>21);
b+=(a^d^c)+nt_缓冲区[13]+SQRT_3;
b=(b>17);
a+=(d^c^b)+nt_缓冲区[3]+SQRT_3;
a=(a>29);
d+=(c^b^a)+nt_缓冲区[11]+SQRT_3;
d=(d>23);
c+=(b^a^d)+nt_缓冲区[7]+SQRT_3;
c=(c>21);
b+=(a^d^c)+nt_缓冲区[15]+SQRT_3;
b=(b>17);
输出[0]=a+0x67452301;
输出[1]=b+0xefcdab89;
输出[2]=c+0x98badcfe;
输出[3]=d+0x10325476;
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//将哈希值转换为十六进制(以便可读)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

对于(i=0;i您忘记初始化
nt_buffer
。您观察到的是未初始化变量的一个典型后果:内存中的垃圾可能在每次运行中都不同,因此结果不一致。只需通过以下方式更改变量声明行:

unsigned int nt_buffer[16] = { 0 };
这样应该就能修复您的问题(有关C风格数组初始化的更多信息,请参阅相关文档)。以下是完整的(修复+错误检查)代码,供感兴趣的人参考:

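#include <string.h>
#include <iostream>
#include <stdio.h>
#define INIT_A 0x67452301
#define INIT_B 0xefcdab89
#define INIT_C 0x98badcfe
#define INIT_D 0x10325476
#define SQRT_2 0x5a827999
#define SQRT_3 0x6ed9eba1
#define CUDA_CHECK_ERROR()  __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)
inline void __cuda_check_errors(const char *filename, const int line_number)
{
cudaError err = cudaDeviceSynchronize();
if(err != cudaSuccess)
{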
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy
from passlib.hash import nthash

mod = SourceModule(
"""
#include <string.h>
#include <iostream>
#include <stdio.h>

#define INIT_A 0x67452301
#define INIT_B 0xefcdab89
#define INIT_C 0x98badcfe
#define INIT_D 0x10325476

#define SQRT_2 0x5a827999
#define SQRT_3 0x6ed9eba1

#define CUDA_CHECK_ERROR()  __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)

/*
 * Waits for the device with cudaDeviceSynchronize() and, if that call
 * reports anything other than cudaSuccess, prints the error code, the
 * given source location and the CUDA error string, then terminates the
 * process with exit(-1).
 *
 * filename    - source file name to show in the message
 * line_number - source line to show in the message
 */
inline void __cuda_check_errors(const char *filename, const int line_number)
{
    const cudaError sync_status = cudaDeviceSynchronize();

    /* Nothing to report: return to the caller untouched. */
    if (sync_status == cudaSuccess)
        return;

    printf("CUDA error %i at %s:%i: %s\\n",
           sync_status, filename, line_number, cudaGetErrorString(sync_status));
    exit(-1);
}

/*
 * Checks the status returned by a CUDA runtime call. On cudaSuccess the
 * function simply returns; otherwise it prints the error code, the given
 * source location and the CUDA error string, then terminates the process
 * with exit(-1).
 *
 * err         - status value to inspect
 * filename    - source file name to show in the message
 * line_number - source line to show in the message
 */
inline void __cuda_safe_call(cudaError err, const char *filename, const int line_number)
{
    /* Successful call: nothing to do. */
    if (err == cudaSuccess)
        return;

    printf("CUDA error %i at %s:%i: %s\\n",
           err, filename, line_number, cudaGetErrorString(err));
    exit(-1);
}

__device__ void NTLM(char *, int, char*);
__device__ __constant__ char itoa16[17] = "0123456789ABCDEF";

extern "C" {

/*
 * Test kernel: hashes the fixed four-character candidate "then" with NTLM()
 * and stores the hexadecimal digest in hex_format.
 *
 * No thread or block index is used, so every launched thread computes the
 * same digest and writes it to the same location.
 *
 * hex_format - output buffer; NTLM() writes 32 characters into it.
 */
__global__ void NTBruteforce(char *hex_format){
    char candidate[] = {'t', 'h', 'e', 'n'};
    NTLM(candidate, sizeof(candidate), hex_format);
}

}

/*
 * Computes the NTLM digest of a short key and writes it as 32 upper-case
 * hexadecimal characters.
 *
 * key        - input characters; each byte is widened to one 16-bit unit
 *              (zero-extended), two units packed per 32-bit word.
 * key_length - number of bytes in key. Only 0..27 is supported, because
 *              the whole message, its 0x80 padding marker and the length
 *              word must fit in the single 16-word block used here.
 *              For any other value the function returns without writing
 *              to hex_format.
 * hex_format - output buffer; exactly 32 characters are written
 *              (indices 0..31). No terminating NUL is appended.
 *
 * Relies on the file-level constants INIT_A..INIT_D, SQRT_2, SQRT_3 and
 * on the itoa16 digit table.
 */
__device__ void NTLM(char *key, int key_length, char *hex_format) {
    // Reject lengths that would overwrite the padding/length words
    // (28..31) or run past the end of nt_buffer (32 and above).
    if (key_length < 0 || key_length > 27)
        return;

    unsigned int nt_buffer[16] = { 0 };
    unsigned int output[4] = { 0 };

    //Globals for rounds
    unsigned int a = INIT_A;
    unsigned int b = INIT_B;
    unsigned int c = INIT_C;
    unsigned int d = INIT_D;

    // Prepare the string for hash calculation
    int i;
    int length = key_length;

    // Go through unsigned char so that bytes >= 0x80 are zero-extended;
    // a plain (signed) char would sign-extend and set unrelated bits.
    for (i = 0; i < length / 2; i++)
        nt_buffer[i] = (unsigned int)(unsigned char)key[2 * i] |
                       ((unsigned int)(unsigned char)key[2 * i + 1] << 16);

    //padding
    if (length % 2 == 1)
        nt_buffer[i] = (unsigned int)(unsigned char)key[length - 1] | 0x800000;
    else
        nt_buffer[i] = 0x80;

    //put the length (in bits: two bytes per character, hence * 16)
    nt_buffer[14] = length << 4;

    // NTLM hash calculation

    /* Round 1 */
    a += (d ^ (b & (c ^ d))) + nt_buffer[0];
    a = (a << 3) | (a >> 29);
    d += (c ^ (a & (b ^ c))) + nt_buffer[1];
    d = (d << 7) | (d >> 25);
    c += (b ^ (d & (a ^ b))) + nt_buffer[2];
    c = (c << 11) | (c >> 21);
    b += (a ^ (c & (d ^ a))) + nt_buffer[3];
    b = (b << 19) | (b >> 13);

    a += (d ^ (b & (c ^ d))) + nt_buffer[4];
    a = (a << 3) | (a >> 29);
    d += (c ^ (a & (b ^ c))) + nt_buffer[5];
    d = (d << 7) | (d >> 25);
    c += (b ^ (d & (a ^ b))) + nt_buffer[6];
    c = (c << 11) | (c >> 21);
    b += (a ^ (c & (d ^ a))) + nt_buffer[7];
    b = (b << 19) | (b >> 13);

    a += (d ^ (b & (c ^ d))) + nt_buffer[8];
    a = (a << 3) | (a >> 29);
    d += (c ^ (a & (b ^ c))) + nt_buffer[9];
    d = (d << 7) | (d >> 25);
    c += (b ^ (d & (a ^ b))) + nt_buffer[10];
    c = (c << 11) | (c >> 21);
    b += (a ^ (c & (d ^ a))) + nt_buffer[11];
    b = (b << 19) | (b >> 13);

    a += (d ^ (b & (c ^ d))) + nt_buffer[12];
    a = (a << 3) | (a >> 29);
    d += (c ^ (a & (b ^ c))) + nt_buffer[13];
    d = (d << 7) | (d >> 25);
    c += (b ^ (d & (a ^ b))) + nt_buffer[14];
    c = (c << 11) | (c >> 21);
    b += (a ^ (c & (d ^ a))) + nt_buffer[15];
    b = (b << 19) | (b >> 13);

    /* Round 2 */
    a += ((b & (c | d)) | (c & d)) + nt_buffer[0] + SQRT_2;
    a = (a << 3) | (a >> 29);
    d += ((a & (b | c)) | (b & c)) + nt_buffer[4] + SQRT_2;
    d = (d << 5) | (d >> 27);
    c += ((d & (a | b)) | (a & b)) + nt_buffer[8] + SQRT_2;
    c = (c << 9) | (c >> 23);
    b += ((c & (d | a)) | (d & a)) + nt_buffer[12] + SQRT_2;
    b = (b << 13) | (b >> 19);

    a += ((b & (c | d)) | (c & d)) + nt_buffer[1] + SQRT_2;
    a = (a << 3) | (a >> 29);
    d += ((a & (b | c)) | (b & c)) + nt_buffer[5] + SQRT_2;
    d = (d << 5) | (d >> 27);
    c += ((d & (a | b)) | (a & b)) + nt_buffer[9] + SQRT_2;
    c = (c << 9) | (c >> 23);
    b += ((c & (d | a)) | (d & a)) + nt_buffer[13] + SQRT_2;
    b = (b << 13) | (b >> 19);

    a += ((b & (c | d)) | (c & d)) + nt_buffer[2] + SQRT_2;
    a = (a << 3) | (a >> 29);
    d += ((a & (b | c)) | (b & c)) + nt_buffer[6] + SQRT_2;
    d = (d << 5) | (d >> 27);
    c += ((d & (a | b)) | (a & b)) + nt_buffer[10] + SQRT_2;
    c = (c << 9) | (c >> 23);
    b += ((c & (d | a)) | (d & a)) + nt_buffer[14] + SQRT_2;
    b = (b << 13) | (b >> 19);

    a += ((b & (c | d)) | (c & d)) + nt_buffer[3] + SQRT_2;
    a = (a << 3) | (a >> 29);
    d += ((a & (b | c)) | (b & c)) + nt_buffer[7] + SQRT_2;
    d = (d << 5) | (d >> 27);
    c += ((d & (a | b)) | (a & b)) + nt_buffer[11] + SQRT_2;
    c = (c << 9) | (c >> 23);
    b += ((c & (d | a)) | (d & a)) + nt_buffer[15] + SQRT_2;
    b = (b << 13) | (b >> 19);

    /* Round 3 */
    a += (d ^ c ^ b) + nt_buffer[0] + SQRT_3;
    a = (a << 3) | (a >> 29);
    d += (c ^ b ^ a) + nt_buffer[8] + SQRT_3;
    d = (d << 9) | (d >> 23);
    c += (b ^ a ^ d) + nt_buffer[4] + SQRT_3;
    c = (c << 11) | (c >> 21);
    b += (a ^ d ^ c) + nt_buffer[12] + SQRT_3;
    b = (b << 15) | (b >> 17);

    a += (d ^ c ^ b) + nt_buffer[2] + SQRT_3;
    a = (a << 3) | (a >> 29);
    d += (c ^ b ^ a) + nt_buffer[10] + SQRT_3;
    d = (d << 9) | (d >> 23);
    c += (b ^ a ^ d) + nt_buffer[6] + SQRT_3;
    c = (c << 11) | (c >> 21);
    b += (a ^ d ^ c) + nt_buffer[14] + SQRT_3;
    b = (b << 15) | (b >> 17);

    a += (d ^ c ^ b) + nt_buffer[1] + SQRT_3;
    a = (a << 3) | (a >> 29);
    d += (c ^ b ^ a) + nt_buffer[9] + SQRT_3;
    d = (d << 9) | (d >> 23);
    c += (b ^ a ^ d) + nt_buffer[5] + SQRT_3;
    c = (c << 11) | (c >> 21);
    b += (a ^ d ^ c) + nt_buffer[13] + SQRT_3;
    b = (b << 15) | (b >> 17);

    a += (d ^ c ^ b) + nt_buffer[3] + SQRT_3;
    a = (a << 3) | (a >> 29);
    d += (c ^ b ^ a) + nt_buffer[11] + SQRT_3;
    d = (d << 9) | (d >> 23);
    c += (b ^ a ^ d) + nt_buffer[7] + SQRT_3;
    c = (c << 11) | (c >> 21);
    b += (a ^ d ^ c) + nt_buffer[15] + SQRT_3;
    b = (b << 15) | (b >> 17);

    output[0] = a + 0x67452301;
    output[1] = b + 0xefcdab89;
    output[2] = c + 0x98badcfe;
    output[3] = d + 0x10325476;

    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // Convert the hash to hex (for being readable)
    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    for (i = 0; i < 4; i++)
    {
        unsigned int n = output[i];

        // Emit the four bytes of the word, lowest byte first; each byte
        // becomes two digits, high nibble before low nibble.
        for (int j = 0; j < 4; j++)
        {
            unsigned int byte = n & 0xff;
            hex_format[i * 8 + j * 2 + 0] = itoa16[byte >> 4];
            hex_format[i * 8 + j * 2 + 1] = itoa16[byte & 0xf];
            n >>= 8;
        }
    }
}
""", no_extern_c=True)
# Reference digest computed on the CPU with passlib.
# NOTE(review): newer passlib releases may expose this as nthash.hash();
# confirm against the installed version.
expected = nthash.encrypt('then')

# Host buffer for the hex digest written by the kernel: one byte per hex
# character. Sizing it explicitly (instead of from a string-typed numpy
# array) keeps the buffer at len(expected) bytes on both Python 2 and 3.
cleartext = numpy.zeros(len(expected), dtype=numpy.uint8)
cleartext_gpu = cuda.mem_alloc(cleartext.nbytes)

# Launch the test kernel with a single thread and copy the digest back.
func = mod.get_function('NTBruteforce')
func(cleartext_gpu, block=(1, 1, 1))
cuda.memcpy_dtoh(cleartext, cleartext_gpu)

# print(...) with one argument behaves the same on Python 2 and Python 3;
# tobytes() replaces the deprecated ndarray.tostring().
print('Expected: {}'.format(expected.upper()))
print("GPU     : {}".format(cleartext.tobytes().decode('ascii')))
Expected: 35B5C3F393D57F7836FF61514BCF1289
GPU     : 35B5C3F393D57F7836FF61514BCF1289