I am implementing a password cracker for college work using PyCUDA
. Everything seems to be working correctly except the implementation of the NTLM algorithm on CUDA.
To test it out, I created a small module that launches a kernel with only 1 thread, hashes a value and returns it for comparison with the hash obtained on the CPU. Here is the code below:
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy
from passlib.hash import nthash
mod = SourceModule(
"""
#include <string.h>
#include <stdio.h>
#define INIT_A 0x67452301
#define INIT_B 0xefcdab89
#define INIT_C 0x98badcfe
#define INIT_D 0x10325476
#define SQRT_2 0x5a827999
#define SQRT_3 0x6ed9eba1
__device__ void NTLM(char *, int, char*);
//__device__ char hex_format[33];
__device__ __constant__ char itoa16[17] = "0123456789ABCDEF";
__global__ void NTBruteforce(char *hex_format){
int i;
char test[4] = {'t', 'h', 'e', 'n'};
NTLM(test, 4, hex_format);
}
__device__ void NTLM(char *key, int key_length, char *hex_format) {
unsigned int nt_buffer[16];
unsigned int output[4];
//Globals for rounds
unsigned int a = INIT_A;
unsigned int b = INIT_B;
unsigned int c = INIT_C;
unsigned int d = INIT_D;
// Prepare the string for hash calculation
int i;
int length = key_length;
//memset(nt_buffer, 0, 4);
for (i = 0; i < length / 2; i++)
nt_buffer[i] = key[2 * i] | (key[2 * i + 1] << 16);
//padding
if (length % 2 == 1)
nt_buffer[i] = key[length - 1] | 0x800000;
else
nt_buffer[i] = 0x80;
//put the length
nt_buffer[14] = length << 4;
// NTLM hash calculation
/* Round 1 */
a += (d ^ (b & (c ^ d))) + nt_buffer[0];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[1];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[2];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[3];
b = (b << 19) | (b >> 13);
a += (d ^ (b & (c ^ d))) + nt_buffer[4];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[5];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[6];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[7];
b = (b << 19) | (b >> 13);
a += (d ^ (b & (c ^ d))) + nt_buffer[8];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[9];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[10];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[11];
b = (b << 19) | (b >> 13);
a += (d ^ (b & (c ^ d))) + nt_buffer[12];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[13];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[14];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[15];
b = (b << 19) | (b >> 13);
/* Round 2 */
a += ((b & (c | d)) | (c & d)) + nt_buffer[0] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[4] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[8] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[12] + SQRT_2;
b = (b << 13) | (b >> 19);
a += ((b & (c | d)) | (c & d)) + nt_buffer[1] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[5] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[9] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[13] + SQRT_2;
b = (b << 13) | (b >> 19);
a += ((b & (c | d)) | (c & d)) + nt_buffer[2] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[6] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[10] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[14] + SQRT_2;
b = (b << 13) | (b >> 19);
a += ((b & (c | d)) | (c & d)) + nt_buffer[3] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[7] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[11] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[15] + SQRT_2;
b = (b << 13) | (b >> 19);
/* Round 3 */
a += (d ^ c ^ b) + nt_buffer[0] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[8] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[4] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[12] + SQRT_3;
b = (b << 15) | (b >> 17);
a += (d ^ c ^ b) + nt_buffer[2] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[10] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[6] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[14] + SQRT_3;
b = (b << 15) | (b >> 17);
a += (d ^ c ^ b) + nt_buffer[1] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[9] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[5] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[13] + SQRT_3;
b = (b << 15) | (b >> 17);
a += (d ^ c ^ b) + nt_buffer[3] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[11] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[7] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[15] + SQRT_3;
b = (b << 15) | (b >> 17);
output[0] = a + 0x67452301;
output[1] = b + 0xefcdab89;
output[2] = c + 0x98badcfe;
output[3] = d + 0x10325476;
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Convert the hash to hex (for being readable)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
for(i=0; i<4; i++)
{
int j = 0;
unsigned int n = output[i];
//iterate the bytes of the integer
for(; j<4; j++)
{
unsigned int convert = n % 256;
hex_format[i * 8 + j * 2 + 1] = itoa16[convert % 16];
convert = convert / 16;
hex_format[i * 8 + j * 2 + 0] = itoa16[convert % 16];
n = n / 256;
}
}
}
""")
expected = nthash.encrypt('then')
data = numpy.array(expected)
cleartext = numpy.zeros_like(data)
cleartext_gpu = cuda.mem_alloc(data.nbytes)
func = mod.get_function('NTBruteforce')
func(cleartext_gpu, block=(1,1,1))
cuda.memcpy_dtoh(cleartext, cleartext_gpu)
print 'Expected: {}'.format(expected.upper())
print "GPU : {}".format(cleartext.tostring())
The problem is that I get different results on consecutive runs. Sometimes I get the correct result a few times in a row, but the next time I run it (after 2-3 secs), the result is wrong. My output looks like this:
Expected: 35B5C3F393D57F7836FF61514BCF1289
GPU : 90ABFDFAA5F9F1F25DAF679A3FC1331F
Expected: 35B5C3F393D57F7836FF61514BCF1289
GPU : 4A3F30740C38FC259867716DF887349B
Expected: 35B5C3F393D57F7836FF61514BCF1289
GPU : 2CA784517A80BBE10437EE88CFDEC269
Expected: 35B5C3F393D57F7836FF61514BCF1289
GPU : 35B5C3F393D57F7836FF61514BCF1289
Expected: 35B5C3F393D57F7836FF61514BCF1289
GPU : 35B5C3F393D57F7836FF61514BCF1289
Expected: 35B5C3F393D57F7836FF61514BCF1289
GPU : 8EA84AB098A6C8E37FFF1F6440127273
The above output is just an example of running the program a few times consecutively. As you can see I get the correct result sometimes (and sometimes consecutively as well) but other times the result is wrong and I don't understand why.
I've tried re-installing the CUDA SDK (version 4.2.9) and rebooting my computer but the same thing happens.
Using Windows 7 64-bit, Geforce GT240
Any ideas?
You forgot to initialize nt_buffer
. What you observed is a typical consequence of uninitialized variables: the junk in memory may vary from one run to the next, hence the inconsistent results. Simply changing the variable declaration line by:
unsigned int nt_buffer[16] = { 0 };
should fix your issue (see this answer for information on C-style array initialization). Here is the complete (fix + error checking) CUDA/C++ code for those interested:
#include <string.h>
#include <iostream>
#include <stdio.h>
#define INIT_A 0x67452301
#define INIT_B 0xefcdab89
#define INIT_C 0x98badcfe
#define INIT_D 0x10325476
#define SQRT_2 0x5a827999
#define SQRT_3 0x6ed9eba1
#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)
inline void __cuda_check_errors(const char *filename, const int line_number)
{
cudaError err = cudaDeviceSynchronize();
if(err != cudaSuccess)
{
printf("CUDA error %i at %s:%i: %s\n",
err, filename, line_number, cudaGetErrorString(err));
exit(-1);
}
}
inline void __cuda_safe_call(cudaError err, const char *filename, const int line_number)
{
if (err != cudaSuccess)
{
printf("CUDA error %i at %s:%i: %s\n",
err, filename, line_number, cudaGetErrorString(err));
exit(-1);
}
}
__device__ void NTLM(char *, int, char*);
__device__ __constant__ char itoa16[17] = "0123456789ABCDEF";
__global__ void NTBruteforce(char *hex_format){
char test[4] = {'t', 'h', 'e', 'n'};
NTLM(test, 4, hex_format);
}
__device__ void NTLM(char *key, int key_length, char *hex_format) {
unsigned int nt_buffer[16] = { 0 };
unsigned int output[4] = { 0 };
//Globals for rounds
unsigned int a = INIT_A;
unsigned int b = INIT_B;
unsigned int c = INIT_C;
unsigned int d = INIT_D;
// Prepare the string for hash calculation
int i;
int length = key_length;
for (i = 0; i < length / 2; i++)
nt_buffer[i] = key[2 * i] | (key[2 * i + 1] << 16);
//padding
if (length % 2 == 1)
nt_buffer[i] = key[length - 1] | 0x800000;
else
nt_buffer[i] = 0x80;
//put the length
nt_buffer[14] = length << 4;
// NTLM hash calculation
/* Round 1 */
a += (d ^ (b & (c ^ d))) + nt_buffer[0];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[1];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[2];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[3];
b = (b << 19) | (b >> 13);
a += (d ^ (b & (c ^ d))) + nt_buffer[4];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[5];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[6];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[7];
b = (b << 19) | (b >> 13);
a += (d ^ (b & (c ^ d))) + nt_buffer[8];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[9];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[10];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[11];
b = (b << 19) | (b >> 13);
a += (d ^ (b & (c ^ d))) + nt_buffer[12];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[13];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[14];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[15];
b = (b << 19) | (b >> 13);
/* Round 2 */
a += ((b & (c | d)) | (c & d)) + nt_buffer[0] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[4] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[8] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[12] + SQRT_2;
b = (b << 13) | (b >> 19);
a += ((b & (c | d)) | (c & d)) + nt_buffer[1] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[5] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[9] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[13] + SQRT_2;
b = (b << 13) | (b >> 19);
a += ((b & (c | d)) | (c & d)) + nt_buffer[2] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[6] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[10] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[14] + SQRT_2;
b = (b << 13) | (b >> 19);
a += ((b & (c | d)) | (c & d)) + nt_buffer[3] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[7] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[11] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[15] + SQRT_2;
b = (b << 13) | (b >> 19);
/* Round 3 */
a += (d ^ c ^ b) + nt_buffer[0] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[8] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[4] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[12] + SQRT_3;
b = (b << 15) | (b >> 17);
a += (d ^ c ^ b) + nt_buffer[2] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[10] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[6] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[14] + SQRT_3;
b = (b << 15) | (b >> 17);
a += (d ^ c ^ b) + nt_buffer[1] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[9] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[5] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[13] + SQRT_3;
b = (b << 15) | (b >> 17);
a += (d ^ c ^ b) + nt_buffer[3] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[11] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[7] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[15] + SQRT_3;
b = (b << 15) | (b >> 17);
output[0] = a + 0x67452301;
output[1] = b + 0xefcdab89;
output[2] = c + 0x98badcfe;
output[3] = d + 0x10325476;
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Convert the hash to hex (for being readable)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
for(i=0; i<4; i++)
{
int j = 0;
unsigned int n = output[i];
//iterate the bytes of the integer
for(; j<4; j++)
{
unsigned int convert = n % 256;
hex_format[i * 8 + j * 2 + 1] = itoa16[convert % 16];
convert = convert / 16;
hex_format[i * 8 + j * 2 + 0] = itoa16[convert % 16];
n = n / 256;
}
}
}
int main()
{
char* d_hex;
char h_hex[33] = "";
CUDA_SAFE_CALL(cudaMalloc(&d_hex, 33 * sizeof(char)));
NTBruteforce<<<1, 1>>>(d_hex);
CUDA_CHECK_ERROR();
CUDA_SAFE_CALL(cudaMemcpy(h_hex, d_hex, 32 * sizeof(char), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(d_hex));
h_hex[32] = '\0';
std::cout << h_hex << std::endl;
}
which always returns 35B5C3F393D57F7836FF61514BCF1289
. This was tested on Linux with CUDA 5.0, GeForce GT 650M and 319.12 beta drivers.
Here is the file I used to test with PyCUDA. Note that I had to modify a few things:
\n
I added, else PyCUDA processes them...no_extern_c=True
to SourceModule
and put NTBruteforce
in extern "C"
, else compilation fails for me (error: this declaration may not have extern "C" linkage
).The complete PyCUDA program becomes:
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy
from passlib.hash import nthash
mod = SourceModule(
"""
#include <string.h>
#include <iostream>
#include <stdio.h>
#define INIT_A 0x67452301
#define INIT_B 0xefcdab89
#define INIT_C 0x98badcfe
#define INIT_D 0x10325476
#define SQRT_2 0x5a827999
#define SQRT_3 0x6ed9eba1
#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)
inline void __cuda_check_errors(const char *filename, const int line_number)
{
cudaError err = cudaDeviceSynchronize();
if(err != cudaSuccess)
{
printf("CUDA error %i at %s:%i: %s\\n",
err, filename, line_number, cudaGetErrorString(err));
exit(-1);
}
}
inline void __cuda_safe_call(cudaError err, const char *filename, const int line_number)
{
if (err != cudaSuccess)
{
printf("CUDA error %i at %s:%i: %s\\n",
err, filename, line_number, cudaGetErrorString(err));
exit(-1);
}
}
__device__ void NTLM(char *, int, char*);
__device__ __constant__ char itoa16[17] = "0123456789ABCDEF";
extern "C" {
__global__ void NTBruteforce(char *hex_format){
char test[4] = {'t', 'h', 'e', 'n'};
NTLM(test, 4, hex_format);
}
}
__device__ void NTLM(char *key, int key_length, char *hex_format) {
unsigned int nt_buffer[16] = { 0 };
unsigned int output[4] = { 0 };
//Globals for rounds
unsigned int a = INIT_A;
unsigned int b = INIT_B;
unsigned int c = INIT_C;
unsigned int d = INIT_D;
// Prepare the string for hash calculation
int i;
int length = key_length;
for (i = 0; i < length / 2; i++)
nt_buffer[i] = key[2 * i] | (key[2 * i + 1] << 16);
//padding
if (length % 2 == 1)
nt_buffer[i] = key[length - 1] | 0x800000;
else
nt_buffer[i] = 0x80;
//put the length
nt_buffer[14] = length << 4;
// NTLM hash calculation
/* Round 1 */
a += (d ^ (b & (c ^ d))) + nt_buffer[0];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[1];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[2];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[3];
b = (b << 19) | (b >> 13);
a += (d ^ (b & (c ^ d))) + nt_buffer[4];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[5];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[6];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[7];
b = (b << 19) | (b >> 13);
a += (d ^ (b & (c ^ d))) + nt_buffer[8];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[9];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[10];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[11];
b = (b << 19) | (b >> 13);
a += (d ^ (b & (c ^ d))) + nt_buffer[12];
a = (a << 3) | (a >> 29);
d += (c ^ (a & (b ^ c))) + nt_buffer[13];
d = (d << 7) | (d >> 25);
c += (b ^ (d & (a ^ b))) + nt_buffer[14];
c = (c << 11) | (c >> 21);
b += (a ^ (c & (d ^ a))) + nt_buffer[15];
b = (b << 19) | (b >> 13);
/* Round 2 */
a += ((b & (c | d)) | (c & d)) + nt_buffer[0] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[4] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[8] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[12] + SQRT_2;
b = (b << 13) | (b >> 19);
a += ((b & (c | d)) | (c & d)) + nt_buffer[1] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[5] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[9] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[13] + SQRT_2;
b = (b << 13) | (b >> 19);
a += ((b & (c | d)) | (c & d)) + nt_buffer[2] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[6] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[10] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[14] + SQRT_2;
b = (b << 13) | (b >> 19);
a += ((b & (c | d)) | (c & d)) + nt_buffer[3] + SQRT_2;
a = (a << 3) | (a >> 29);
d += ((a & (b | c)) | (b & c)) + nt_buffer[7] + SQRT_2;
d = (d << 5) | (d >> 27);
c += ((d & (a | b)) | (a & b)) + nt_buffer[11] + SQRT_2;
c = (c << 9) | (c >> 23);
b += ((c & (d | a)) | (d & a)) + nt_buffer[15] + SQRT_2;
b = (b << 13) | (b >> 19);
/* Round 3 */
a += (d ^ c ^ b) + nt_buffer[0] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[8] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[4] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[12] + SQRT_3;
b = (b << 15) | (b >> 17);
a += (d ^ c ^ b) + nt_buffer[2] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[10] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[6] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[14] + SQRT_3;
b = (b << 15) | (b >> 17);
a += (d ^ c ^ b) + nt_buffer[1] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[9] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[5] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[13] + SQRT_3;
b = (b << 15) | (b >> 17);
a += (d ^ c ^ b) + nt_buffer[3] + SQRT_3;
a = (a << 3) | (a >> 29);
d += (c ^ b ^ a) + nt_buffer[11] + SQRT_3;
d = (d << 9) | (d >> 23);
c += (b ^ a ^ d) + nt_buffer[7] + SQRT_3;
c = (c << 11) | (c >> 21);
b += (a ^ d ^ c) + nt_buffer[15] + SQRT_3;
b = (b << 15) | (b >> 17);
output[0] = a + 0x67452301;
output[1] = b + 0xefcdab89;
output[2] = c + 0x98badcfe;
output[3] = d + 0x10325476;
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Convert the hash to hex (for being readable)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
for(i=0; i<4; i++)
{
int j = 0;
unsigned int n = output[i];
//iterate the bytes of the integer
for(; j<4; j++)
{
unsigned int convert = n % 256;
hex_format[i * 8 + j * 2 + 1] = itoa16[convert % 16];
convert = convert / 16;
hex_format[i * 8 + j * 2 + 0] = itoa16[convert % 16];
n = n / 256;
}
}
}
""", no_extern_c=True)
expected = nthash.encrypt('then')
data = numpy.array(expected)
cleartext = numpy.zeros_like(data)
cleartext_gpu = cuda.mem_alloc(data.nbytes)
func = mod.get_function('NTBruteforce')
func(cleartext_gpu, block=(1,1,1))
cuda.memcpy_dtoh(cleartext, cleartext_gpu)
print 'Expected: {}'.format(expected.upper())
print "GPU : {}".format(cleartext.tostring())
The result is, as expected:
Expected: 35B5C3F393D57F7836FF61514BCF1289
GPU : 35B5C3F393D57F7836FF61514BCF1289