/*
* snake_oil2.cu
*
* CUDA program to do bad encryption
*
* compile with: nvcc -o snake_oil2 snake_oil2.cu
*
* resultsovercoffee@gmail.com
*/
#include <iostream>
#include <string>
#include <new>
#include <cstdlib>
using namespace std;
// function prototype for our CUDA kernel
__global__ void badcrypto(char *, char *, char *, int, int);
int main(int argc, char *argv[]) {
    string key("password");
    int keysize = key.size() + 1;
    //read from stdin to a buffer (max of 512 bytes)
    const int max = 512;
    char *h_in = new (nothrow) char[max];
    if (h_in == 0) {
        cerr << "host allocation failed" << endl;
        exit(EXIT_FAILURE);
    }
    cin.read(h_in, max);
    int size = cin.gcount();
    //declare, allocate, and copy the input to the device
    char *d_in;
    if (cudaMalloc(&d_in, size) != cudaSuccess) {
        cerr << cudaGetErrorString(cudaGetLastError()) << endl;
        exit(EXIT_FAILURE);
    }
    if (cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice)
            != cudaSuccess) {
        cerr << cudaGetErrorString(cudaGetLastError()) << endl;
        exit(EXIT_FAILURE);
    }
    //declare, allocate, and copy the key to the device
    char *d_key;
    if (cudaMalloc(&d_key, keysize) != cudaSuccess) {
        cerr << cudaGetErrorString(cudaGetLastError()) << endl;
        exit(EXIT_FAILURE);
    }
    if (cudaMemcpy(d_key, key.c_str(), keysize, cudaMemcpyHostToDevice)
            != cudaSuccess) {
        cerr << cudaGetErrorString(cudaGetLastError()) << endl;
        exit(EXIT_FAILURE);
    }
    //declare, allocate and zero output array on device
    char *d_out;
    if (cudaMalloc(&d_out, size) != cudaSuccess) {
        cerr << cudaGetErrorString(cudaGetLastError()) << endl;
        exit(EXIT_FAILURE);
    }
    if (cudaMemset(d_out, 0, size) != cudaSuccess) {
        cerr << cudaGetErrorString(cudaGetLastError()) << endl;
        exit(EXIT_FAILURE);
    }
    //launch kernel on device to generate results
    dim3 grid(1,1);
    dim3 block(512);
    badcrypto<<<grid,block>>>(d_in, d_out, d_key, size, keysize);
    //declare and allocate an output array on the host
    char *h_out = new (nothrow) char[size];
    if (h_out == 0) {
        cerr << "host allocation failed" << endl;
        exit(EXIT_FAILURE);
    }
    //copy the device buffer back to the host
    if (cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost)
            != cudaSuccess) {
        cerr << cudaGetErrorString(cudaGetLastError()) << endl;
        exit(EXIT_FAILURE);
    }
    //print results
    for (int i = 0; i < size; i++) {
        cout << (int)h_out[i] << " ";
    }
    cout << endl;
    //deallocate arrays
    cudaFree(d_in);
    cudaFree(d_key);
    cudaFree(d_out);
    delete[] h_in;
    delete[] h_out;
    //bye
    exit(EXIT_SUCCESS);
}
/*
* badcrypto
*
* kernel that "encrypts" data using xor. Don't do this.
*/
__global__ void badcrypto(char *in, char *out, char *key,
                          int in_length, int key_length) {
    if (threadIdx.x < in_length)
        out[threadIdx.x] = in[threadIdx.x] ^ key[threadIdx.x % key_length];
}
First let's cover the minor modifications. Rather than using fixed input, we now read the cleartext from stdin into a host-side buffer. For simplicity, we've limited the buffer to 512 bytes for the time being. After reading the input, we set size to the number of bytes actually read (the value returned by cin.gcount()). This modification lets us use snake_oil2 on the command line like this:
$ echo -n cleartxt | ./snake_oil2
But note that the output is still printed as integers for now.
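Any other way of feeding stdin works the same way; for example, redirecting a file (message.txt here is just a made-up name):
$ ./snake_oil2 < message.txt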
The other modifications are in the CUDA C kernel. Now that we are accepting input from stdin, we can no longer assume that the cleartext and the key are the same length. To handle this, we pass in the length of the key and use the modulus operator (%) to find the key index that corresponds to the current position in the cleartext. The result is that the cleartext and key can be different lengths -- the key is simply repeated to encrypt the cleartext.
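If it helps to see that indexing in isolation, here is a minimal serial C++ sketch of the same repeated-key XOR (the name badcrypto_cpu is invented for illustration and is not part of the program above):
void badcrypto_cpu(const char *in, char *out, const char *key,
                   int in_length, int key_length) {
    // Byte i of the cleartext is XORed with byte (i % key_length) of the key,
    // so the key simply repeats when the cleartext is longer than the key.
    for (int i = 0; i < in_length; i++) {
        out[i] = in[i] ^ key[i % key_length];
    }
}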
The more important change is how we deal with the variable length of the cleartext. Rather than launching one CUDA thread per input character, we launch a fixed number of CUDA threads (512) and predicate the encryption operation. We pass in the length of the input cleartext, and each thread checks whether its thread index is less than that length. Threads whose index would fall beyond the end of the cleartext fail the if statement and therefore perform no work. This decouples the thread count from the array size.
The decoupling is possible because, unlike SIMD (single instruction, multiple data) vector paradigms, CUDA C is SIMT (single instruction, multiple threads). These threads have some autonomy and (within some practical limits) can support diverging code paths. In this case, we have predicated the encryption operation to make the thread block behave like an unrolled while() loop, as in the serial sketch below.
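As a rough mental model (a sketch only, not code from the listing above), the predicated thread block computes the same result as this serial loop, with each iteration standing in for one of the 512 threads:
for (int i = 0; i < 512; i++) {                // one iteration per thread in the block
    if (i < in_length)                          // the same predicate each thread evaluates
        out[i] = in[i] ^ key[i % key_length];   // "threads" past the end of the input do nothing
}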
Obviously, this example is still pretty unusable. We're limited to a maximum of 512 bytes of input. We also have many threads that are not contributing to the calculation, and we're only using 32 CUDA cores. Unfortunately, my 20 minutes is up, so I'll have to address those issues next week.