As usual, first the code:
/*
* snake_oil3.cu
*
* CUDA program to do bad encryption
*
* compile with: nvcc -o snake_oil3 snake_oil3.cu
*
* resultsovercoffee@gmail.com
*/
#include <iostream>
#include <cstring>
#include <new>
#include <cstdlib>
using namespace std;
// function prototype for our CUDA kernel
__global__ void badcrypto(char *, char *, char *, int, int);
int main(int argc,char *argv[]) {
//exit if we are not called correctly
if (argc != 2) {
cerr << "Usage: "<< argv[0] << " password" << endl;
exit(EXIT_FAILURE);
}
//declare and allocate input and output arrays on host
const int max = 128*1024*1024; //128MB
char *h_in = new (nothrow)char[max];
if (h_in == 0) {
cerr << "host allocation failed" << endl;
exit(EXIT_FAILURE);
}
char *h_out = new (nothrow)char[max];
if (h_out == 0) {
cerr << "host allocation failed" << endl;
exit(EXIT_FAILURE);
}
//read cleartext from stdin
cin.read(h_in,max);
int size = cin.gcount();
//declare, allocate, and copy the cleartext to the device
char *d_in;
if (cudaMalloc(&d_in,size) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
exit(EXIT_FAILURE);
}
if (cudaMemcpy(d_in,h_in,size,cudaMemcpyHostToDevice)
!= cudaSuccess) {
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
exit(EXIT_FAILURE);
}
//declare, allocate, and copy the key to the device
size_t keysize = strlen(argv[1]);
char *d_key;
if (cudaMalloc(&d_key,keysize) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
exit(EXIT_FAILURE);
}
if (cudaMemcpy(d_key,argv[1],keysize,cudaMemcpyHostToDevice)
!= cudaSuccess) {
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
exit(EXIT_FAILURE);
}
//declare, allocate and zero the output array on device
char *d_out;
if (cudaMalloc(&d_out,size) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
exit(EXIT_FAILURE);
}
if (cudaMemset(d_out,0,size) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
exit(EXIT_FAILURE);
}
//launch kernel on device to generate results
dim3 grid(1,1);
dim3 block(256);
badcrypto<<<grid,block>>>(d_in,d_out,d_key,size,keysize);
//copy the device buffer back to the host
if (cudaMemcpy(h_out,d_out,size,cudaMemcpyDeviceToHost)
!= cudaSuccess) {
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
exit(EXIT_FAILURE);
}
//write ciphertext to stdout
cout.write(h_out,size);
//deallocate arrays
cudaFree(d_in);
cudaFree(d_key);
cudaFree(d_out);
delete[] h_in;
delete[] h_out;
//bye
exit(EXIT_SUCCESS);
}
/*
* badcrypto
*
* kernel that "encrypts" data using xor. Don't do this.
*/
__global__ void badcrypto(char *in, char *out, char *key,
int in_length, int key_length) {
int index = threadIdx.x;
while (index < in_length) {
out[index] = in[index] ^ key[index % key_length];
index += blockDim.x;
}
}
I've done some minor restructuring of the previous example code. The allocation of input and output arrays on the host are now done together, and we've increased the maximum size to 128MB. We've also made this a fully functional example -- the password is supplied as an argument on the command line, and the ciphertext is now written to standard output rather than printed as integers. As a result, the program can now be used with the standard shell redirection operators like this:
./snake_oil3 password < input.txt > output.txt
The key difference is how we have re-written the kernel. Rather than just testing to see if our current threadIdx.x is too large, we are using a while loop that tests to see if our index is less than the length of the input array. At the end of each loop we increment our index by blockDim.x. As a result, our threads will "walk" though the array until all the input data has been encrypted. So even though we a launching a fixed number of threads (256) our input array can be many times larger (up to 128MB in my arbitrary example here).
To prove this, I downloaded a copy of War and Peace from Project Gutenberg. The UTF-8 version weighs in at a little over 3MB:
$ wc war_and_peace.txt
65335 565450 3288707 war_and_peace.txt
Using our new version of snake_oil, I can "encrypt" this file and put the output in the file ciphertext.xor.
$./snake_oil3 secretpassword < war_and_peace.txt > ciphertext.xor
You can examine the ciphertext.xor file to verify that it is "encrypted" or at least different from the input text. When you're ready, you can then decrypt it by feeding it back into snake_oil using the same password that was used to "encrypt" it ("secretpassword" in this example):
$./snake_oil3 secretpassword < ciphertext.xor > cleartext.txt
As a final check, we can verify that the decrypted text matches the original exactly:
$ md5sum war_and_peace.txt cleartext.txt
23755b6d1871a58160c36485455fa6fd war_and_peace.txt
23755b6d1871a58160c36485455fa6fd cleartext.txt
So, we've now written a fully-functional program that does very insecure encryption using CUDA-capable GPUs. In the next few weeks we'll look at ways to make this program more efficient -- we have a long way to go.
Finally, a note about my absence: new puppy. Photo is below.