Algorithm :
I'm writing a program with CUDA and the problem is the following:
Two matrices A (n * 128) and B (m * 128)
I take the first row of A, and I compute the distance between that vector and all the rows of B, one by one.
I write the result of each distance on a row of a matrix C, so the element C(i,j) of C contains the distance between row i of A and row j of B.
and I proceed with the next row of A.
I've implemented it this way: I've got a grid made by ( n * m ) blocks, and 128 threads per block. ( 1 * 128 ).
QUESTION: The program runs successfully with the expected results but the time execution is only around 5 to 10 times faster than the one-threaded CPU version of it. So I would like to know how to increase the work per thread before reduction in order to increase performance.
Kernel code (original : Not optimized)
__global__ void EuclideanDistances( float *A, float *B , float *C , int n , int m)
{
// SIZE is equal to 128
__shared__ float accumResult[SIZE];
float sA;
float sB;
// MAPPING
int bx = blockIdx.x; // n
int by = blockIdx.y; // m
int ty = threadIdx.y; // 128
int tx = threadIdx.x; // 1
sA = A [bx * SIZE + ty];
sB = B [by * SIZE + ty];
__syncthreads();
accumResult[ty] = (sA - sB) * (sA - sB);
__syncthreads();
// Parallel tree-reduction
for (int stride = SIZE/2 ; stride > 0 ; stride >>= 1)
if (ty < stride)
{
accumResult[ty] += accumResult [stride + ty];
__syncthreads();
}
// Writing results to output matrix
if ((threadIdx.y == 0))
C [bx * m + by] = accumResult[ty];
__syncthreads();
}
UPDATE
Now, I'm using another mapping : Instead of taking a grid of n
by m
blocks and a block of 128
threads, I'm increasing the number of threads within a block in order to decrease the number of blocks.
New mapping:
Block of 128
by 8
threads (total of 1024 threads, which is the max size)
Grid of n/8
by m/8
blocks
Unfortunately, it's giving wrong results ).
Optimized kernel code (to be updated)
__global__ void EuclideanDistances( float *A, float *B , float *C, int n , int m)
{
__shared__ float accumResult[SIZE][8];
__shared__ float sA[SIZE][8];
__shared__ float sB[SIZE][8];
int bx = blockIdx.x; // n / 8
int by = blockIdx.y; // m / 8
int tx = threadIdx.x; // 8
int ty = threadIdx.y; // 128
int i = bx * tx * SIZE + ty;
int j = by * tx * SIZE + ty;
sA[ty][tx] = A [i];
sB[ty][tx] = B[j];
__syncthreads();
accumResult[ty][tx] = (sA[ty][tx] - sB[ty][tx]) * (sA[ty][tx] - sB[ty][tx]);
__syncthreads();
// Reduction
for (int stride = SIZE/2 ; stride > 0 ; stride>>=1)
if (ty < stride)
{
accumResult[ty][tx] += accumResult [stride + ty][tx];
__syncthreads();
}
C[bx * m + by] = accumResult[0][tx];
}
HOST CODE (allocations + kernel calls)
int main()
{
int m = 20000; //MatrixA size : m * SIZE
int n = 4000; //MatrixB size : n * SIZE
srand((unsigned)time(0));
// Host Allocations
float *matrixA = (float *) malloc (n * SIZE * sizeof(float));
for(int i=0; i < n * SIZE; i++)
matrixA[i] = (float) (rand()%100)+1;
float *matrixB = (float *) malloc (m * SIZE * sizeof(float));
for(int i=0; i < m * SIZE; i++)
matrixB[i] = (float) (rand()%100)+1;
float *results_kernel1 = (float *) malloc (n * m * sizeof(float));
float *results_kernel2 = (float *) malloc (n * m * sizeof(float));
//Device Allocation
float *d_matrixA;
float *d_matrixB;
cudaMalloc((void **)&d_matrixA, n * SIZE * sizeof(float));
cudaMalloc((void **)&d_matrixB, m * SIZE * sizeof(float));
cudaMemcpy(d_matrixA , matrixA , n * SIZE * sizeof(float) , cudaMemcpyHostToDevice);
cudaMemcpy(d_matrixB , matrixB , m * SIZE * sizeof(float) , cudaMemcpyHostToDevice);
float *d_results_kernel1;
float *d_results_kernel2;
cudaMalloc((void **)&d_results_kernel1 , n * m * sizeof(float));
cudaMalloc((void **)&d_results_kernel2 , n * m * sizeof(float));
dim3 threads1 (1 , 128);
dim3 blocks1 (n , m);
EuclideanDistances1 <<<blocks1 , threads1>>> (d_matrixA , d_matrixB , d_results_kernel1 , n , m);
cudaDeviceSynchronize();
cudaMemcpy(results_kernel1 , d_results_kernel1 , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
cudaFree(d_results_kernel1);
dim3 threads2 (8 , 128); // 1024 threads per block (maximum)
dim3 blocks2 (ceil((float)n/8) , ceil((float)m/8));
EuclideanDistances2 <<<blocks2 , threads2>>> (d_matrixA , d_matrixB , d_results_kernel2 , n , m);
cudaDeviceSynchronize();
cudaMemcpy(results_kernel2 , d_results_kernel2 , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
cudaFree(d_results_kernel2);
// Visualising and comparing results
for (int i = 0 ; i < 50 ; i++)
std::cout << "kernel1 : " << results_kernel1[i] << " | kernel2 : " << results_kernel2[i] << std::endl;
free(matrixA);
free(matrixB);
free(results_kernel1);
free(results_kernel2);
return 0;
}
PS: I have CUDA 6.0 with a NVIDIA GTX 650 (compute capability 3.0)
See Question&Answers more detail:os