I am a fairly new cuda user.
(我是一个相当新的cuda用户。)
I'm practicing on my first cuda application where I try to accelerate kmeans algorithm by using GPU(GTX 670).(我正在第一个cuda应用程序上练习,尝试使用GPU(GTX 670)加速kmeans算法。)
Briefly, each thread works on a single point which is compared to all cluster centers and a point is assigned to a center with minimum distance(kernel code can be seen below with comments).
(简而言之,每个线程都在单个点上工作,该点与所有群集中心进行比较,并且将一个点分配到具有最小距离的中心(可以在下面的注释中看到内核代码)。)
According to Nsight Visual Studio, I have an occupancy of 99.61%(1024 blocks, 1024 threads per block), 99.34% Streaming Multiprocessor activity, 79.98% warp issue efficiency, no shared memory bank conflicts, 18.4GFLOPs Single MUL and 55.2 GFLOPs Single ADD(takes about 14,5 ms to complete kmeans kernel with given parameters).
(根据Nsight Visual Studio,我的占用率为99.61%(1024个块,每个块1024个线程),99.34%的流式多处理器活动,79.98%的warp发行效率,无共享内存库冲突,18.4GFLOPs单个MUL和55.2 GFLOPs单个添加(大约需要14.5毫秒才能完成具有给定参数的kmeans内核)。)
According to Wikipedia, GTX670's peak performance is 2460 GFLOPs.
(根据维基百科,GTX670的最高性能是2460 GFLOP。)
I am nowhere close to it.(我离它很近。)
In addition to these, some papers claim they can achieve more than half of the peak performance.(除了这些以外,一些论文声称它们可以达到峰值性能的一半以上。)
I cannot see how further I can optimize this kernel code.(我看不出我可以进一步优化此内核代码。)
Is there any optimization that I can apply to the kernel?(我可以对内核进行任何优化吗?)
Any suggestion or help is appreciated and I can give any additional information on demand.(如有任何建议或帮助,我可以根据需要提供任何其他信息。)
(完整的代码)
Thanks in advance.
(提前致谢。)
#define SIZE 1024*1024 //number of points
#define CENTERS 32 //number of cluster centroids
#define DIM 8 //dimension of each point and center
#define cudaTHREADSIZE 1024 //threads per block
#define cudaBLOCKSIZE SIZE/cudaTHREADSIZE //number of blocks for kernel
__global__ void kMeans(float *dp, float *dc,int *tag, int *membershipChangedPerBlock)
{
//TOTAL NUMBER OF THREADS SHOULD BE EQUAL TO THE NUMBER OF POINTS, BECAUSE EACH THREAD WORKS ON A SINGLE POINT
__shared__ unsigned char membershipChanged[cudaTHREADSIZE];
__shared__ float dc_shared[CENTERS*DIM];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int threadID = threadIdx.x;
membershipChanged[threadIdx.x] = 0;
//move centers to shared memory, because each and every thread will call it(roughly + %10 performance here)
while(threadID < CENTERS*DIM){
dc_shared[threadID] = dc[threadID];
threadID += blockDim.x;
}
__syncthreads();
while(tid < SIZE){
int index,prevIndex;
float dist, min_dist;
index = 0;//all initial point indices(centroid number) are assigned to 0.
prevIndex = 0;
dist = 0;
min_dist = 0;
//euclid distance for center 0
for(int dimIdx = 0; dimIdx < DIM; dimIdx++){
min_dist += (dp[tid + dimIdx*SIZE] - dc_shared[dimIdx*CENTERS])*(dp[tid + dimIdx*SIZE] - dc_shared[dimIdx*CENTERS]);
}
//euclid distance for other centers with distance comparison
for(int centerIdx = 1; centerIdx < CENTERS; centerIdx++){
dist = 0;
for(int dimIdx = 0; dimIdx < DIM; dimIdx++){
dist += (dp[tid + dimIdx*SIZE] - dc_shared[centerIdx + dimIdx*CENTERS])*(dp[tid + dimIdx*SIZE] - dc_shared[centerIdx + dimIdx*CENTERS]);
}
//compare distances, if found a shorter one, change index to that centroid number
if(dist < min_dist){
min_dist = dist;
index = centerIdx;
}
}
if (tag[tid] != index) {//if a point's cluster membership changes, flag it as changed in order to compute total membership changes later on
membershipChanged[threadIdx.x] = 1;
}
tag[tid] = index;
__syncthreads();//sync before applying sum reduction to membership changes
//sum reduction
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (threadIdx.x < s) {
membershipChanged[threadIdx.x] +=
membershipChanged[threadIdx.x + s];
}
__syncthreads();
}
if (threadIdx.x == 0) {
membershipChangedPerBlock[blockIdx.x] = membershipChanged[0];
}
tid += blockDim.x * gridDim.x;
}
}
ask by menderft translate from so